43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
55#define DEBUG_TYPE "si-lower"
61 cl::desc(
"Do not align and prefetch loops"),
65 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
66 cl::desc(
"Use indirect register addressing for divergent indexes"),
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
83 return AMDGPU::SGPR0 +
Reg;
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
218 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
368 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
382 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
396 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
410 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
424 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
439 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 if (Subtarget->hasPkMovB32()) {
464 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
470 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
474 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
501 if (Subtarget->hasSMemRealTime() ||
506 if (Subtarget->has16BitInsts()) {
516 if (Subtarget->hasMadMacF32Insts())
533 if (Subtarget->hasIntClamp())
536 if (Subtarget->hasAddNoCarryInsts())
542 {MVT::f32, MVT::f64},
Custom);
548 {MVT::f32, MVT::f64},
Legal);
550 if (Subtarget->haveRoundOpsF64())
580 if (Subtarget->has16BitInsts()) {
633 if (Subtarget->hasBF16TransInsts())
656 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
657 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
658 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
793 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
794 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
795 MVT::v32f16, MVT::v32bf16},
805 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
809 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
813 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
814 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
822 if (Subtarget->hasVOP3PInsts()) {
833 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
836 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
837 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
838 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
841 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
849 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
856 {MVT::v2f16, MVT::v4f16},
Custom);
862 if (Subtarget->hasBF16PackedInsts()) {
863 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
869 if (Subtarget->hasPackedFP32Ops()) {
873 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
880 if (Subtarget->has16BitInsts()) {
893 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
894 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
895 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
896 MVT::v32f16, MVT::v32bf16},
901 if (Subtarget->hasVectorMulU64())
903 else if (Subtarget->hasScalarSMulU64())
906 if (Subtarget->hasMad64_32())
909 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
912 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
914 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
917 if (Subtarget->hasMinimum3Maximum3F32())
920 if (Subtarget->hasMinimum3Maximum3PKF16()) {
924 if (!Subtarget->hasMinimum3Maximum3F16())
929 if (Subtarget->hasVOP3PInsts()) {
932 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
936 if (Subtarget->hasIntMinMax64())
941 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
942 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
947 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
948 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
949 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
950 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
954 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
955 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
956 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
957 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
972 if (Subtarget->hasBF16ConversionInsts()) {
977 if (Subtarget->hasBF16PackedInsts()) {
983 if (Subtarget->hasBF16TransInsts()) {
987 if (Subtarget->hasCvtPkF16F32Inst()) {
989 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1040 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1081 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1094 EVT DestVT,
EVT SrcVT)
const {
1096 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1097 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1099 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1106 LLT DestTy,
LLT SrcTy)
const {
1107 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1108 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1110 SrcTy.getScalarSizeInBits() == 16 &&
1131 return Subtarget->has16BitInsts()
1137 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1141 if (!Subtarget->has16BitInsts() && VT.
getSizeInBits() == 16)
1163 return (NumElts + 1) / 2;
1169 return NumElts * ((
Size + 31) / 32);
1178 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1187 MVT SimpleIntermediateVT =
1189 IntermediateVT = SimpleIntermediateVT;
1190 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1191 NumIntermediates = (NumElts + 1) / 2;
1192 return (NumElts + 1) / 2;
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts;
1199 return NumIntermediates;
1204 RegisterVT = MVT::i16;
1205 IntermediateVT = ScalarVT;
1206 NumIntermediates = NumElts;
1207 return NumIntermediates;
1211 RegisterVT = MVT::i32;
1212 IntermediateVT = ScalarVT;
1213 NumIntermediates = NumElts;
1214 return NumIntermediates;
1218 RegisterVT = MVT::i32;
1219 IntermediateVT = RegisterVT;
1220 NumIntermediates = NumElts * ((
Size + 31) / 32);
1221 return NumIntermediates;
1226 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1231 unsigned MaxNumLanes) {
1232 assert(MaxNumLanes != 0);
1236 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1247 unsigned MaxNumLanes) {
1253 assert(ST->getNumContainedTypes() == 2 &&
1254 ST->getContainedType(1)->isIntegerTy(32));
1268 return MVT::amdgpuBufferFatPointer;
1270 DL.getPointerSizeInBits(AS) == 192)
1271 return MVT::amdgpuBufferStridedPointer;
1280 DL.getPointerSizeInBits(AS) == 160) ||
1282 DL.getPointerSizeInBits(AS) == 192))
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1293 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1294 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1295 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1296 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1297 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1298 case Intrinsic::amdgcn_flat_load_monitor_b32:
1299 case Intrinsic::amdgcn_global_load_monitor_b32:
1301 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1302 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1303 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1304 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1305 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1306 case Intrinsic::amdgcn_flat_load_monitor_b64:
1307 case Intrinsic::amdgcn_global_load_monitor_b64:
1309 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1310 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1311 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1312 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1313 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1314 case Intrinsic::amdgcn_flat_load_monitor_b128:
1315 case Intrinsic::amdgcn_global_load_monitor_b128:
1351 unsigned IntrID)
const {
1353 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1367 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1380 if (RsrcIntr->IsImage) {
1395 Info.ptrVal = RsrcArg;
1399 if (RsrcIntr->IsImage) {
1400 unsigned MaxNumLanes = 4;
1415 std::numeric_limits<unsigned>::max());
1425 if (RsrcIntr->IsImage) {
1445 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1447 Info.memVT = MVT::i32;
1454 case Intrinsic::amdgcn_raw_buffer_load_lds:
1455 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1456 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1457 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1458 case Intrinsic::amdgcn_struct_buffer_load_lds:
1459 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1460 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1461 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1475 CI.
getContext(), Width * 8 * Subtarget->getWavefrontSize());
1484 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1485 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1486 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1487 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1490 std::numeric_limits<unsigned>::max());
1503 case Intrinsic::amdgcn_ds_ordered_add:
1504 case Intrinsic::amdgcn_ds_ordered_swap: {
1518 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1519 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1522 Info.ptrVal =
nullptr;
1528 case Intrinsic::amdgcn_ds_append:
1529 case Intrinsic::amdgcn_ds_consume: {
1543 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1544 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1545 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1550 Info.memVT = MVT::i64;
1557 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1558 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1559 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1562 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1565 ->getElementType(0));
1574 case Intrinsic::amdgcn_global_atomic_fmin_num:
1575 case Intrinsic::amdgcn_global_atomic_fmax_num:
1576 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1577 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1578 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1589 case Intrinsic::amdgcn_cluster_load_b32:
1590 case Intrinsic::amdgcn_cluster_load_b64:
1591 case Intrinsic::amdgcn_cluster_load_b128:
1592 case Intrinsic::amdgcn_ds_load_tr6_b96:
1593 case Intrinsic::amdgcn_ds_load_tr4_b64:
1594 case Intrinsic::amdgcn_ds_load_tr8_b64:
1595 case Intrinsic::amdgcn_ds_load_tr16_b128:
1596 case Intrinsic::amdgcn_global_load_tr6_b96:
1597 case Intrinsic::amdgcn_global_load_tr4_b64:
1598 case Intrinsic::amdgcn_global_load_tr_b64:
1599 case Intrinsic::amdgcn_global_load_tr_b128:
1600 case Intrinsic::amdgcn_ds_read_tr4_b64:
1601 case Intrinsic::amdgcn_ds_read_tr6_b96:
1602 case Intrinsic::amdgcn_ds_read_tr8_b64:
1603 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1612 case Intrinsic::amdgcn_flat_load_monitor_b32:
1613 case Intrinsic::amdgcn_flat_load_monitor_b64:
1614 case Intrinsic::amdgcn_flat_load_monitor_b128:
1615 case Intrinsic::amdgcn_global_load_monitor_b32:
1616 case Intrinsic::amdgcn_global_load_monitor_b64:
1617 case Intrinsic::amdgcn_global_load_monitor_b128: {
1628 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1629 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1630 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1641 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1642 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1643 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1654 case Intrinsic::amdgcn_ds_gws_init:
1655 case Intrinsic::amdgcn_ds_gws_barrier:
1656 case Intrinsic::amdgcn_ds_gws_sema_v:
1657 case Intrinsic::amdgcn_ds_gws_sema_br:
1658 case Intrinsic::amdgcn_ds_gws_sema_p:
1659 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1669 Info.memVT = MVT::i32;
1671 Info.align =
Align(4);
1673 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1681 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1682 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1683 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1685 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1686 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1687 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1703 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1704 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1705 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1720 case Intrinsic::amdgcn_load_to_lds:
1721 case Intrinsic::amdgcn_load_async_to_lds:
1722 case Intrinsic::amdgcn_global_load_lds:
1723 case Intrinsic::amdgcn_global_load_async_lds: {
1742 Width * 8 * Subtarget->getWavefrontSize());
1748 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1749 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1750 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1751 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1761 Info.memVT = MVT::i32;
1763 Info.align =
Align(4);
1769 case Intrinsic::amdgcn_s_prefetch_data:
1770 case Intrinsic::amdgcn_flat_prefetch:
1771 case Intrinsic::amdgcn_global_prefetch: {
1787 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1790 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1791 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1803 Type *&AccessTy)
const {
1804 Value *Ptr =
nullptr;
1805 switch (
II->getIntrinsicID()) {
1806 case Intrinsic::amdgcn_cluster_load_b128:
1807 case Intrinsic::amdgcn_cluster_load_b64:
1808 case Intrinsic::amdgcn_cluster_load_b32:
1809 case Intrinsic::amdgcn_ds_append:
1810 case Intrinsic::amdgcn_ds_consume:
1811 case Intrinsic::amdgcn_ds_load_tr8_b64:
1812 case Intrinsic::amdgcn_ds_load_tr16_b128:
1813 case Intrinsic::amdgcn_ds_load_tr4_b64:
1814 case Intrinsic::amdgcn_ds_load_tr6_b96:
1815 case Intrinsic::amdgcn_ds_read_tr4_b64:
1816 case Intrinsic::amdgcn_ds_read_tr6_b96:
1817 case Intrinsic::amdgcn_ds_read_tr8_b64:
1818 case Intrinsic::amdgcn_ds_read_tr16_b64:
1819 case Intrinsic::amdgcn_ds_ordered_add:
1820 case Intrinsic::amdgcn_ds_ordered_swap:
1821 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1822 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1823 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1824 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1825 case Intrinsic::amdgcn_global_atomic_fmax_num:
1826 case Intrinsic::amdgcn_global_atomic_fmin_num:
1827 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1828 case Intrinsic::amdgcn_global_load_tr_b64:
1829 case Intrinsic::amdgcn_global_load_tr_b128:
1830 case Intrinsic::amdgcn_global_load_tr4_b64:
1831 case Intrinsic::amdgcn_global_load_tr6_b96:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1833 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1834 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1835 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1836 Ptr =
II->getArgOperand(0);
1838 case Intrinsic::amdgcn_load_to_lds:
1839 case Intrinsic::amdgcn_load_async_to_lds:
1840 case Intrinsic::amdgcn_global_load_lds:
1841 case Intrinsic::amdgcn_global_load_async_lds:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1843 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1844 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1845 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1847 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1848 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1849 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1850 Ptr =
II->getArgOperand(1);
1855 AccessTy =
II->getType();
1861 unsigned AddrSpace)
const {
1862 if (!Subtarget->hasFlatInstOffsets()) {
1873 return AM.
Scale == 0 &&
1874 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1875 AM.
BaseOffs, AddrSpace, FlatVariant));
1879 if (Subtarget->hasFlatGlobalInsts())
1882 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1895 return isLegalMUBUFAddressingMode(AM);
1898bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1909 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1921 if (AM.HasBaseReg) {
1953 return isLegalMUBUFAddressingMode(AM);
1955 if (!Subtarget->hasScalarSubwordLoads()) {
1960 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
2008 return Subtarget->hasFlatScratchEnabled()
2010 : isLegalMUBUFAddressingMode(AM);
2057 unsigned Size,
unsigned AddrSpace,
Align Alignment,
2066 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
2069 Align RequiredAlignment(
2071 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
Size > 32 &&
2072 Alignment < RequiredAlignment)
2087 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2093 RequiredAlignment =
Align(4);
2095 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2111 *IsFast = (Alignment >= RequiredAlignment) ? 64
2112 : (Alignment <
Align(4)) ? 32
2119 if (!Subtarget->hasDS96AndDS128())
2125 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2134 *IsFast = (Alignment >= RequiredAlignment) ? 96
2135 : (Alignment <
Align(4)) ? 32
2142 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2148 RequiredAlignment =
Align(8);
2150 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2159 *IsFast = (Alignment >= RequiredAlignment) ? 128
2160 : (Alignment <
Align(4)) ? 32
2177 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2179 return Alignment >= RequiredAlignment ||
2180 Subtarget->hasUnalignedDSAccessEnabled();
2188 bool AlignedBy4 = Alignment >=
Align(4);
2189 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2191 *IsFast = AlignedBy4 ?
Size : 1;
2196 *IsFast = AlignedBy4;
2207 return Alignment >=
Align(4) ||
2208 Subtarget->hasUnalignedBufferAccessEnabled();
2220 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2235 return Size >= 32 && Alignment >=
Align(4);
2240 unsigned *IsFast)
const {
2242 Alignment, Flags, IsFast);
2247 const AttributeList &FuncAttributes)
const {
2253 if (
Op.size() >= 16 &&
2257 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2275 unsigned DestAS)
const {
2278 Subtarget->hasGloballyAddressableScratch()) {
2308 unsigned Index)
const {
2320 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2325 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2360 auto [InputPtrReg, RC, ArgTy] =
2376 const SDLoc &SL)
const {
2383 const SDLoc &SL)
const {
2386 std::optional<uint32_t> KnownSize =
2388 if (KnownSize.has_value())
2415 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2430SDValue SITargetLowering::lowerKernargMemParameter(
2435 MachinePointerInfo PtrInfo =
2444 int64_t OffsetDiff =
Offset - AlignDownOffset;
2450 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2461 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2466 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2471 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2480 const SDLoc &SL)
const {
2549 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2552 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2553 if (ConvertedVal == ArgValue)
2554 return ConvertedVal;
2559SDValue SITargetLowering::lowerWorkGroupId(
2564 if (!Subtarget->hasClusters())
2565 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2573 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2574 SDLoc SL(ClusterIdXYZ);
2575 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2578 SDValue ClusterWorkGroupIdXYZ =
2579 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2589 return ClusterIdXYZ;
2591 using namespace AMDGPU::Hwreg;
2595 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2606SDValue SITargetLowering::getPreloadedValue(
2609 const ArgDescriptor *
Reg =
nullptr;
2610 const TargetRegisterClass *RC;
2614 const ArgDescriptor WorkGroupIDX =
2622 const ArgDescriptor WorkGroupIDZ =
2624 const ArgDescriptor ClusterWorkGroupIDX =
2626 const ArgDescriptor ClusterWorkGroupIDY =
2628 const ArgDescriptor ClusterWorkGroupIDZ =
2630 const ArgDescriptor ClusterWorkGroupMaxIDX =
2632 const ArgDescriptor ClusterWorkGroupMaxIDY =
2634 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2636 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2639 auto LoadConstant = [&](
unsigned N) {
2643 if (Subtarget->hasArchitectedSGPRs() &&
2650 Reg = &WorkGroupIDX;
2651 RC = &AMDGPU::SReg_32RegClass;
2655 Reg = &WorkGroupIDY;
2656 RC = &AMDGPU::SReg_32RegClass;
2660 Reg = &WorkGroupIDZ;
2661 RC = &AMDGPU::SReg_32RegClass;
2665 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2666 return LoadConstant(0);
2667 Reg = &ClusterWorkGroupIDX;
2668 RC = &AMDGPU::SReg_32RegClass;
2672 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2673 return LoadConstant(0);
2674 Reg = &ClusterWorkGroupIDY;
2675 RC = &AMDGPU::SReg_32RegClass;
2679 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2680 return LoadConstant(0);
2681 Reg = &ClusterWorkGroupIDZ;
2682 RC = &AMDGPU::SReg_32RegClass;
2687 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2688 Reg = &ClusterWorkGroupMaxIDX;
2689 RC = &AMDGPU::SReg_32RegClass;
2694 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2695 Reg = &ClusterWorkGroupMaxIDY;
2696 RC = &AMDGPU::SReg_32RegClass;
2701 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2702 Reg = &ClusterWorkGroupMaxIDZ;
2703 RC = &AMDGPU::SReg_32RegClass;
2707 Reg = &ClusterWorkGroupMaxFlatID;
2708 RC = &AMDGPU::SReg_32RegClass;
2739 for (
unsigned I = 0,
E = Ins.
size(), PSInputNum = 0;
I !=
E; ++
I) {
2743 "vector type argument should have been split");
2748 bool SkipArg = !Arg->
Used && !Info->isPSInputAllocated(PSInputNum);
2756 "unexpected vector split in ps argument type");
2770 Info->markPSInputAllocated(PSInputNum);
2772 Info->markPSInputEnabled(PSInputNum);
2788 if (Info.hasWorkItemIDX()) {
2794 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2798 if (Info.hasWorkItemIDY()) {
2799 assert(Info.hasWorkItemIDX());
2800 if (Subtarget->hasPackedTID()) {
2801 Info.setWorkItemIDY(
2804 unsigned Reg = AMDGPU::VGPR1;
2812 if (Info.hasWorkItemIDZ()) {
2813 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2814 if (Subtarget->hasPackedTID()) {
2815 Info.setWorkItemIDZ(
2818 unsigned Reg = AMDGPU::VGPR2;
2838 if (RegIdx == ArgVGPRs.
size()) {
2845 unsigned Reg = ArgVGPRs[RegIdx];
2857 unsigned NumArgRegs) {
2860 if (RegIdx == ArgSGPRs.
size())
2863 unsigned Reg = ArgSGPRs[RegIdx];
2905 const unsigned Mask = 0x3ff;
2908 if (Info.hasWorkItemIDX()) {
2910 Info.setWorkItemIDX(Arg);
2913 if (Info.hasWorkItemIDY()) {
2915 Info.setWorkItemIDY(Arg);
2918 if (Info.hasWorkItemIDZ())
2930 const unsigned Mask = 0x3ff;
2939 auto &
ArgInfo = Info.getArgInfo();
2951 if (Info.hasImplicitArgPtr())
2959 if (Info.hasWorkGroupIDX())
2962 if (Info.hasWorkGroupIDY())
2965 if (Info.hasWorkGroupIDZ())
2968 if (Info.hasLDSKernelId())
2979 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2980 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2986 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2987 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2992 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2993 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2999 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3005 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
3014 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3019 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
3020 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3025 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
3026 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3041 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3043 bool InPreloadSequence =
true;
3045 bool AlignedForImplictArgs =
false;
3046 unsigned ImplicitArgOffset = 0;
3047 for (
auto &Arg :
F.args()) {
3048 if (!InPreloadSequence || !Arg.hasInRegAttr())
3051 unsigned ArgIdx = Arg.getArgNo();
3054 if (InIdx < Ins.
size() &&
3055 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3058 for (; InIdx < Ins.
size() && Ins[InIdx].isOrigArg() &&
3059 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3061 assert(ArgLocs[ArgIdx].isMemLoc());
3062 auto &ArgLoc = ArgLocs[InIdx];
3064 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3066 unsigned NumAllocSGPRs =
3067 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3070 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
3071 if (!AlignedForImplictArgs) {
3073 alignTo(LastExplicitArgOffset,
3074 Subtarget->getAlignmentForImplicitArgPtr()) -
3075 LastExplicitArgOffset;
3076 AlignedForImplictArgs =
true;
3078 ArgOffset += ImplicitArgOffset;
3082 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3083 assert(InIdx >= 1 &&
"No previous SGPR");
3084 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3085 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3089 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3090 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
3093 InPreloadSequence =
false;
3099 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3101 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3103 if (PreloadRegs->
size() > 1)
3104 RC = &AMDGPU::SGPR_32RegClass;
3105 for (
auto &Reg : *PreloadRegs) {
3111 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3120 if (Info.hasLDSKernelId()) {
3121 Register Reg = Info.addLDSKernelId();
3122 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3131 bool IsShader)
const {
3132 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3133 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3139 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3141 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3145 unsigned NumRequiredSystemSGPRs =
3146 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3147 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3148 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3149 Register Reg = Info.addReservedUserSGPR();
3150 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3155 if (!HasArchitectedSGPRs) {
3156 if (Info.hasWorkGroupIDX()) {
3157 Register Reg = Info.addWorkGroupIDX();
3158 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3162 if (Info.hasWorkGroupIDY()) {
3163 Register Reg = Info.addWorkGroupIDY();
3164 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3168 if (Info.hasWorkGroupIDZ()) {
3169 Register Reg = Info.addWorkGroupIDZ();
3170 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3175 if (Info.hasWorkGroupInfo()) {
3176 Register Reg = Info.addWorkGroupInfo();
3177 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3181 if (Info.hasPrivateSegmentWaveByteOffset()) {
3183 unsigned PrivateSegmentWaveByteOffsetReg;
3186 PrivateSegmentWaveByteOffsetReg =
3187 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3191 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3193 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3196 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3198 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3199 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3202 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3203 Info.getNumPreloadedSGPRs() >= 16);
3218 if (HasStackObjects)
3219 Info.setHasNonSpillStackObjects(
true);
3224 HasStackObjects =
true;
3228 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3230 if (!ST.hasFlatScratchEnabled()) {
3231 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3238 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3240 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3250 Info.setScratchRSrcReg(ReservedBufferReg);
3269 if (!MRI.
isLiveIn(AMDGPU::SGPR32)) {
3270 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3277 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3279 Info.setStackPtrOffsetReg(
Reg);
3284 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3291 if (ST.getFrameLowering()->hasFP(MF)) {
3292 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3308 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3317 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3318 RC = &AMDGPU::SGPR_64RegClass;
3319 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3320 RC = &AMDGPU::SGPR_32RegClass;
3326 Entry->addLiveIn(*
I);
3331 for (
auto *Exit : Exits)
3333 TII->get(TargetOpcode::COPY), *
I)
3348 bool IsError =
false;
3352 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3370 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3371 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3373 if (!Subtarget->hasFlatScratchEnabled())
3378 !Subtarget->hasArchitectedSGPRs())
3379 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3380 !Info->hasWorkGroupIDZ());
3383 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3401 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3402 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3405 Info->markPSInputAllocated(0);
3406 Info->markPSInputEnabled(0);
3408 if (Subtarget->isAmdPalOS()) {
3417 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3418 if ((PsInputBits & 0x7F) == 0 ||
3419 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3422 }
else if (IsKernel) {
3423 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3435 if (IsKernel && Subtarget->hasKernargPreload())
3439 }
else if (!IsGraphics) {
3444 if (!Subtarget->hasFlatScratchEnabled())
3456 Info->setNumWaveDispatchSGPRs(
3458 Info->setNumWaveDispatchVGPRs(
3460 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3461 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3466 if (IsWholeWaveFunc) {
3468 {MVT::i1, MVT::Other}, Chain);
3480 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3491 if (IsEntryFunc && VA.
isMemLoc()) {
3514 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3518 int64_t OffsetDiff =
Offset - AlignDownOffset;
3525 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3528 Register VReg = MRI.getLiveInVirtReg(Reg);
3536 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3537 Ins[i].Flags.isSExt(), &Ins[i]);
3545 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3548 if (PreloadRegs.
size() == 1) {
3549 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3554 TRI->getRegSizeInBits(*RC)));
3562 for (
auto Reg : PreloadRegs) {
3563 Register VReg = MRI.getLiveInVirtReg(Reg);
3569 PreloadRegs.size()),
3586 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3587 Ins[i].Flags.isSExt(), &Ins[i]);
3599 "hidden argument in kernel signature was not preloaded",
3605 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3606 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3626 if (!IsEntryFunc && VA.
isMemLoc()) {
3627 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3638 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3639 RC = &AMDGPU::VGPR_32RegClass;
3640 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3641 RC = &AMDGPU::SGPR_32RegClass;
3647 if (Arg.
Flags.
isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3653 ReadFirstLane, Val);
3669 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3678 Info->setBytesInStackArgArea(StackArgSize);
3680 return Chains.
empty() ? Chain
3689 const Type *RetTy)
const {
3697 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3702 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3703 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3704 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3705 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3728 Info->setIfReturnsVoid(Outs.
empty());
3729 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3748 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3749 ++
I, ++RealRVLocIdx) {
3753 SDValue Arg = OutVals[RealRVLocIdx];
3776 ReadFirstLane, Arg);
3783 if (!Info->isEntryFunction()) {
3789 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3791 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3804 unsigned Opc = AMDGPUISD::ENDPGM;
3806 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3807 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3808 : AMDGPUISD::RET_GLUE;
3913 const auto [OutgoingArg, ArgRC, ArgTy] =
3918 const auto [IncomingArg, IncomingArgRC, Ty] =
3920 assert(IncomingArgRC == ArgRC);
3923 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3931 InputReg = getImplicitArgPtr(DAG,
DL);
3933 std::optional<uint32_t> Id =
3935 if (Id.has_value()) {
3946 if (OutgoingArg->isRegister()) {
3947 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3948 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3951 unsigned SpecialArgOffset =
3962 auto [OutgoingArg, ArgRC, Ty] =
3965 std::tie(OutgoingArg, ArgRC, Ty) =
3968 std::tie(OutgoingArg, ArgRC, Ty) =
3983 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3984 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3985 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3990 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3998 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
4008 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
4017 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4018 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4029 : IncomingArgY ? *IncomingArgY
4036 if (OutgoingArg->isRegister()) {
4038 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4064 if (Callee->isDivergent())
4071 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
4075 if (!CallerPreserved)
4078 bool CCMatch = CallerCC == CalleeCC;
4091 if (Arg.hasByValAttr())
4105 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4106 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4115 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4128 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4130 if (!CCVA.isRegLoc())
4135 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4137 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4161enum ChainCallArgIdx {
4183 bool UsesDynamicVGPRs =
false;
4184 if (IsChainCallConv) {
4189 auto RequestedExecIt =
4191 return Arg.OrigArgIndex == 2;
4193 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4195 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4198 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4201 "Haven't popped all the special args");
4204 CLI.
Args[ChainCallArgIdx::Exec];
4205 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4213 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4215 ChainCallSpecialArgs.
push_back(Arg.Node);
4218 PushNodeOrTargetConstant(RequestedExecArg);
4224 if (FlagsValue.
isZero()) {
4225 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4227 "no additional args allowed if flags == 0");
4229 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4233 if (!Subtarget->isWave32()) {
4235 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4238 UsesDynamicVGPRs =
true;
4239 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4240 CLI.
Args.end(), PushNodeOrTargetConstant);
4249 bool IsSibCall =
false;
4263 "unsupported call to variadic function ");
4271 "unsupported required tail call to function ");
4276 Outs, OutVals, Ins, DAG);
4280 "site marked musttail or on llvm.amdgcn.cs.chain");
4287 if (!TailCallOpt && IsTailCall)
4311 if (!Subtarget->hasFlatScratchEnabled())
4332 auto *
TRI = Subtarget->getRegisterInfo();
4339 if (!IsSibCall || IsChainCallConv) {
4340 if (!Subtarget->hasFlatScratchEnabled()) {
4346 RegsToPass.emplace_back(IsChainCallConv
4347 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4348 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4355 const unsigned NumSpecialInputs = RegsToPass.size();
4357 MVT PtrVT = MVT::i32;
4360 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4388 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4396 int32_t
Offset = LocMemOffset;
4403 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4409 ? Flags.getNonZeroByValAlign()
4436 if (Outs[i].Flags.isByVal()) {
4438 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4441 Outs[i].Flags.getNonZeroByValAlign(),
4443 nullptr, std::nullopt, DstInfo,
4449 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4455 if (!MemOpChains.
empty())
4471 unsigned ArgIdx = 0;
4472 for (
auto [Reg, Val] : RegsToPass) {
4473 if (ArgIdx++ >= NumSpecialInputs &&
4474 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4500 if (IsTailCall && !IsSibCall) {
4505 std::vector<SDValue>
Ops({Chain});
4511 Ops.push_back(Callee);
4528 Ops.push_back(Callee);
4539 if (IsChainCallConv)
4544 for (
auto &[Reg, Val] : RegsToPass)
4548 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4549 assert(Mask &&
"Missing call preserved mask for calling convention");
4559 MVT::Glue, GlueOps),
4564 Ops.push_back(InGlue);
4570 unsigned OPC = AMDGPUISD::TC_RETURN;
4573 OPC = AMDGPUISD::TC_RETURN_GFX;
4577 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4578 : AMDGPUISD::TC_RETURN_CHAIN;
4584 if (Info->isWholeWaveFunction())
4585 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4592 Chain =
Call.getValue(0);
4593 InGlue =
Call.getValue(1);
4595 uint64_t CalleePopBytes = NumBytes;
4616 EVT VT =
Op.getValueType();
4630 "Stack grows upwards for AMDGPU");
4632 Chain = BaseAddr.getValue(1);
4634 if (Alignment > StackAlign) {
4636 << Subtarget->getWavefrontSizeLog2();
4637 uint64_t StackAlignMask = ScaledAlignment - 1;
4644 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4650 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4661 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4677 if (
Op.getValueType() != MVT::i32)
4696 assert(
Op.getValueType() == MVT::i32);
4705 Op.getOperand(0), IntrinID, GetRoundBothImm);
4739 SDValue RoundModeTimesNumBits =
4759 TableEntry, EnumOffset);
4775 static_cast<uint32_t>(ConstMode->getZExtValue()),
4787 if (UseReducedTable) {
4793 SDValue RoundModeTimesNumBits =
4813 SDValue RoundModeTimesNumBits =
4822 NewMode = TruncTable;
4831 ReadFirstLaneID, NewMode);
4844 IntrinID, RoundBothImm, NewMode);
4850 if (
Op->isDivergent() &&
4851 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4861 if (Subtarget->hasSafeSmemPrefetch())
4869 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4878 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4879 EVT SrcVT = Src.getValueType();
4888 EVT DstVT =
Op.getValueType();
4897 if (
Op.getValueType() != MVT::i64)
4911 Op.getOperand(0), IntrinID, ModeHwRegImm);
4913 Op.getOperand(0), IntrinID, TrapHwRegImm);
4927 if (
Op.getOperand(1).getValueType() != MVT::i64)
4939 ReadFirstLaneID, NewModeReg);
4941 ReadFirstLaneID, NewTrapReg);
4943 unsigned ModeHwReg =
4946 unsigned TrapHwReg =
4954 IntrinID, ModeHwRegImm, NewModeReg);
4957 IntrinID, TrapHwRegImm, NewTrapReg);
4966 .
Case(
"m0", AMDGPU::M0)
4967 .
Case(
"exec", AMDGPU::EXEC)
4968 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4969 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4970 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4971 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4972 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4977 if (!Subtarget->hasFlatScrRegister() &&
4978 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4980 "\" for subtarget."));
4985 case AMDGPU::EXEC_LO:
4986 case AMDGPU::EXEC_HI:
4987 case AMDGPU::FLAT_SCR_LO:
4988 case AMDGPU::FLAT_SCR_HI:
4993 case AMDGPU::FLAT_SCR:
5012 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
5021static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5043 auto Next = std::next(
I);
5054 MBB.addSuccessor(LoopBB);
5056 return std::pair(LoopBB, RemainderBB);
5063 auto I =
MI.getIterator();
5064 auto E = std::next(
I);
5086 Src->setIsKill(
false);
5096 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5105 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5129 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5130 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5152 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5159 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5163 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5173 if (UseGPRIdxMode) {
5175 SGPRIdxReg = CurrentIdxReg;
5178 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5188 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5219 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5220 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5228 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5247 InitResultReg, DstReg, PhiReg, TmpExec,
5248 Offset, UseGPRIdxMode, SGPRIdxReg);
5254 LoopBB->removeSuccessor(RemainderBB);
5256 LoopBB->addSuccessor(LandingPad);
5267static std::pair<unsigned, int>
5271 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5276 return std::pair(AMDGPU::sub0,
Offset);
5333 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5334 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5340 std::tie(SubReg,
Offset) =
5343 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5346 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5350 if (UseGPRIdxMode) {
5357 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5366 .
addReg(SrcReg, {}, SubReg)
5370 MI.eraseFromParent();
5386 UseGPRIdxMode, SGPRIdxReg);
5390 if (UseGPRIdxMode) {
5392 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5394 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5399 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5400 .
addReg(SrcReg, {}, SubReg)
5404 MI.eraseFromParent();
5421 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5429 std::tie(SubReg,
Offset) =
5431 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5433 if (Idx->
getReg() == AMDGPU::NoRegister) {
5444 MI.eraseFromParent();
5449 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5453 if (UseGPRIdxMode) {
5457 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5466 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5467 TRI.getRegSizeInBits(*VecRC), 32,
false);
5473 MI.eraseFromParent();
5487 UseGPRIdxMode, SGPRIdxReg);
5490 if (UseGPRIdxMode) {
5492 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5494 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5500 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5501 TRI.getRegSizeInBits(*VecRC), 32,
false);
5502 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5508 MI.eraseFromParent();
5524 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5525 if (ST.hasScalarAddSub64()) {
5526 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5536 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5537 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5540 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5542 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5545 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5547 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5549 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5550 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5559 MI.eraseFromParent();
5565 case AMDGPU::S_MIN_U32:
5566 return std::numeric_limits<uint32_t>::max();
5567 case AMDGPU::S_MIN_I32:
5568 return std::numeric_limits<int32_t>::max();
5569 case AMDGPU::S_MAX_U32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_MAX_I32:
5572 return std::numeric_limits<int32_t>::min();
5573 case AMDGPU::V_ADD_F32_e64:
5575 case AMDGPU::V_SUB_F32_e64:
5577 case AMDGPU::S_ADD_I32:
5578 case AMDGPU::S_SUB_I32:
5579 case AMDGPU::S_OR_B32:
5580 case AMDGPU::S_XOR_B32:
5581 return std::numeric_limits<uint32_t>::min();
5582 case AMDGPU::S_AND_B32:
5583 return std::numeric_limits<uint32_t>::max();
5584 case AMDGPU::V_MIN_F32_e64:
5585 case AMDGPU::V_MAX_F32_e64:
5589 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5595 case AMDGPU::V_CMP_LT_U64_e64:
5596 return std::numeric_limits<uint64_t>::max();
5597 case AMDGPU::V_CMP_LT_I64_e64:
5598 return std::numeric_limits<int64_t>::max();
5599 case AMDGPU::V_CMP_GT_U64_e64:
5600 return std::numeric_limits<uint64_t>::min();
5601 case AMDGPU::V_CMP_GT_I64_e64:
5602 return std::numeric_limits<int64_t>::min();
5603 case AMDGPU::V_MIN_F64_e64:
5604 case AMDGPU::V_MAX_F64_e64:
5605 case AMDGPU::V_MIN_NUM_F64_e64:
5606 case AMDGPU::V_MAX_NUM_F64_e64:
5607 return 0x7FF8000000000000;
5608 case AMDGPU::S_ADD_U64_PSEUDO:
5609 case AMDGPU::S_SUB_U64_PSEUDO:
5610 case AMDGPU::S_OR_B64:
5611 case AMDGPU::S_XOR_B64:
5612 return std::numeric_limits<uint64_t>::min();
5613 case AMDGPU::S_AND_B64:
5614 return std::numeric_limits<uint64_t>::max();
5615 case AMDGPU::V_ADD_F64_e64:
5616 case AMDGPU::V_ADD_F64_pseudo_e64:
5617 return 0x8000000000000000;
5620 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5625 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5626 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5627 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5628 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5629 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5630 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5631 Opc == AMDGPU::V_SUB_F32_e64;
5635 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5636 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64 ||
5637 Opc == AMDGPU::V_MIN_F64_e64 ||
Opc == AMDGPU::V_MAX_F64_e64 ||
5638 Opc == AMDGPU::V_MIN_NUM_F64_e64 ||
Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5639 Opc == AMDGPU::V_ADD_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5645 case AMDGPU::S_MIN_U32:
5646 return AMDGPU::V_MIN_U32_dpp;
5647 case AMDGPU::S_MIN_I32:
5648 return AMDGPU::V_MIN_I32_dpp;
5649 case AMDGPU::S_MAX_U32:
5650 return AMDGPU::V_MAX_U32_dpp;
5651 case AMDGPU::S_MAX_I32:
5652 return AMDGPU::V_MAX_I32_dpp;
5653 case AMDGPU::S_ADD_I32:
5654 case AMDGPU::S_SUB_I32:
5655 return ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5656 : AMDGPU::V_ADD_CO_U32_dpp;
5657 case AMDGPU::S_AND_B32:
5658 return AMDGPU::V_AND_B32_dpp;
5659 case AMDGPU::S_OR_B32:
5660 return AMDGPU::V_OR_B32_dpp;
5661 case AMDGPU::S_XOR_B32:
5662 return AMDGPU::V_XOR_B32_dpp;
5681 unsigned Stratergy =
static_cast<unsigned>(
MI.getOperand(2).
getImm());
5682 enum WAVE_REDUCE_STRATEGY :
unsigned {
DEFAULT = 0, ITERATIVE = 1,
DPP = 2 };
5686 case AMDGPU::S_MIN_U32:
5687 case AMDGPU::S_MIN_I32:
5688 case AMDGPU::V_MIN_F32_e64:
5689 case AMDGPU::S_MAX_U32:
5690 case AMDGPU::S_MAX_I32:
5691 case AMDGPU::V_MAX_F32_e64:
5692 case AMDGPU::S_AND_B32:
5693 case AMDGPU::S_OR_B32: {
5699 case AMDGPU::V_CMP_LT_U64_e64:
5700 case AMDGPU::V_CMP_LT_I64_e64:
5701 case AMDGPU::V_CMP_GT_U64_e64:
5702 case AMDGPU::V_CMP_GT_I64_e64:
5703 case AMDGPU::V_MIN_F64_e64:
5704 case AMDGPU::V_MIN_NUM_F64_e64:
5705 case AMDGPU::V_MAX_F64_e64:
5706 case AMDGPU::V_MAX_NUM_F64_e64:
5707 case AMDGPU::S_AND_B64:
5708 case AMDGPU::S_OR_B64: {
5714 case AMDGPU::S_XOR_B32:
5715 case AMDGPU::S_XOR_B64:
5716 case AMDGPU::S_ADD_I32:
5717 case AMDGPU::S_ADD_U64_PSEUDO:
5718 case AMDGPU::V_ADD_F32_e64:
5719 case AMDGPU::V_ADD_F64_e64:
5720 case AMDGPU::V_ADD_F64_pseudo_e64:
5721 case AMDGPU::S_SUB_I32:
5722 case AMDGPU::S_SUB_U64_PSEUDO:
5723 case AMDGPU::V_SUB_F32_e64: {
5730 bool IsWave32 = ST.isWave32();
5731 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5732 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5733 unsigned BitCountOpc =
5734 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5738 auto NewAccumulator =
5743 case AMDGPU::S_XOR_B32:
5744 case AMDGPU::S_XOR_B64: {
5753 .
addReg(NewAccumulator->getOperand(0).getReg())
5756 if (
Opc == AMDGPU::S_XOR_B32) {
5768 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5771 MI, MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5773 MI, MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5783 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5791 case AMDGPU::S_SUB_I32: {
5800 .
addReg(NewAccumulator->getOperand(0).getReg());
5803 case AMDGPU::S_ADD_I32: {
5806 .
addReg(NewAccumulator->getOperand(0).getReg());
5809 case AMDGPU::S_ADD_U64_PSEUDO:
5810 case AMDGPU::S_SUB_U64_PSEUDO: {
5826 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5829 MI, MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5831 MI, MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5833 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5836 .
addReg(NewAccumulator->getOperand(0).getReg())
5846 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5848 : NewAccumulator->getOperand(0).getReg();
5859 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5865 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5871 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5878 case AMDGPU::V_ADD_F32_e64:
5879 case AMDGPU::V_ADD_F64_e64:
5880 case AMDGPU::V_ADD_F64_pseudo_e64:
5881 case AMDGPU::V_SUB_F32_e64: {
5888 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5889 : AMDGPU::V_CVT_F64_I32_e64),
5891 .
addReg(NewAccumulator->getOperand(0).getReg())
5897 (
Opc == AMDGPU::V_SUB_F32_e64 ||
5898 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5901 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5903 ? AMDGPU::V_MUL_F64_pseudo_e64
5904 : AMDGPU::V_MUL_F64_e64;
5914 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5922 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5924 TII->buildExtractSubRegOrImm(
MI, MRI, DestVregInst->getOperand(0),
5925 VregRC, AMDGPU::sub0, VregSubRC);
5927 TII->buildExtractSubRegOrImm(
MI, MRI, DestVregInst->getOperand(0),
5928 VregRC, AMDGPU::sub1, VregSubRC);
5937 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5957 bool IsWave32 = ST.isWave32();
5958 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5959 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5960 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
5987 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5991 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6004 I = ComputeLoop->begin();
6006 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
6010 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
6014 I = ComputeLoop->end();
6018 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6022 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6031 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
6043 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6059 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6061 MI, MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6063 MI, MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6065 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6069 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6074 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
6081 case AMDGPU::S_OR_B64:
6082 case AMDGPU::S_AND_B64:
6083 case AMDGPU::S_XOR_B64: {
6086 .
addReg(LaneValue->getOperand(0).getReg())
6090 case AMDGPU::V_CMP_GT_I64_e64:
6091 case AMDGPU::V_CMP_GT_U64_e64:
6092 case AMDGPU::V_CMP_LT_I64_e64:
6093 case AMDGPU::V_CMP_LT_U64_e64: {
6098 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6100 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6102 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6105 MI, MRI,
Accumulator->getOperand(0), VregClass, AMDGPU::sub0,
6108 MI, MRI,
Accumulator->getOperand(0), VregClass, AMDGPU::sub1,
6110 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
6117 .
addReg(LaneValue->getOperand(0).getReg())
6118 .
addReg(AccumulatorVReg);
6120 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6121 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6125 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6126 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6127 .
addReg(LaneValue->getOperand(0).getReg())
6131 case AMDGPU::V_MIN_F64_e64:
6132 case AMDGPU::V_MIN_NUM_F64_e64:
6133 case AMDGPU::V_MAX_F64_e64:
6134 case AMDGPU::V_MAX_NUM_F64_e64:
6135 case AMDGPU::V_ADD_F64_e64:
6136 case AMDGPU::V_ADD_F64_pseudo_e64: {
6138 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6140 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6142 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6149 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::COPY), AccumulatorVReg)
6152 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6158 .
addReg(LaneValue->getOperand(0).getReg())
6165 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6168 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6171 Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub0,
6174 Iters, MRI, DstVregInst->getOperand(0), VregRC, AMDGPU::sub1,
6176 ReadLaneLo.add(Op1L);
6177 ReadLaneHi.add(Op1H);
6178 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6179 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6186 case AMDGPU::S_ADD_U64_PSEUDO:
6187 case AMDGPU::S_SUB_U64_PSEUDO: {
6190 .
addReg(LaneValue->getOperand(0).getReg());
6198 unsigned BITSETOpc =
6199 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6200 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
6206 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6209 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6211 .
addReg(NewActiveBitsReg)
6213 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
6218 assert(ST.hasDPP() &&
"Sub Target does not support DPP Operations");
6242 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_SET_INACTIVE_B32), SrcWithIdentity)
6262 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentity,
6265 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6268 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6271 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6274 if (ST.hasDPPBroadcasts()) {
6284 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::DS_SWIZZLE_B32), SwizzledValue)
6291 Opc == AMDGPU::S_SUB_I32
6292 ?
static_cast<unsigned>(AMDGPU::S_ADD_I32)
6297 if (
TII->hasIntClamp(*ClampInstr) ||
TII->hasFPClamp(*ClampInstr))
6300 FinalDPPResult = RowBcast15;
6302 if (ST.hasDPPBroadcasts()) {
6330 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_ADD_U32_e64), ShiftedThreadID)
6340 .
addReg(ShiftedThreadID);
6342 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::DS_PERMUTE_B32), PermutedValue)
6343 .
addReg(PermuteByteOffset)
6349 Opc == AMDGPU::S_SUB_I32
6350 ?
static_cast<unsigned>(AMDGPU::S_ADD_I32)
6355 if (
TII->hasIntClamp(*ClampInstr) ||
TII->hasFPClamp(*ClampInstr))
6358 FinalDPPResult = RowBcast31;
6361 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READLANE_B32), ReducedValSGPR)
6363 .
addImm(ST.getWavefrontSize() - 1);
6364 if (
Opc == AMDGPU::S_SUB_I32)
6365 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6370 .
addReg(
Opc == AMDGPU::S_SUB_I32 ? NegatedReducedVal
6375 MI.eraseFromParent();
6390 switch (
MI.getOpcode()) {
6391 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6393 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6395 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6397 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6399 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6401 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6404 ? AMDGPU::V_MIN_NUM_F64_e64
6405 : AMDGPU::V_MIN_F64_e64);
6406 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6408 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6410 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6412 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6414 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6416 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6419 ? AMDGPU::V_MAX_NUM_F64_e64
6420 : AMDGPU::V_MAX_F64_e64);
6421 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6423 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6425 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6427 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6430 ? AMDGPU::V_ADD_F64_pseudo_e64
6431 : AMDGPU::V_ADD_F64_e64);
6432 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6434 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6436 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6438 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6443 ? AMDGPU::V_ADD_F64_pseudo_e64
6444 : AMDGPU::V_ADD_F64_e64);
6445 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6447 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6449 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6451 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6453 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6455 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6457 case AMDGPU::S_UADDO_PSEUDO:
6458 case AMDGPU::S_USUBO_PSEUDO: {
6464 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6466 : AMDGPU::S_SUB_U32;
6474 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6477 MI.eraseFromParent();
6480 case AMDGPU::S_ADD_U64_PSEUDO:
6481 case AMDGPU::S_SUB_U64_PSEUDO: {
6484 case AMDGPU::V_ADD_U64_PSEUDO:
6485 case AMDGPU::V_SUB_U64_PSEUDO: {
6486 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6492 if (ST.hasAddSubU64Insts()) {
6494 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6495 : AMDGPU::V_SUB_U64_e64),
6500 TII->legalizeOperands(*
I);
6501 MI.eraseFromParent();
6505 if (IsAdd && ST.hasLshlAddU64Inst()) {
6511 TII->legalizeOperands(*
Add);
6512 MI.eraseFromParent();
6516 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6526 : &AMDGPU::VReg_64RegClass;
6529 : &AMDGPU::VReg_64RegClass;
6532 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6534 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6537 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6539 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6542 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6544 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6547 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6554 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6568 TII->legalizeOperands(*LoHalf);
6569 TII->legalizeOperands(*HiHalf);
6570 MI.eraseFromParent();
6573 case AMDGPU::S_ADD_CO_PSEUDO:
6574 case AMDGPU::S_SUB_CO_PSEUDO: {
6586 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6592 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6597 if (
TRI->isVectorRegister(MRI, Src2.
getReg())) {
6598 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6603 if (ST.isWave64()) {
6604 if (ST.hasScalarCompareEq64()) {
6611 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6613 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6615 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6618 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6632 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6633 ? AMDGPU::S_ADDC_U32
6634 : AMDGPU::S_SUBB_U32;
6639 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6645 MI.eraseFromParent();
6648 case AMDGPU::SI_INIT_M0: {
6651 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6654 MI.eraseFromParent();
6657 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6660 TII->get(AMDGPU::S_CMP_EQ_U32))
6665 case AMDGPU::GET_GROUPSTATICSIZE: {
6669 .
add(
MI.getOperand(0))
6671 MI.eraseFromParent();
6674 case AMDGPU::GET_SHADERCYCLESHILO: {
6689 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6692 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6695 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6704 .
add(
MI.getOperand(0))
6709 MI.eraseFromParent();
6712 case AMDGPU::SI_INDIRECT_SRC_V1:
6713 case AMDGPU::SI_INDIRECT_SRC_V2:
6714 case AMDGPU::SI_INDIRECT_SRC_V3:
6715 case AMDGPU::SI_INDIRECT_SRC_V4:
6716 case AMDGPU::SI_INDIRECT_SRC_V5:
6717 case AMDGPU::SI_INDIRECT_SRC_V6:
6718 case AMDGPU::SI_INDIRECT_SRC_V7:
6719 case AMDGPU::SI_INDIRECT_SRC_V8:
6720 case AMDGPU::SI_INDIRECT_SRC_V9:
6721 case AMDGPU::SI_INDIRECT_SRC_V10:
6722 case AMDGPU::SI_INDIRECT_SRC_V11:
6723 case AMDGPU::SI_INDIRECT_SRC_V12:
6724 case AMDGPU::SI_INDIRECT_SRC_V16:
6725 case AMDGPU::SI_INDIRECT_SRC_V32:
6727 case AMDGPU::SI_INDIRECT_DST_V1:
6728 case AMDGPU::SI_INDIRECT_DST_V2:
6729 case AMDGPU::SI_INDIRECT_DST_V3:
6730 case AMDGPU::SI_INDIRECT_DST_V4:
6731 case AMDGPU::SI_INDIRECT_DST_V5:
6732 case AMDGPU::SI_INDIRECT_DST_V6:
6733 case AMDGPU::SI_INDIRECT_DST_V7:
6734 case AMDGPU::SI_INDIRECT_DST_V8:
6735 case AMDGPU::SI_INDIRECT_DST_V9:
6736 case AMDGPU::SI_INDIRECT_DST_V10:
6737 case AMDGPU::SI_INDIRECT_DST_V11:
6738 case AMDGPU::SI_INDIRECT_DST_V12:
6739 case AMDGPU::SI_INDIRECT_DST_V16:
6740 case AMDGPU::SI_INDIRECT_DST_V32:
6742 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6743 case AMDGPU::SI_KILL_I1_PSEUDO:
6745 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6749 Register SrcCond =
MI.getOperand(3).getReg();
6753 const auto *CondRC =
TRI->getWaveMaskRegClass();
6758 : &AMDGPU::VReg_64RegClass;
6761 : &AMDGPU::VReg_64RegClass;
6764 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6766 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6769 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6771 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6774 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6776 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6797 MI.eraseFromParent();
6800 case AMDGPU::SI_BR_UNDEF: {
6802 .
add(
MI.getOperand(0));
6804 MI.eraseFromParent();
6807 case AMDGPU::ADJCALLSTACKUP:
6808 case AMDGPU::ADJCALLSTACKDOWN: {
6815 case AMDGPU::SI_CALL_ISEL: {
6816 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
6819 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6825 MI.eraseFromParent();
6828 case AMDGPU::V_ADD_CO_U32_e32:
6829 case AMDGPU::V_SUB_CO_U32_e32:
6830 case AMDGPU::V_SUBREV_CO_U32_e32: {
6832 unsigned Opc =
MI.getOpcode();
6834 bool NeedClampOperand =
false;
6835 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
6837 NeedClampOperand =
true;
6841 if (
TII->isVOP3(*
I)) {
6844 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
6845 if (NeedClampOperand)
6848 TII->legalizeOperands(*
I);
6850 MI.eraseFromParent();
6853 case AMDGPU::V_ADDC_U32_e32:
6854 case AMDGPU::V_SUBB_U32_e32:
6855 case AMDGPU::V_SUBBREV_U32_e32:
6858 TII->legalizeOperands(
MI);
6860 case AMDGPU::DS_GWS_INIT:
6861 case AMDGPU::DS_GWS_SEMA_BR:
6862 case AMDGPU::DS_GWS_BARRIER:
6863 case AMDGPU::DS_GWS_SEMA_V:
6864 case AMDGPU::DS_GWS_SEMA_P:
6865 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6873 case AMDGPU::S_SETREG_B32: {
6889 const unsigned SetMask = WidthMask <<
Offset;
6892 unsigned SetDenormOp = 0;
6893 unsigned SetRoundOp = 0;
6901 SetRoundOp = AMDGPU::S_ROUND_MODE;
6902 SetDenormOp = AMDGPU::S_DENORM_MODE;
6904 SetRoundOp = AMDGPU::S_ROUND_MODE;
6906 SetDenormOp = AMDGPU::S_DENORM_MODE;
6909 if (SetRoundOp || SetDenormOp) {
6911 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6912 unsigned ImmVal = Def->getOperand(1).getImm();
6926 MI.eraseFromParent();
6935 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
6939 case AMDGPU::S_INVERSE_BALLOT_U32:
6940 case AMDGPU::S_INVERSE_BALLOT_U64:
6943 MI.setDesc(
TII->get(AMDGPU::COPY));
6945 case AMDGPU::ENDPGM_TRAP: {
6947 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
6967 MI.eraseFromParent();
6970 case AMDGPU::SIMULATED_TRAP: {
6971 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6973 TII->insertSimulatedTrap(MRI, *BB,
MI,
MI.getDebugLoc());
6974 MI.eraseFromParent();
6977 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6978 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6984 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6985 Register OriginalExec = Setup->getOperand(0).getReg();
6987 MI.getOperand(0).setReg(OriginalExec);
7024 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7028 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7055 if (!Subtarget->hasMadMacF32Insts())
7056 return Subtarget->hasFastFMAF32();
7062 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7065 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7081 switch (Ty.getScalarSizeInBits()) {
7099 if (Ty.getScalarSizeInBits() == 16)
7101 if (Ty.getScalarSizeInBits() == 32)
7102 return Subtarget->hasMadMacF32Insts() &&
7112 EVT VT =
N->getValueType(0);
7114 return Subtarget->hasMadMacF32Insts() &&
7116 if (VT == MVT::f16) {
7117 return Subtarget->hasMadF16() &&
7132 unsigned Opc =
Op.getOpcode();
7133 EVT VT =
Op.getValueType();
7134 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7135 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7136 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7137 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7138 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7139 VT == MVT::v32bf16);
7155 [[maybe_unused]]
EVT VT =
Op.getValueType();
7157 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7158 VT == MVT::v16i32) &&
7159 "Unexpected ValueType.");
7168 unsigned Opc =
Op.getOpcode();
7169 EVT VT =
Op.getValueType();
7170 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7171 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7172 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7173 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7174 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7175 VT == MVT::v32bf16);
7183 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
7185 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
7192 unsigned Opc =
Op.getOpcode();
7193 EVT VT =
Op.getValueType();
7194 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7195 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7196 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7197 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7198 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7199 VT == MVT::v32bf16);
7204 : std::pair(Op0, Op0);
7213 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
7215 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
7221 switch (
Op.getOpcode()) {
7225 return LowerBRCOND(
Op, DAG);
7227 return LowerRETURNADDR(
Op, DAG);
7229 return LowerSPONENTRY(
Op, DAG);
7232 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7233 "Load should return a value and a chain");
7237 EVT VT =
Op.getValueType();
7239 return lowerFSQRTF32(
Op, DAG);
7241 return lowerFSQRTF64(
Op, DAG);
7246 return LowerTrig(
Op, DAG);
7248 return LowerSELECT(
Op, DAG);
7250 return LowerFDIV(
Op, DAG);
7252 return LowerFFREXP(
Op, DAG);
7254 return LowerATOMIC_CMP_SWAP(
Op, DAG);
7256 return LowerSTORE(
Op, DAG);
7260 return LowerGlobalAddress(MFI,
Op, DAG);
7263 return LowerExternalSymbol(
Op, DAG);
7265 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
7267 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
7269 return LowerINTRINSIC_VOID(
Op, DAG);
7271 return lowerADDRSPACECAST(
Op, DAG);
7273 return lowerINSERT_SUBVECTOR(
Op, DAG);
7275 return lowerINSERT_VECTOR_ELT(
Op, DAG);
7277 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
7279 return lowerVECTOR_SHUFFLE(
Op, DAG);
7281 return lowerSCALAR_TO_VECTOR(
Op, DAG);
7283 return lowerBUILD_VECTOR(
Op, DAG);
7286 return lowerFP_ROUND(
Op, DAG);
7288 return lowerTRAP(
Op, DAG);
7290 return lowerDEBUGTRAP(
Op, DAG);
7299 return lowerFMINNUM_FMAXNUM(
Op, DAG);
7302 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
7305 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
7308 return lowerFLDEXP(
Op, DAG);
7314 Op.getValueType() == MVT::i16 &&
7315 Op.getOperand(0).getValueType() == MVT::f32) {
7339 return lowerFCOPYSIGN(
Op, DAG);
7341 return lowerMUL(
Op, DAG);
7344 return lowerXMULO(
Op, DAG);
7347 return lowerXMUL_LOHI(
Op, DAG);
7382 EVT FittingLoadVT = LoadVT;
7414SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7417 bool IsIntrinsic)
const {
7420 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7421 EVT LoadVT =
M->getValueType(0);
7423 EVT EquivLoadVT = LoadVT;
7437 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7441 M->getMemoryVT(),
M->getMemOperand());
7452 EVT LoadVT =
M->getValueType(0);
7458 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7459 bool IsTFE =
M->getNumValues() == 3;
7461 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7462 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7463 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7464 : AMDGPUISD::BUFFER_LOAD;
7467 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7472 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7476 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7477 M->getMemOperand(), DAG);
7481 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7483 M->getMemOperand(), DAG);
7491 EVT VT =
N->getValueType(0);
7492 unsigned CondCode =
N->getConstantOperandVal(3);
7503 EVT CmpVT =
LHS.getValueType();
7504 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7505 unsigned PromoteOp =
7525 EVT VT =
N->getValueType(0);
7527 unsigned CondCode =
N->getConstantOperandVal(3);
7536 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7545 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7554 EVT VT =
N->getValueType(0);
7578 Exec = AMDGPU::EXEC_LO;
7580 Exec = AMDGPU::EXEC;
7597 EVT VT =
N->getValueType(0);
7599 unsigned IID =
N->getConstantOperandVal(0);
7600 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7601 IID == Intrinsic::amdgcn_permlanex16;
7602 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7603 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7607 unsigned SplitSize = 32;
7608 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7609 ST->hasDPALU_DPP() &&
7617 case Intrinsic::amdgcn_permlane16:
7618 case Intrinsic::amdgcn_permlanex16:
7619 case Intrinsic::amdgcn_update_dpp:
7624 case Intrinsic::amdgcn_writelane:
7627 case Intrinsic::amdgcn_readlane:
7628 case Intrinsic::amdgcn_set_inactive:
7629 case Intrinsic::amdgcn_set_inactive_chain_arg:
7630 case Intrinsic::amdgcn_mov_dpp8:
7633 case Intrinsic::amdgcn_readfirstlane:
7634 case Intrinsic::amdgcn_permlane64:
7642 std::reverse(Operands.
begin(), Operands.
end());
7644 if (
SDNode *GL =
N->getGluedNode()) {
7646 GL = GL->getOperand(0).getNode();
7656 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7657 IID == Intrinsic::amdgcn_mov_dpp8 ||
7658 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7659 Src1 =
N->getOperand(2);
7660 if (IID == Intrinsic::amdgcn_writelane ||
7661 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7662 Src2 =
N->getOperand(3);
7665 if (ValSize == SplitSize) {
7675 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7680 if (IID == Intrinsic::amdgcn_writelane) {
7685 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7687 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7690 if (ValSize % SplitSize != 0)
7694 EVT VT =
N->getValueType(0);
7698 unsigned NumOperands =
N->getNumOperands();
7700 SDNode *GL =
N->getGluedNode();
7705 for (
unsigned i = 0; i != NE; ++i) {
7706 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7708 SDValue Operand =
N->getOperand(j);
7717 Operands[j] = Operand;
7722 Operands[NumOperands - 1] =
7738 if (SplitSize == 32) {
7740 return unrollLaneOp(LaneOp.
getNode());
7746 unsigned SubVecNumElt =
7750 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7751 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7755 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7760 if (IID == Intrinsic::amdgcn_writelane)
7765 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7766 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7767 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7768 EltIdx += SubVecNumElt;
7782 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7785 if (IID == Intrinsic::amdgcn_writelane)
7788 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7795 EVT VT =
N->getValueType(0);
7813 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
7817 Operands.
append(IntrinArgs);
7823 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7824 {ShiftedIndex, ValueI32});
7834 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7835 {ValueI32, PoisonVal});
7836 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7837 {ShiftedIndex, PoisonVal});
7840 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7843 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7844 {WWMIndex, WWMValue});
7845 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7846 MVT::i32, {WWMIndex, Swapped});
7848 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7856 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7864 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
7874 switch (
N->getOpcode()) {
7886 unsigned IID =
N->getConstantOperandVal(0);
7888 case Intrinsic::amdgcn_make_buffer_rsrc:
7889 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7891 case Intrinsic::amdgcn_cvt_pkrtz: {
7896 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7900 case Intrinsic::amdgcn_cvt_pknorm_i16:
7901 case Intrinsic::amdgcn_cvt_pknorm_u16:
7902 case Intrinsic::amdgcn_cvt_pk_i16:
7903 case Intrinsic::amdgcn_cvt_pk_u16: {
7909 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7910 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7911 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7912 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7913 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7914 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7916 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7918 EVT VT =
N->getValueType(0);
7927 case Intrinsic::amdgcn_s_buffer_load: {
7933 if (!Subtarget->hasScalarSubwordLoads())
7939 EVT VT =
Op.getValueType();
7940 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7952 if (!
Offset->isDivergent()) {
7971 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7976 case Intrinsic::amdgcn_dead: {
7977 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7988 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7989 Results.push_back(Res.getOperand(
I));
7993 Results.push_back(Res.getValue(1));
8002 EVT VT =
N->getValueType(0);
8007 EVT SelectVT = NewVT;
8008 if (NewVT.
bitsLT(MVT::i32)) {
8011 SelectVT = MVT::i32;
8017 if (NewVT != SelectVT)
8023 if (
N->getValueType(0) != MVT::v2f16)
8035 if (
N->getValueType(0) != MVT::v2f16)
8047 if (
N->getValueType(0) != MVT::f16)
8062 if (U.get() !=
Value)
8065 if (U.getUser()->getOpcode() == Opcode)
8071unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
8074 case Intrinsic::amdgcn_if:
8075 return AMDGPUISD::IF;
8076 case Intrinsic::amdgcn_else:
8077 return AMDGPUISD::ELSE;
8078 case Intrinsic::amdgcn_loop:
8079 return AMDGPUISD::LOOP;
8080 case Intrinsic::amdgcn_end_cf:
8100 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8127 SDNode *Intr = BRCOND.getOperand(1).getNode();
8144 Intr =
LHS.getNode();
8152 assert(BR &&
"brcond missing unconditional branch user");
8157 unsigned CFNode = isCFIntrinsic(Intr);
8177 Ops.push_back(Target);
8200 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
8219 MVT VT =
Op.getSimpleValueType();
8222 if (
Op.getConstantOperandVal(0) != 0)
8226 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8228 if (
Info->isEntryFunction())
8245 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
8259 return Op.getValueType().bitsLE(VT)
8267 EVT DstVT =
Op.getValueType();
8274 unsigned Opc =
Op.getOpcode();
8286 EVT SrcVT = Src.getValueType();
8287 EVT DstVT =
Op.getValueType();
8290 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8293 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8300 if (DstVT == MVT::f16) {
8305 if (!Subtarget->has16BitInsts()) {
8310 if (
Op->getFlags().hasApproximateFuncs()) {
8321 "custom lower FP_ROUND for f16 or bf16");
8322 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8334 EVT VT =
Op.getValueType();
8336 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8337 bool IsIEEEMode =
Info->getMode().IEEE;
8346 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8353SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8355 EVT VT =
Op.getValueType();
8357 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8358 bool IsIEEEMode =
Info->getMode().IEEE;
8363 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8371 EVT VT =
Op.getValueType();
8375 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8376 !Subtarget->hasMinimum3Maximum3F16() &&
8377 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8378 "should not need to widen f16 minimum/maximum to v2f16");
8392 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8400 EVT VT =
Op.getValueType();
8404 EVT ExpVT =
Exp.getValueType();
8405 if (ExpVT == MVT::i16)
8426 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8433 switch (
Op->getOpcode()) {
8463 DAGCombinerInfo &DCI)
const {
8464 const unsigned Opc =
Op.getOpcode();
8472 :
Op->getOperand(0).getValueType();
8473 auto &DAG = DCI.DAG;
8476 if (DCI.isBeforeLegalizeOps() ||
8484 LHS =
Op->getOperand(1);
8485 RHS =
Op->getOperand(2);
8487 LHS =
Op->getOperand(0);
8488 RHS =
Op->getOperand(1);
8527 if (MagVT == SignVT)
8544 EVT VT =
Op.getValueType();
8550 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8577 if (
Op->isDivergent())
8590 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8592 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8595 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8597 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8603 EVT VT =
Op.getValueType();
8610 const APInt &
C = RHSC->getAPIntValue();
8612 if (
C.isPowerOf2()) {
8614 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8641 if (
Op->isDivergent()) {
8645 if (Subtarget->hasSMulHi()) {
8656 if (!Subtarget->hasTrapHandler() ||
8658 return lowerTrapEndpgm(
Op, DAG);
8660 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8661 : lowerTrapHsaQueuePtr(
Op, DAG);
8667 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8671SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8673 ImplicitParameter Param)
const {
8677 MachinePointerInfo PtrInfo =
8694 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8697 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8700 if (UserSGPR == AMDGPU::NoRegister) {
8717 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8726 if (Subtarget->hasPrivEnabledTrap2NopBug())
8727 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8731 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8739 if (!Subtarget->hasTrapHandler() ||
8743 "debugtrap handler not supported",
8751 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8754SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8756 if (Subtarget->hasApertureRegs()) {
8758 ? AMDGPU::SRC_SHARED_BASE
8759 : AMDGPU::SRC_PRIVATE_BASE;
8760 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8761 !Subtarget->hasGloballyAddressableScratch()) &&
8762 "Cannot use src_private_base with globally addressable scratch!");
8783 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8787 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8789 if (UserSGPR == AMDGPU::NoRegister) {
8834 const AMDGPUTargetMachine &TM =
8837 unsigned DestAS, SrcAS;
8839 bool IsNonNull =
false;
8841 SrcAS = ASC->getSrcAddressSpace();
8842 Src = ASC->getOperand(0);
8843 DestAS = ASC->getDestAddressSpace();
8846 Op.getConstantOperandVal(0) ==
8847 Intrinsic::amdgcn_addrspacecast_nonnull);
8848 Src =
Op->getOperand(1);
8849 SrcAS =
Op->getConstantOperandVal(2);
8850 DestAS =
Op->getConstantOperandVal(3);
8863 Subtarget->hasGloballyAddressableScratch()) {
8868 AMDGPU::S_MOV_B32, SL, MVT::i32,
8869 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8892 Subtarget->hasGloballyAddressableScratch()) {
8901 if (Subtarget->isWave64())
8907 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8915 AMDGPU::S_MOV_B64, SL, MVT::i64,
8916 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8918 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8920 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8940 Op.getValueType() == MVT::i64) {
8941 const SIMachineFunctionInfo *
Info =
8943 if (
Info->get32BitAddressHighBits() == 0)
8952 Src.getValueType() == MVT::i64)
8980 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8985 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8987 MVT::i32, InsNumElts / 2);
8992 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8994 if (InsNumElts == 2) {
9007 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
9030 if (NumElts == 4 && EltSize == 16 && KIdx) {
9041 unsigned Idx = KIdx->getZExtValue();
9042 bool InsertLo = Idx < 2;
9046 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9052 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9065 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
9100 EVT ResultVT =
Op.getValueType();
9113 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
9116 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9120 if (VecSize == 128) {
9128 }
else if (VecSize == 256) {
9131 for (
unsigned P = 0;
P < 4; ++
P) {
9137 Parts[0], Parts[1]));
9139 Parts[2], Parts[3]));
9145 for (
unsigned P = 0;
P < 8; ++
P) {
9152 Parts[0], Parts[1], Parts[2], Parts[3]));
9155 Parts[4], Parts[5], Parts[6], Parts[7]));
9175 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9190 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9200 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9205 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9206 !(Mask[Elt + 1] & 1);
9212 EVT ResultVT =
Op.getValueType();
9215 const int NewSrcNumElts = 2;
9217 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
9233 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
9255 if (ShouldUseConsecutiveExtract &&
9258 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9259 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9271 if (Idx0 >= SrcNumElts) {
9276 if (Idx1 >= SrcNumElts) {
9281 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9282 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9290 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9291 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9296 if (SubVec0 != SubVec1) {
9297 NewMaskIdx1 += NewSrcNumElts;
9304 {NewMaskIdx0, NewMaskIdx1});
9309 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9310 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9311 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9312 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9331 EVT ResultVT =
Op.getValueType();
9347 EVT VT =
Op.getValueType();
9349 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9350 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9384 for (
unsigned P = 0;
P < NumParts; ++
P) {
9386 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9405 if (!Subtarget->isAmdHsaOS())
9448 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9457 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9465 EVT PtrVT =
Op.getValueType();
9467 const GlobalValue *GV = GSD->
getGlobal();
9481 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9496 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9499 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9500 if (Subtarget->has64BitLiterals()) {
9531 MachinePointerInfo PtrInfo =
9544 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9568 SDValue Param = lowerKernargMemParameter(
9579 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9587 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9595 unsigned NumElts = Elts.
size();
9597 if (NumElts <= 12) {
9606 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9612 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9622 EVT SrcVT = Src.getValueType();
9643 bool Unpacked,
bool IsD16,
int DMaskPop,
9644 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9648 EVT ReqRetVT = ResultTypes[0];
9650 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9651 ? (ReqRetNumElts + 1) / 2
9654 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9665 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9676 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9678 NumDataDwords - MaskPopDwords);
9683 EVT LegalReqRetVT = ReqRetVT;
9685 if (!
Data.getValueType().isInteger())
9687 Data.getValueType().changeTypeToInteger(),
Data);
9708 if (Result->getNumValues() == 1)
9715 SDValue *LWE,
bool &IsTexFail) {
9735 unsigned DimIdx,
unsigned EndIdx,
9736 unsigned NumGradients) {
9738 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9746 if (((
I + 1) >= EndIdx) ||
9747 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9748 I == DimIdx + NumGradients - 1))) {
9770 !
Op.getNode()->hasAnyUseOfValue(0))
9772 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9782 ResultTypes.erase(&ResultTypes[0]);
9788 int NumVDataDwords = 0;
9789 bool AdjustRetType =
false;
9790 bool IsAtomicPacked16Bit =
false;
9793 const unsigned ArgOffset = WithChain ? 2 : 1;
9796 unsigned DMaskLanes = 0;
9798 if (BaseOpcode->
Atomic) {
9799 VData =
Op.getOperand(2);
9801 IsAtomicPacked16Bit =
9802 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9803 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9804 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9805 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9816 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9818 DMask = Is64Bit ? 0xf : 0x3;
9819 NumVDataDwords = Is64Bit ? 4 : 2;
9821 DMask = Is64Bit ? 0x3 : 0x1;
9822 NumVDataDwords = Is64Bit ? 2 : 1;
9825 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9828 if (BaseOpcode->
Store) {
9829 VData =
Op.getOperand(2);
9833 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9837 VData = handleD16VData(VData, DAG,
true);
9840 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9841 }
else if (!BaseOpcode->
NoReturn) {
9846 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9854 (!LoadVT.
isVector() && DMaskLanes > 1))
9860 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9861 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9862 NumVDataDwords = (DMaskLanes + 1) / 2;
9864 NumVDataDwords = DMaskLanes;
9866 AdjustRetType =
true;
9870 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9877 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9878 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9880 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9882 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9883 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9887 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9893 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9897 "Bias needs to be converted to 16 bit in A16 mode");
9902 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9906 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9907 "require 16 bit args for both gradients and addresses");
9912 if (!
ST->hasA16()) {
9913 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9914 "support 16 bit addresses\n");
9924 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9926 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9928 IntrOpcode = G16MappingInfo->
G16;
9951 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9969 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9970 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9971 const bool UseNSA =
ST->hasNSAEncoding() &&
9972 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9973 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9974 const bool UsePartialNSA =
9975 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9978 if (UsePartialNSA) {
9980 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9981 }
else if (!UseNSA) {
9991 uint64_t UnormConst =
9992 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9994 Unorm = UnormConst ? True : False;
10000 bool IsTexFail =
false;
10001 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10010 NumVDataDwords = 1;
10012 NumVDataDwords += 1;
10013 AdjustRetType =
true;
10018 if (AdjustRetType) {
10021 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
10030 MVT::i32, NumVDataDwords)
10033 ResultTypes[0] = NewVT;
10034 if (ResultTypes.size() == 3) {
10038 ResultTypes.erase(&ResultTypes[1]);
10052 Ops.push_back(VData);
10053 if (UsePartialNSA) {
10055 Ops.push_back(VAddr);
10059 Ops.push_back(VAddr);
10062 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10064 Ops.push_back(Rsrc);
10069 Ops.push_back(Samp);
10074 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10075 Ops.push_back(Unorm);
10077 Ops.push_back(IsA16 &&
10078 ST->hasFeature(AMDGPU::FeatureR128A16)
10082 Ops.push_back(IsA16 ? True : False);
10084 if (!Subtarget->hasGFX90AInsts())
10085 Ops.push_back(TFE);
10089 "TFE is not supported on this GPU",
DL.getDebugLoc()));
10092 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10093 Ops.push_back(LWE);
10095 Ops.push_back(DimInfo->
DA ? True : False);
10097 Ops.push_back(IsD16 ? True : False);
10099 Ops.push_back(
Op.getOperand(0));
10101 int NumVAddrDwords =
10107 NumVDataDwords, NumVAddrDwords);
10108 }
else if (IsGFX11Plus) {
10110 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10111 : AMDGPU::MIMGEncGfx11Default,
10112 NumVDataDwords, NumVAddrDwords);
10113 }
else if (IsGFX10Plus) {
10115 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10116 : AMDGPU::MIMGEncGfx10Default,
10117 NumVDataDwords, NumVAddrDwords);
10119 if (Subtarget->hasGFX90AInsts()) {
10121 NumVDataDwords, NumVAddrDwords);
10122 if (Opcode == -1) {
10125 "requested image instruction is not supported on this GPU",
10126 DL.getDebugLoc()));
10130 for (EVT VT : OrigResultTypes) {
10131 if (VT == MVT::Other)
10132 RetValues[Idx++] =
Op.getOperand(0);
10140 if (Opcode == -1 &&
10143 NumVDataDwords, NumVAddrDwords);
10146 NumVDataDwords, NumVAddrDwords);
10153 MachineMemOperand *MemRef = MemOp->getMemOperand();
10172 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10173 NumVDataDwords, IsAtomicPacked16Bit,
DL);
10186 MachinePointerInfo(),
10191 if (!
Offset->isDivergent()) {
10198 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10207 !Subtarget->hasScalarDwordx3Loads()) {
10211 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
10234 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10236 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
10240 unsigned NumLoads = 1;
10246 if (NumElts == 8 || NumElts == 16) {
10247 NumLoads = NumElts / 4;
10251 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
10256 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
10258 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
10259 for (
unsigned i = 0; i < NumLoads; ++i) {
10261 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
10262 LoadVT, MMO, DAG));
10265 if (NumElts == 8 || NumElts == 16)
10273 if (!Subtarget->hasArchitectedSGPRs())
10278 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10285 unsigned Width)
const {
10287 using namespace AMDGPU::Hwreg;
10289 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10328 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10330 EVT VT =
Op.getValueType();
10332 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
10336 switch (IntrinsicID) {
10337 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10340 return getPreloadedValue(DAG, *MFI, VT,
10343 case Intrinsic::amdgcn_dispatch_ptr:
10344 case Intrinsic::amdgcn_queue_ptr: {
10345 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10347 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10348 DL.getDebugLoc()));
10352 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10355 return getPreloadedValue(DAG, *MFI, VT, RegID);
10357 case Intrinsic::amdgcn_implicitarg_ptr: {
10359 return getImplicitArgPtr(DAG,
DL);
10360 return getPreloadedValue(DAG, *MFI, VT,
10363 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10369 return getPreloadedValue(DAG, *MFI, VT,
10372 case Intrinsic::amdgcn_dispatch_id: {
10375 case Intrinsic::amdgcn_rcp:
10376 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
10377 case Intrinsic::amdgcn_rsq:
10378 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
10379 case Intrinsic::amdgcn_rsq_legacy:
10383 case Intrinsic::amdgcn_rcp_legacy:
10386 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
10387 case Intrinsic::amdgcn_rsq_clamp: {
10389 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
10401 case Intrinsic::r600_read_ngroups_x:
10402 if (Subtarget->isAmdHsaOS())
10405 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10408 case Intrinsic::r600_read_ngroups_y:
10409 if (Subtarget->isAmdHsaOS())
10412 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10415 case Intrinsic::r600_read_ngroups_z:
10416 if (Subtarget->isAmdHsaOS())
10419 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10422 case Intrinsic::r600_read_local_size_x:
10423 if (Subtarget->isAmdHsaOS())
10426 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10428 case Intrinsic::r600_read_local_size_y:
10429 if (Subtarget->isAmdHsaOS())
10432 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10434 case Intrinsic::r600_read_local_size_z:
10435 if (Subtarget->isAmdHsaOS())
10438 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10440 case Intrinsic::amdgcn_workgroup_id_x:
10441 return lowerWorkGroupId(DAG, *MFI, VT,
10445 case Intrinsic::amdgcn_workgroup_id_y:
10446 return lowerWorkGroupId(DAG, *MFI, VT,
10450 case Intrinsic::amdgcn_workgroup_id_z:
10451 return lowerWorkGroupId(DAG, *MFI, VT,
10455 case Intrinsic::amdgcn_cluster_id_x:
10456 return Subtarget->hasClusters()
10457 ? getPreloadedValue(DAG, *MFI, VT,
10459 : DAG.getPOISON(VT);
10460 case Intrinsic::amdgcn_cluster_id_y:
10461 return Subtarget->hasClusters()
10462 ? getPreloadedValue(DAG, *MFI, VT,
10465 case Intrinsic::amdgcn_cluster_id_z:
10466 return Subtarget->hasClusters()
10467 ? getPreloadedValue(DAG, *MFI, VT,
10470 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10471 return Subtarget->hasClusters()
10472 ? getPreloadedValue(
10476 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10477 return Subtarget->hasClusters()
10478 ? getPreloadedValue(
10482 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10483 return Subtarget->hasClusters()
10484 ? getPreloadedValue(
10488 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10489 return Subtarget->hasClusters()
10492 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10493 return Subtarget->hasClusters()
10494 ? getPreloadedValue(
10498 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10499 return Subtarget->hasClusters()
10500 ? getPreloadedValue(
10504 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10505 return Subtarget->hasClusters()
10506 ? getPreloadedValue(
10510 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10511 return Subtarget->hasClusters()
10512 ? getPreloadedValue(
10516 case Intrinsic::amdgcn_wave_id:
10517 return lowerWaveID(DAG,
Op);
10518 case Intrinsic::amdgcn_lds_kernel_id: {
10520 return getLDSKernelId(DAG,
DL);
10521 return getPreloadedValue(DAG, *MFI, VT,
10524 case Intrinsic::amdgcn_workitem_id_x:
10525 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10526 case Intrinsic::amdgcn_workitem_id_y:
10527 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10528 case Intrinsic::amdgcn_workitem_id_z:
10529 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10530 case Intrinsic::amdgcn_wavefrontsize:
10532 SDLoc(
Op), MVT::i32);
10533 case Intrinsic::amdgcn_s_buffer_load: {
10534 unsigned CPol =
Op.getConstantOperandVal(3);
10541 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10542 Op.getOperand(3), DAG);
10544 case Intrinsic::amdgcn_fdiv_fast:
10545 return lowerFDIV_FAST(
Op, DAG);
10546 case Intrinsic::amdgcn_sin:
10547 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10549 case Intrinsic::amdgcn_cos:
10550 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10552 case Intrinsic::amdgcn_mul_u24:
10553 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10555 case Intrinsic::amdgcn_mul_i24:
10556 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10559 case Intrinsic::amdgcn_log_clamp: {
10565 case Intrinsic::amdgcn_fract:
10566 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10568 case Intrinsic::amdgcn_class:
10569 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10571 case Intrinsic::amdgcn_div_fmas:
10572 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10573 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10575 case Intrinsic::amdgcn_div_fixup:
10576 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10577 Op.getOperand(2),
Op.getOperand(3));
10579 case Intrinsic::amdgcn_div_scale: {
10585 SDValue Denominator =
Op.getOperand(2);
10592 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10594 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10595 Denominator, Numerator);
10597 case Intrinsic::amdgcn_icmp: {
10599 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10600 Op.getConstantOperandVal(2) == 0 &&
10605 case Intrinsic::amdgcn_fcmp: {
10608 case Intrinsic::amdgcn_ballot:
10610 case Intrinsic::amdgcn_fmed3:
10611 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10612 Op.getOperand(2),
Op.getOperand(3));
10613 case Intrinsic::amdgcn_fdot2:
10614 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10615 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10616 case Intrinsic::amdgcn_fmul_legacy:
10617 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10619 case Intrinsic::amdgcn_sffbh:
10620 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10621 case Intrinsic::amdgcn_sbfe:
10622 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10623 Op.getOperand(2),
Op.getOperand(3));
10624 case Intrinsic::amdgcn_ubfe:
10625 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10626 Op.getOperand(2),
Op.getOperand(3));
10627 case Intrinsic::amdgcn_cvt_pkrtz:
10628 case Intrinsic::amdgcn_cvt_pknorm_i16:
10629 case Intrinsic::amdgcn_cvt_pknorm_u16:
10630 case Intrinsic::amdgcn_cvt_pk_i16:
10631 case Intrinsic::amdgcn_cvt_pk_u16: {
10633 EVT VT =
Op.getValueType();
10636 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10637 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10638 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10639 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10640 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10641 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10642 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10643 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10645 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10648 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10651 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10654 case Intrinsic::amdgcn_fmad_ftz:
10655 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10656 Op.getOperand(2),
Op.getOperand(3));
10658 case Intrinsic::amdgcn_if_break:
10660 Op->getOperand(1),
Op->getOperand(2)),
10663 case Intrinsic::amdgcn_groupstaticsize: {
10669 const GlobalValue *GV =
10675 case Intrinsic::amdgcn_is_shared:
10676 case Intrinsic::amdgcn_is_private: {
10683 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10687 Subtarget->hasGloballyAddressableScratch()) {
10690 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10691 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10700 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10703 case Intrinsic::amdgcn_perm:
10704 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10705 Op.getOperand(2),
Op.getOperand(3));
10706 case Intrinsic::amdgcn_reloc_constant: {
10716 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10717 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10718 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10719 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10720 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10721 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10722 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10723 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10724 if (
Op.getOperand(4).getValueType() == MVT::i32)
10730 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10731 Op.getOperand(3), IndexKeyi32);
10733 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10734 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10735 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10736 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10737 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10738 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10739 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10740 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10741 if (
Op.getOperand(4).getValueType() == MVT::i64)
10746 Op.getOperand(4).getValueType() == MVT::v2i32
10750 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10751 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10752 Op.getOperand(6)});
10754 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10755 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10756 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10757 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10758 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10759 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10760 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10763 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10768 Op.getOperand(6).getValueType().isVector()
10772 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10773 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10774 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10775 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10776 Args.push_back(
Op.getOperand(9));
10779 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10780 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10781 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10782 if (
Op.getOperand(6).getValueType() == MVT::i32)
10788 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10789 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10790 IndexKeyi32, Op.getOperand(7)});
10792 case Intrinsic::amdgcn_addrspacecast_nonnull:
10793 return lowerADDRSPACECAST(
Op, DAG);
10794 case Intrinsic::amdgcn_readlane:
10795 case Intrinsic::amdgcn_readfirstlane:
10796 case Intrinsic::amdgcn_writelane:
10797 case Intrinsic::amdgcn_permlane16:
10798 case Intrinsic::amdgcn_permlanex16:
10799 case Intrinsic::amdgcn_permlane64:
10800 case Intrinsic::amdgcn_set_inactive:
10801 case Intrinsic::amdgcn_set_inactive_chain_arg:
10802 case Intrinsic::amdgcn_mov_dpp8:
10803 case Intrinsic::amdgcn_update_dpp:
10805 case Intrinsic::amdgcn_dead: {
10807 for (
const EVT ValTy :
Op.getNode()->values())
10811 case Intrinsic::amdgcn_wave_shuffle:
10814 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10816 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10826 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
10827 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10833 unsigned NewOpcode)
const {
10837 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10838 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10856 M->getMemOperand());
10861 unsigned NewOpcode)
const {
10865 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10866 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10884 M->getMemOperand());
10889 unsigned IntrID =
Op.getConstantOperandVal(1);
10893 case Intrinsic::amdgcn_ds_ordered_add:
10894 case Intrinsic::amdgcn_ds_ordered_swap: {
10899 unsigned IndexOperand =
M->getConstantOperandVal(7);
10900 unsigned WaveRelease =
M->getConstantOperandVal(8);
10901 unsigned WaveDone =
M->getConstantOperandVal(9);
10903 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10904 IndexOperand &= ~0x3f;
10905 unsigned CountDw = 0;
10908 CountDw = (IndexOperand >> 24) & 0xf;
10909 IndexOperand &= ~(0xf << 24);
10911 if (CountDw < 1 || CountDw > 4) {
10914 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10915 DL.getDebugLoc()));
10920 if (IndexOperand) {
10923 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10926 if (WaveDone && !WaveRelease) {
10930 Fn,
"ds_ordered_count: wave_done requires wave_release",
10931 DL.getDebugLoc()));
10934 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10935 unsigned ShaderType =
10937 unsigned Offset0 = OrderedCountIndex << 2;
10938 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10941 Offset1 |= (CountDw - 1) << 6;
10944 Offset1 |= ShaderType << 2;
10946 unsigned Offset = Offset0 | (Offset1 << 8);
10953 M->getVTList(),
Ops,
M->getMemoryVT(),
10954 M->getMemOperand());
10956 case Intrinsic::amdgcn_raw_buffer_load:
10957 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10958 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10959 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10960 case Intrinsic::amdgcn_raw_buffer_load_format:
10961 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10962 const bool IsFormat =
10963 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10964 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10966 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10967 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10981 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10983 case Intrinsic::amdgcn_struct_buffer_load:
10984 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10985 case Intrinsic::amdgcn_struct_buffer_load_format:
10986 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10987 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10988 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10989 const bool IsFormat =
10990 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10991 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10993 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10994 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11009 case Intrinsic::amdgcn_raw_tbuffer_load:
11010 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11012 EVT LoadVT =
Op.getValueType();
11013 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11014 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
11030 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11032 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
11033 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
11036 case Intrinsic::amdgcn_struct_tbuffer_load:
11037 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11039 EVT LoadVT =
Op.getValueType();
11040 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11041 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11057 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11059 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
11060 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
11063 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11064 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11065 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11066 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11068 return lowerStructBufferAtomicIntrin(
Op, DAG,
11069 AMDGPUISD::BUFFER_ATOMIC_FADD);
11070 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11072 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11073 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11074 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11075 return lowerStructBufferAtomicIntrin(
Op, DAG,
11076 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11077 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11078 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11079 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11080 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11081 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11082 return lowerStructBufferAtomicIntrin(
Op, DAG,
11083 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11084 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11085 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11086 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11087 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11088 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11089 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11090 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11091 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11092 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11093 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11094 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11095 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11096 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11097 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11098 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11099 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11100 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11101 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11102 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11103 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11104 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11105 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11106 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11107 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11108 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11110 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11111 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11112 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11113 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11114 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11115 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11116 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11117 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11118 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11119 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11120 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11121 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11122 return lowerStructBufferAtomicIntrin(
Op, DAG,
11123 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11124 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11125 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11126 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11127 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11128 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11129 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11130 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11131 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11132 return lowerStructBufferAtomicIntrin(
Op, DAG,
11133 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11134 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11135 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11136 return lowerStructBufferAtomicIntrin(
Op, DAG,
11137 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11138 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11140 return lowerStructBufferAtomicIntrin(
Op, DAG,
11141 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11142 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11143 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11144 return lowerStructBufferAtomicIntrin(
Op, DAG,
11145 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11146 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11147 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11148 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11149 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11150 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11151 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11152 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11153 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11154 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11155 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11156 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11157 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11158 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11159 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11160 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11161 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11162 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11163 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11164 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11165 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11166 return lowerStructBufferAtomicIntrin(
Op, DAG,
11167 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11168 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11169 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11170 return lowerRawBufferAtomicIntrin(
Op, DAG,
11171 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11172 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11173 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11174 return lowerStructBufferAtomicIntrin(
Op, DAG,
11175 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11176 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11177 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11178 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
11179 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11193 EVT VT =
Op.getValueType();
11197 Op->getVTList(),
Ops, VT,
11198 M->getMemOperand());
11200 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11201 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11202 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
11203 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
11217 EVT VT =
Op.getValueType();
11221 Op->getVTList(),
Ops, VT,
11222 M->getMemOperand());
11224 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11225 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11227 SDValue NodePtr =
M->getOperand(2);
11228 SDValue RayExtent =
M->getOperand(3);
11229 SDValue InstanceMask =
M->getOperand(4);
11230 SDValue RayOrigin =
M->getOperand(5);
11231 SDValue RayDir =
M->getOperand(6);
11233 SDValue TDescr =
M->getOperand(8);
11238 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11243 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11244 const unsigned NumVDataDwords = 10;
11245 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11247 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11248 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11249 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11253 Ops.push_back(NodePtr);
11256 {DAG.getBitcast(MVT::i32, RayExtent),
11257 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11258 Ops.push_back(RayOrigin);
11259 Ops.push_back(RayDir);
11260 Ops.push_back(Offsets);
11261 Ops.push_back(TDescr);
11262 Ops.push_back(
M->getChain());
11265 MachineMemOperand *MemRef =
M->getMemOperand();
11269 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11271 SDValue NodePtr =
M->getOperand(2);
11272 SDValue RayExtent =
M->getOperand(3);
11273 SDValue RayOrigin =
M->getOperand(4);
11274 SDValue RayDir =
M->getOperand(5);
11275 SDValue RayInvDir =
M->getOperand(6);
11276 SDValue TDescr =
M->getOperand(7);
11283 if (!Subtarget->hasGFX10_AEncoding()) {
11293 const unsigned NumVDataDwords = 4;
11294 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11295 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11296 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11299 const unsigned BaseOpcodes[2][2] = {
11300 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11301 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11302 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11306 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11307 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11308 : AMDGPU::MIMGEncGfx10NSA,
11309 NumVDataDwords, NumVAddrDwords);
11313 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11314 : AMDGPU::MIMGEncGfx10Default,
11315 NumVDataDwords, NumVAddrDwords);
11321 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
11324 if (Lanes[0].getValueSizeInBits() == 32) {
11325 for (
unsigned I = 0;
I < 3; ++
I)
11332 Ops.push_back(Lanes[2]);
11344 if (UseNSA && IsGFX11Plus) {
11345 Ops.push_back(NodePtr);
11347 Ops.push_back(RayOrigin);
11352 for (
unsigned I = 0;
I < 3; ++
I) {
11355 {DirLanes[I], InvDirLanes[I]})));
11359 Ops.push_back(RayDir);
11360 Ops.push_back(RayInvDir);
11367 Ops.push_back(NodePtr);
11370 packLanes(RayOrigin,
true);
11371 packLanes(RayDir,
true);
11372 packLanes(RayInvDir,
false);
11377 if (NumVAddrDwords > 12) {
11385 Ops.push_back(MergedOps);
11388 Ops.push_back(TDescr);
11390 Ops.push_back(
M->getChain());
11393 MachineMemOperand *MemRef =
M->getMemOperand();
11397 case Intrinsic::amdgcn_global_atomic_fmin_num:
11398 case Intrinsic::amdgcn_global_atomic_fmax_num:
11399 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11400 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11407 unsigned Opcode = 0;
11409 case Intrinsic::amdgcn_global_atomic_fmin_num:
11410 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11414 case Intrinsic::amdgcn_global_atomic_fmax_num:
11415 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11422 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
11423 Ops,
M->getMemOperand());
11425 case Intrinsic::amdgcn_s_alloc_vgpr: {
11433 ReadFirstLaneID, NumVGPRs);
11436 Op.getOperand(0),
Op.getOperand(1), NumVGPRs);
11438 case Intrinsic::amdgcn_s_get_barrier_state:
11439 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11446 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11447 BarID = (BarID >> 4) & 0x3F;
11448 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11451 Ops.push_back(Chain);
11453 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11454 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11462 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11470 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11471 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11472 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11476 EVT VT =
Op->getValueType(0);
11480 case Intrinsic::amdgcn_flat_load_monitor_b32:
11481 case Intrinsic::amdgcn_flat_load_monitor_b64:
11482 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11487 Op->getVTList(), {Chain, Ptr},
11490 case Intrinsic::amdgcn_global_load_monitor_b32:
11491 case Intrinsic::amdgcn_global_load_monitor_b64:
11492 case Intrinsic::amdgcn_global_load_monitor_b128: {
11497 Op->getVTList(), {Chain, Ptr},
11502 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11504 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11512SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
11519 EVT VT = VTList.
VTs[0];
11522 bool IsTFE = VTList.
NumVTs == 3;
11525 unsigned NumOpDWords = NumValueDWords + 1;
11527 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
11528 MachineMemOperand *OpDWordsMMO =
11530 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
11531 OpDWordsVT, OpDWordsMMO, DAG);
11536 NumValueDWords == 1
11545 if (!Subtarget->hasDwordx3LoadStores() &&
11546 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11550 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
11552 WidenedMemVT, WidenedMMO);
11562 bool ImageStore)
const {
11572 if (Subtarget->hasUnpackedD16VMem()) {
11586 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11597 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
11603 if ((NumElements % 2) == 1) {
11605 unsigned I = Elts.
size() / 2;
11621 if (NumElements == 3) {
11640 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11641 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11642 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11643 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11644 case Intrinsic::amdgcn_load_async_to_lds:
11645 case Intrinsic::amdgcn_global_load_async_lds:
11655 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
11657 switch (IntrinsicID) {
11658 case Intrinsic::amdgcn_exp_compr: {
11659 if (!Subtarget->hasCompressedExport()) {
11662 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11684 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11688 case Intrinsic::amdgcn_struct_tbuffer_store:
11689 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11691 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11693 VData = handleD16VData(VData, DAG);
11694 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11695 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11709 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11710 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11713 M->getMemoryVT(),
M->getMemOperand());
11716 case Intrinsic::amdgcn_raw_tbuffer_store:
11717 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11719 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11721 VData = handleD16VData(VData, DAG);
11722 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11723 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11737 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11738 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11741 M->getMemoryVT(),
M->getMemOperand());
11744 case Intrinsic::amdgcn_raw_buffer_store:
11745 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11746 case Intrinsic::amdgcn_raw_buffer_store_format:
11747 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11748 const bool IsFormat =
11749 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11750 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11757 VData = handleD16VData(VData, DAG);
11767 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11768 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11782 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11783 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11788 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
11791 M->getMemoryVT(),
M->getMemOperand());
11794 case Intrinsic::amdgcn_struct_buffer_store:
11795 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11796 case Intrinsic::amdgcn_struct_buffer_store_format:
11797 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11798 const bool IsFormat =
11799 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11800 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11808 VData = handleD16VData(VData, DAG);
11818 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11819 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11833 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11834 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11838 EVT VDataType = VData.getValueType().getScalarType();
11840 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
11843 M->getMemoryVT(),
M->getMemOperand());
11845 case Intrinsic::amdgcn_raw_buffer_load_lds:
11846 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11847 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11848 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11849 case Intrinsic::amdgcn_struct_buffer_load_lds:
11850 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11851 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11852 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11853 if (!Subtarget->hasVMemToLDSLoad())
11857 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11858 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11859 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11860 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11861 unsigned OpOffset = HasVIndex ? 1 : 0;
11862 SDValue VOffset =
Op.getOperand(5 + OpOffset);
11864 unsigned Size =
Op->getConstantOperandVal(4);
11870 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11871 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11872 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11873 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11876 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11877 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11878 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11879 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11882 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11883 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11884 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11885 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11888 if (!Subtarget->hasLDSLoadB96_B128())
11890 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11891 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11892 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11893 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11896 if (!Subtarget->hasLDSLoadB96_B128())
11898 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11899 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11900 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11901 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11909 if (HasVIndex && HasVOffset)
11913 else if (HasVIndex)
11914 Ops.push_back(
Op.getOperand(5));
11915 else if (HasVOffset)
11916 Ops.push_back(VOffset);
11918 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11919 Ops.push_back(Rsrc);
11920 Ops.push_back(
Op.getOperand(6 + OpOffset));
11921 Ops.push_back(
Op.getOperand(7 + OpOffset));
11923 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
11946 case Intrinsic::amdgcn_load_to_lds:
11947 case Intrinsic::amdgcn_load_async_to_lds:
11948 case Intrinsic::amdgcn_global_load_lds:
11949 case Intrinsic::amdgcn_global_load_async_lds: {
11950 if (!Subtarget->hasVMemToLDSLoad())
11954 unsigned Size =
Op->getConstantOperandVal(4);
11959 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11962 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11965 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11968 if (!Subtarget->hasLDSLoadB96_B128())
11970 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11973 if (!Subtarget->hasLDSLoadB96_B128())
11975 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11991 if (
LHS->isDivergent())
11995 RHS.getOperand(0).getValueType() == MVT::i32) {
11998 VOffset =
RHS.getOperand(0);
12002 Ops.push_back(Addr);
12010 Ops.push_back(VOffset);
12013 Ops.push_back(
Op.getOperand(5));
12015 unsigned Aux =
Op.getConstantOperandVal(6);
12030 case Intrinsic::amdgcn_end_cf:
12032 Op->getOperand(2), Chain),
12034 case Intrinsic::amdgcn_s_barrier_init:
12035 case Intrinsic::amdgcn_s_barrier_signal_var: {
12042 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12043 ? AMDGPU::S_BARRIER_INIT_M0
12044 : AMDGPU::S_BARRIER_SIGNAL_M0;
12059 constexpr unsigned ShAmt = 16;
12066 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
12071 case Intrinsic::amdgcn_s_wakeup_barrier: {
12072 if (!Subtarget->hasSWakeupBarrier())
12076 case Intrinsic::amdgcn_s_barrier_join: {
12085 switch (IntrinsicID) {
12088 case Intrinsic::amdgcn_s_barrier_join:
12089 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12091 case Intrinsic::amdgcn_s_wakeup_barrier:
12092 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12096 unsigned BarID = (BarVal >> 4) & 0x3F;
12099 Ops.push_back(Chain);
12101 switch (IntrinsicID) {
12104 case Intrinsic::amdgcn_s_barrier_join:
12105 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12107 case Intrinsic::amdgcn_s_wakeup_barrier:
12108 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12119 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
12125 case Intrinsic::amdgcn_s_prefetch_data: {
12128 return Op.getOperand(0);
12131 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12133 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
12140 Op->getVTList(),
Ops,
M->getMemoryVT(),
12141 M->getMemOperand());
12143 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12144 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12145 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12154 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12156 return lowerImage(
Op, ImageDimIntr, DAG,
true);
12172 return PtrVT == MVT::i64;
12186std::pair<SDValue, SDValue>
12199 bool CheckNUW = Subtarget->hasGFX1250Insts();
12216 unsigned Overflow = ImmOffset & ~MaxImm;
12217 ImmOffset -= Overflow;
12218 if ((int32_t)Overflow < 0) {
12219 Overflow += ImmOffset;
12224 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
12243void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
12245 Align Alignment)
const {
12247 SDLoc
DL(CombinedOffset);
12249 uint32_t
Imm =
C->getZExtValue();
12250 uint32_t SOffset, ImmOffset;
12251 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12262 bool CheckNUW = Subtarget->hasGFX1250Insts();
12265 uint32_t SOffset, ImmOffset;
12268 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
12276 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12285SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
12288 return MaybePointer;
12302 SDValue NumRecords =
Op->getOperand(3);
12308 if (Subtarget->has45BitNumRecordsBufferResource()) {
12327 SDValue ExtShiftedStrideVec =
12339 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12341 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12346 auto [LowHalf, HighHalf] =
12347 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12357 NumRecords, Flags);
12369 bool IsTFE)
const {
12374 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12375 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12378 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
12390 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12391 : AMDGPUISD::BUFFER_LOAD_USHORT;
12393 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
12407 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12411 Ops[1] = BufferStoreExt;
12412 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12413 : AMDGPUISD::BUFFER_STORE_SHORT;
12416 M->getMemOperand());
12441 DAGCombinerInfo &DCI)
const {
12442 SelectionDAG &DAG = DCI.DAG;
12457 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
12464 "unexpected vector extload");
12477 "unexpected fp extload");
12495 DCI.AddToWorklist(Cvt.
getNode());
12500 DCI.AddToWorklist(Cvt.
getNode());
12511 if (Info.isEntryFunction())
12512 return Info.getUserSGPRInfo().hasFlatScratchInit();
12520 EVT MemVT =
Load->getMemoryVT();
12521 MachineMemOperand *MMO =
Load->getMemOperand();
12533 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12561 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12562 "Custom lowering for non-i32 vectors hasn't been implemented.");
12565 unsigned AS =
Load->getAddressSpace();
12566 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12573 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12577 !Subtarget->hasMultiDwordFlatScratchAddressing())
12587 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12590 Alignment >=
Align(4) && NumElements < 32) {
12592 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12604 if (NumElements > 4)
12607 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12617 switch (Subtarget->getMaxPrivateElementSize()) {
12623 if (NumElements > 2)
12628 if (NumElements > 4)
12631 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12640 auto Flags =
Load->getMemOperand()->getFlags();
12642 Load->getAlign(), Flags, &
Fast) &&
12651 MemVT, *
Load->getMemOperand())) {
12660 EVT VT =
Op.getValueType();
12697 EVT VT =
Op.getValueType();
12698 const SDNodeFlags
Flags =
Op->getFlags();
12700 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12706 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12709 if (CLHS->isExactlyValue(1.0)) {
12722 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12726 if (CLHS->isExactlyValue(-1.0)) {
12729 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12735 if (!AllowInaccurateRcp &&
12736 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12750 EVT VT =
Op.getValueType();
12751 const SDNodeFlags
Flags =
Op->getFlags();
12753 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12754 if (!AllowInaccurateDiv)
12775 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12785 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12789 return DAG.
getNode(Opcode, SL, VTList,
12798 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12808 Opcode = AMDGPUISD::FMA_W_CHAIN;
12812 return DAG.
getNode(Opcode, SL, VTList,
12818 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12819 return FastLowered;
12822 EVT VT =
Op.getValueType();
12829 if (VT == MVT::bf16) {
12852 unsigned FMADOpCode =
12856 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12859 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12861 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12862 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12872 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12878 SDNodeFlags
Flags =
Op->getFlags();
12888 const APFloat K0Val(0x1p+96f);
12891 const APFloat K1Val(0x1p-32f);
12918 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12919 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12920 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12925 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12926 return FastLowered;
12932 SDNodeFlags
Flags =
Op->getFlags();
12933 Flags.setNoFPExcept(
true);
12941 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12950 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12954 using namespace AMDGPU::Hwreg;
12955 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12959 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12960 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12963 const bool HasDynamicDenormals =
12969 if (!PreservesDenormals) {
12974 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12977 if (HasDynamicDenormals) {
12981 SavedDenormMode =
SDValue(GetReg, 0);
12987 SDNode *EnableDenorm;
12988 if (Subtarget->hasDenormModeInst()) {
12989 const SDValue EnableDenormValue =
12992 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12996 const SDValue EnableDenormValue =
12998 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12999 {EnableDenormValue,
BitField, Glue});
13009 ApproxRcp, One, NegDivScale0, Flags);
13012 ApproxRcp, Fma0, Flags);
13018 NumeratorScaled,
Mul, Flags);
13024 NumeratorScaled, Fma3, Flags);
13026 if (!PreservesDenormals) {
13027 SDNode *DisableDenorm;
13028 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13032 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
13034 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13038 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
13039 const SDValue DisableDenormValue =
13040 HasDynamicDenormals
13045 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13056 {Fma4, Fma1, Fma3, Scale},
Flags);
13058 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
13062 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
13063 return FastLowered;
13071 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
13077 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13095 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13125 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
13127 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
13131 EVT VT =
Op.getValueType();
13133 if (VT == MVT::f32)
13134 return LowerFDIV32(
Op, DAG);
13136 if (VT == MVT::f64)
13137 return LowerFDIV64(
Op, DAG);
13139 if (VT == MVT::f16 || VT == MVT::bf16)
13140 return LowerFDIV16(
Op, DAG);
13149 EVT ResultExpVT =
Op->getValueType(1);
13150 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13160 if (Subtarget->hasFractBug()) {
13178 EVT VT =
Store->getMemoryVT();
13180 if (VT == MVT::i1) {
13184 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
13188 Store->getValue().getValueType().getScalarType() == MVT::i32);
13190 unsigned AS =
Store->getAddressSpace();
13191 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13199 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
13203 !Subtarget->hasMultiDwordFlatScratchAddressing())
13210 if (NumElements > 4)
13213 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13217 VT, *
Store->getMemOperand()))
13223 switch (Subtarget->getMaxPrivateElementSize()) {
13227 if (NumElements > 2)
13231 if (NumElements > 4 ||
13232 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13240 auto Flags =
Store->getMemOperand()->getFlags();
13259 assert(!Subtarget->has16BitInsts());
13260 SDNodeFlags
Flags =
Op->getFlags();
13274 SDNodeFlags
Flags =
Op->getFlags();
13275 MVT VT =
Op.getValueType().getSimpleVT();
13383 SDNodeFlags
Flags =
Op->getFlags();
13446 EVT VT =
Op.getValueType();
13457 if (!
V.getValueType().isVector())
13465 if (Subtarget->hasTrigReducedRange()) {
13467 TrigVal = UnrollIfVec(DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags));
13472 switch (
Op.getOpcode()) {
13474 TrigVal = DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
13477 TrigVal = DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
13483 return UnrollIfVec(TrigVal);
13503 EVT VT =
Op.getValueType();
13511 Op->getVTList(),
Ops, VT,
13520SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13521 DAGCombinerInfo &DCI)
const {
13522 EVT VT =
N->getValueType(0);
13524 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13527 SelectionDAG &DAG = DCI.DAG;
13531 EVT SrcVT = Src.getValueType();
13537 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13540 DCI.AddToWorklist(Cvt.
getNode());
13543 if (ScalarVT != MVT::f32) {
13555 DAGCombinerInfo &DCI)
const {
13566 SelectionDAG &DAG = DCI.DAG;
13585 for (
unsigned I = 0;
I != NumElts; ++
I) {
13609 if (NewElts.
size() == 1)
13631 for (
unsigned I = 0;
I != NumElts; ++
I) {
13666SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13668 DAGCombinerInfo &DCI)
const {
13685 SelectionDAG &DAG = DCI.DAG;
13698 AM.BaseOffs =
Offset.getSExtValue();
13703 EVT VT =
N->getValueType(0);
13709 Flags.setNoUnsignedWrap(
13710 N->getFlags().hasNoUnsignedWrap() &&
13722 switch (
N->getOpcode()) {
13733 DAGCombinerInfo &DCI)
const {
13734 SelectionDAG &DAG = DCI.DAG;
13741 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13742 N->getMemoryVT(), DCI);
13746 NewOps[PtrIdx] = NewPtr;
13755 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13756 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13765SDValue SITargetLowering::splitBinaryBitConstantOp(
13769 uint32_t ValLo =
Lo_32(Val);
13770 uint32_t ValHi =
Hi_32(Val);
13777 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13791 if (V.getValueType() != MVT::i1)
13793 switch (V.getOpcode()) {
13798 case AMDGPUISD::FP_CLASS:
13810 return V.getResNo() == 1;
13812 unsigned IntrinsicID = V.getConstantOperandVal(0);
13813 switch (IntrinsicID) {
13814 case Intrinsic::amdgcn_is_shared:
13815 case Intrinsic::amdgcn_is_private:
13832 if (!(
C & 0x000000ff))
13833 ZeroByteMask |= 0x000000ff;
13834 if (!(
C & 0x0000ff00))
13835 ZeroByteMask |= 0x0000ff00;
13836 if (!(
C & 0x00ff0000))
13837 ZeroByteMask |= 0x00ff0000;
13838 if (!(
C & 0xff000000))
13839 ZeroByteMask |= 0xff000000;
13840 uint32_t NonZeroByteMask = ~ZeroByteMask;
13841 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13854 assert(V.getValueSizeInBits() == 32);
13856 if (V.getNumOperands() != 2)
13865 switch (V.getOpcode()) {
13870 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13875 return (0x03020100 & ~ConstMask) | ConstMask;
13882 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13888 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13895 DAGCombinerInfo &DCI)
const {
13896 if (DCI.isBeforeLegalize())
13899 SelectionDAG &DAG = DCI.DAG;
13900 EVT VT =
N->getValueType(0);
13905 if (VT == MVT::i64 && CRHS) {
13907 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13911 if (CRHS && VT == MVT::i32) {
13921 unsigned Shift = CShift->getZExtValue();
13923 unsigned Offset = NB + Shift;
13924 if ((
Offset & (Bits - 1)) == 0) {
13927 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13948 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13950 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13963 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13968 if (
X !=
LHS.getOperand(1))
13972 const ConstantFPSDNode *C1 =
13989 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13995 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13998 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14006 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
14007 LHS.getOperand(0) ==
LHS.getOperand(1))) {
14009 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
14010 :
Mask->getZExtValue() & OrdMask;
14013 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
14031 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14034 if (LHSMask != ~0u && RHSMask != ~0u) {
14037 if (LHSMask > RHSMask) {
14044 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14045 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14048 if (!(LHSUsedLanes & RHSUsedLanes) &&
14051 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14057 uint32_t
Mask = LHSMask & RHSMask;
14058 for (
unsigned I = 0;
I < 32;
I += 8) {
14059 uint32_t ByteSel = 0xff <<
I;
14060 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14061 Mask &= (0x0c <<
I) & 0xffffffff;
14066 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
14069 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14119static const std::optional<ByteProvider<SDValue>>
14121 unsigned Depth = 0) {
14124 return std::nullopt;
14126 if (
Op.getValueSizeInBits() < 8)
14127 return std::nullopt;
14129 if (
Op.getValueType().isVector())
14132 switch (
Op->getOpcode()) {
14145 NarrowVT = VTSign->getVT();
14148 return std::nullopt;
14151 if (SrcIndex >= NarrowByteWidth)
14152 return std::nullopt;
14160 return std::nullopt;
14162 uint64_t BitShift = ShiftOp->getZExtValue();
14164 if (BitShift % 8 != 0)
14165 return std::nullopt;
14167 SrcIndex += BitShift / 8;
14185static const std::optional<ByteProvider<SDValue>>
14187 unsigned StartingIndex = 0) {
14191 return std::nullopt;
14193 unsigned BitWidth =
Op.getScalarValueSizeInBits();
14195 return std::nullopt;
14197 return std::nullopt;
14199 bool IsVec =
Op.getValueType().isVector();
14200 switch (
Op.getOpcode()) {
14203 return std::nullopt;
14208 return std::nullopt;
14212 return std::nullopt;
14215 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
14216 return std::nullopt;
14217 if (!
LHS ||
LHS->isConstantZero())
14219 if (!
RHS ||
RHS->isConstantZero())
14221 return std::nullopt;
14226 return std::nullopt;
14230 return std::nullopt;
14232 uint32_t BitMask = BitMaskOp->getZExtValue();
14234 uint32_t IndexMask = 0xFF << (Index * 8);
14236 if ((IndexMask & BitMask) != IndexMask) {
14239 if (IndexMask & BitMask)
14240 return std::nullopt;
14249 return std::nullopt;
14253 if (!ShiftOp ||
Op.getValueType().isVector())
14254 return std::nullopt;
14256 uint64_t BitsProvided =
Op.getValueSizeInBits();
14257 if (BitsProvided % 8 != 0)
14258 return std::nullopt;
14260 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14262 return std::nullopt;
14264 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14265 uint64_t ByteShift = BitShift / 8;
14267 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14268 uint64_t BytesProvided = BitsProvided / 8;
14269 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14270 NewIndex %= BytesProvided;
14277 return std::nullopt;
14281 return std::nullopt;
14283 uint64_t BitShift = ShiftOp->getZExtValue();
14285 return std::nullopt;
14287 auto BitsProvided =
Op.getScalarValueSizeInBits();
14288 if (BitsProvided % 8 != 0)
14289 return std::nullopt;
14291 uint64_t BytesProvided = BitsProvided / 8;
14292 uint64_t ByteShift = BitShift / 8;
14297 return BytesProvided - ByteShift > Index
14305 return std::nullopt;
14309 return std::nullopt;
14311 uint64_t BitShift = ShiftOp->getZExtValue();
14312 if (BitShift % 8 != 0)
14313 return std::nullopt;
14314 uint64_t ByteShift = BitShift / 8;
14320 return Index < ByteShift
14323 Depth + 1, StartingIndex);
14332 return std::nullopt;
14340 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14342 if (NarrowBitWidth % 8 != 0)
14343 return std::nullopt;
14344 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14346 if (Index >= NarrowByteWidth)
14348 ? std::optional<ByteProvider<SDValue>>(
14356 return std::nullopt;
14360 if (NarrowByteWidth >= Index) {
14365 return std::nullopt;
14372 return std::nullopt;
14378 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14379 if (NarrowBitWidth % 8 != 0)
14380 return std::nullopt;
14381 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14386 if (Index >= NarrowByteWidth) {
14388 ? std::optional<ByteProvider<SDValue>>(
14393 if (NarrowByteWidth > Index) {
14397 return std::nullopt;
14402 return std::nullopt;
14405 Depth + 1, StartingIndex);
14411 return std::nullopt;
14412 auto VecIdx = IdxOp->getZExtValue();
14413 auto ScalarSize =
Op.getScalarValueSizeInBits();
14414 if (ScalarSize < 32)
14415 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14417 StartingIndex, Index);
14420 case AMDGPUISD::PERM: {
14422 return std::nullopt;
14426 return std::nullopt;
14429 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14430 if (IdxMask > 0x07 && IdxMask != 0x0c)
14431 return std::nullopt;
14433 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14434 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14436 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
14442 return std::nullopt;
14457 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
14464 auto MemVT = L->getMemoryVT();
14467 return L->getMemoryVT().getSizeInBits() == 16;
14477 int Low8 = Mask & 0xff;
14478 int Hi8 = (Mask & 0xff00) >> 8;
14480 assert(Low8 < 8 && Hi8 < 8);
14482 bool IsConsecutive = (Hi8 - Low8 == 1);
14487 bool Is16Aligned = !(Low8 % 2);
14489 return IsConsecutive && Is16Aligned;
14497 int Low16 = PermMask & 0xffff;
14498 int Hi16 = (PermMask & 0xffff0000) >> 16;
14508 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14510 if (!OtherOpIs16Bit)
14518 unsigned DWordOffset) {
14523 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14528 if (Src.getValueType().isVector()) {
14529 auto ScalarTySize = Src.getScalarValueSizeInBits();
14530 auto ScalarTy = Src.getValueType().getScalarType();
14531 if (ScalarTySize == 32) {
14535 if (ScalarTySize > 32) {
14538 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14539 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14546 assert(ScalarTySize < 32);
14547 auto NumElements =
TypeSize / ScalarTySize;
14548 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14549 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14550 auto NumElementsIn32 = 32 / ScalarTySize;
14551 auto NumAvailElements = DWordOffset < Trunc32Elements
14553 : NumElements - NormalizedTrunc;
14566 auto ShiftVal = 32 * DWordOffset;
14574 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14579 for (
int i = 0; i < 4; i++) {
14581 std::optional<ByteProvider<SDValue>>
P =
14584 if (!
P ||
P->isConstantZero())
14589 if (PermNodes.
size() != 4)
14592 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14593 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14595 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14596 auto PermOp = PermNodes[i];
14599 int SrcByteAdjust = 4;
14603 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14604 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14606 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14607 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14611 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14612 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14615 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14617 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14620 SDValue Op = *PermNodes[FirstSrc.first].Src;
14622 assert(
Op.getValueSizeInBits() == 32);
14626 int Low16 = PermMask & 0xffff;
14627 int Hi16 = (PermMask & 0xffff0000) >> 16;
14629 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14630 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14633 if (WellFormedLow && WellFormedHi)
14637 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14646 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14647 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14652 assert(
Op.getValueType().isByteSized() &&
14663 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14670 DAGCombinerInfo &DCI)
const {
14671 SelectionDAG &DAG = DCI.DAG;
14675 EVT VT =
N->getValueType(0);
14676 if (VT == MVT::i1) {
14678 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14679 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14681 if (Src !=
RHS.getOperand(0))
14686 if (!CLHS || !CRHS)
14690 static const uint32_t MaxMask = 0x3ff;
14695 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14704 LHS.getOpcode() == AMDGPUISD::PERM &&
14710 Sel |=
LHS.getConstantOperandVal(2);
14712 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14719 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14723 auto usesCombinedOperand = [](SDNode *OrUse) {
14726 !OrUse->getValueType(0).isVector())
14730 for (
auto *VUser : OrUse->users()) {
14731 if (!VUser->getValueType(0).isVector())
14738 if (VUser->getOpcode() == VectorwiseOp)
14744 if (!
any_of(
N->users(), usesCombinedOperand))
14750 if (LHSMask != ~0u && RHSMask != ~0u) {
14753 if (LHSMask > RHSMask) {
14760 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14761 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14764 if (!(LHSUsedLanes & RHSUsedLanes) &&
14767 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14769 LHSMask &= ~RHSUsedLanes;
14770 RHSMask &= ~LHSUsedLanes;
14772 LHSMask |= LHSUsedLanes & 0x04040404;
14774 uint32_t Sel = LHSMask | RHSMask;
14777 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14782 if (LHSMask == ~0u || RHSMask == ~0u) {
14823 return IdentitySrc;
14829 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14844 if (SrcVT == MVT::i32) {
14849 DCI.AddToWorklist(LowOr.
getNode());
14850 DCI.AddToWorklist(HiBits.getNode());
14861 N->getOperand(0), CRHS))
14869 DAGCombinerInfo &DCI)
const {
14870 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14877 SelectionDAG &DAG = DCI.DAG;
14879 EVT VT =
N->getValueType(0);
14880 if (CRHS && VT == MVT::i64) {
14882 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14889 unsigned Opc =
LHS.getOpcode();
14919 LHS->getOperand(0), FNegLHS, FNegRHS);
14928SITargetLowering::performZeroOrAnyExtendCombine(
SDNode *
N,
14929 DAGCombinerInfo &DCI)
const {
14930 if (!Subtarget->has16BitInsts() ||
14934 EVT VT =
N->getValueType(0);
14935 if (VT != MVT::i32)
14939 if (Src.getValueType() != MVT::i16)
14942 if (!Src->hasOneUse())
14949 std::optional<ByteProvider<SDValue>> BP0 =
14951 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14955 std::optional<ByteProvider<SDValue>> BP1 =
14957 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14965 SelectionDAG &DAG = DCI.DAG;
14967 uint32_t PermMask = 0x0c0c0c0c;
14970 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14975 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14978 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32, V0, V1,
14983SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14984 DAGCombinerInfo &DCI)
const {
14990 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14991 VTSign->getVT() == MVT::i8) ||
14992 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14993 VTSign->getVT() == MVT::i16))) {
14994 assert(Subtarget->hasScalarSubwordLoads() &&
14995 "s_buffer_load_{u8, i8} are supported "
14996 "in GFX12 (or newer) architectures.");
14997 EVT VT = Src.getValueType();
14998 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14999 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15000 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15002 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15009 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15010 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
15014 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15015 VTSign->getVT() == MVT::i8) ||
15016 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15017 VTSign->getVT() == MVT::i16)) &&
15026 Src.getOperand(6), Src.getOperand(7)};
15029 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15030 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15031 ? AMDGPUISD::BUFFER_LOAD_BYTE
15032 : AMDGPUISD::BUFFER_LOAD_SHORT;
15033 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15034 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
15035 return DCI.DAG.getMergeValues(
15036 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
15042 DAGCombinerInfo &DCI)
const {
15043 SelectionDAG &DAG = DCI.DAG;
15050 if (
N->getOperand(0).isUndef())
15057 DAGCombinerInfo &DCI)
const {
15058 EVT VT =
N->getValueType(0);
15068 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
15075 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
15084 unsigned MaxDepth)
const {
15085 unsigned Opcode =
Op.getOpcode();
15090 const auto &
F = CFP->getValueAPF();
15091 if (
F.isNaN() &&
F.isSignaling())
15093 if (!
F.isDenormal())
15125 case AMDGPUISD::FMUL_LEGACY:
15126 case AMDGPUISD::FMAD_FTZ:
15127 case AMDGPUISD::RCP:
15128 case AMDGPUISD::RSQ:
15129 case AMDGPUISD::RSQ_CLAMP:
15130 case AMDGPUISD::RCP_LEGACY:
15131 case AMDGPUISD::RCP_IFLAG:
15132 case AMDGPUISD::LOG:
15133 case AMDGPUISD::EXP:
15134 case AMDGPUISD::DIV_SCALE:
15135 case AMDGPUISD::DIV_FMAS:
15136 case AMDGPUISD::DIV_FIXUP:
15137 case AMDGPUISD::FRACT:
15138 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15139 case AMDGPUISD::CVT_F32_UBYTE0:
15140 case AMDGPUISD::CVT_F32_UBYTE1:
15141 case AMDGPUISD::CVT_F32_UBYTE2:
15142 case AMDGPUISD::CVT_F32_UBYTE3:
15143 case AMDGPUISD::FP_TO_FP16:
15144 case AMDGPUISD::SIN_HW:
15145 case AMDGPUISD::COS_HW:
15156 if (
Op.getValueType() == MVT::i32) {
15162 if (RHS->getZExtValue() == 0xffff0000) {
15172 return Op.getValueType().getScalarType() != MVT::f16;
15182 case AMDGPUISD::CLAMP:
15183 case AMDGPUISD::FMED3:
15184 case AMDGPUISD::FMAX3:
15185 case AMDGPUISD::FMIN3:
15186 case AMDGPUISD::FMAXIMUM3:
15187 case AMDGPUISD::FMINIMUM3: {
15193 if (Subtarget->supportsMinMaxDenormModes() ||
15203 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
15215 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
15242 if (
Op.getValueType() == MVT::i16) {
15253 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
15255 switch (IntrinsicID) {
15256 case Intrinsic::amdgcn_cvt_pkrtz:
15257 case Intrinsic::amdgcn_cubeid:
15258 case Intrinsic::amdgcn_frexp_mant:
15259 case Intrinsic::amdgcn_fdot2:
15260 case Intrinsic::amdgcn_rcp:
15261 case Intrinsic::amdgcn_rsq:
15262 case Intrinsic::amdgcn_rsq_clamp:
15263 case Intrinsic::amdgcn_rcp_legacy:
15264 case Intrinsic::amdgcn_rsq_legacy:
15265 case Intrinsic::amdgcn_trig_preop:
15266 case Intrinsic::amdgcn_tanh:
15267 case Intrinsic::amdgcn_log:
15268 case Intrinsic::amdgcn_exp2:
15269 case Intrinsic::amdgcn_sqrt:
15287 unsigned MaxDepth)
const {
15290 unsigned Opcode =
MI->getOpcode();
15292 if (Opcode == AMDGPU::G_FCANONICALIZE)
15295 std::optional<FPValueAndVReg> FCR;
15298 if (FCR->Value.isSignaling())
15300 if (!FCR->Value.isDenormal())
15311 case AMDGPU::G_FADD:
15312 case AMDGPU::G_FSUB:
15313 case AMDGPU::G_FMUL:
15314 case AMDGPU::G_FCEIL:
15315 case AMDGPU::G_FFLOOR:
15316 case AMDGPU::G_FRINT:
15317 case AMDGPU::G_FNEARBYINT:
15318 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15319 case AMDGPU::G_INTRINSIC_TRUNC:
15320 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15321 case AMDGPU::G_FMA:
15322 case AMDGPU::G_FMAD:
15323 case AMDGPU::G_FSQRT:
15324 case AMDGPU::G_FDIV:
15325 case AMDGPU::G_FREM:
15326 case AMDGPU::G_FPOW:
15327 case AMDGPU::G_FPEXT:
15328 case AMDGPU::G_FLOG:
15329 case AMDGPU::G_FLOG2:
15330 case AMDGPU::G_FLOG10:
15331 case AMDGPU::G_FPTRUNC:
15332 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15333 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15334 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15335 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15336 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15338 case AMDGPU::G_FNEG:
15339 case AMDGPU::G_FABS:
15340 case AMDGPU::G_FCOPYSIGN:
15342 case AMDGPU::G_FMINNUM:
15343 case AMDGPU::G_FMAXNUM:
15344 case AMDGPU::G_FMINNUM_IEEE:
15345 case AMDGPU::G_FMAXNUM_IEEE:
15346 case AMDGPU::G_FMINIMUM:
15347 case AMDGPU::G_FMAXIMUM:
15348 case AMDGPU::G_FMINIMUMNUM:
15349 case AMDGPU::G_FMAXIMUMNUM: {
15350 if (Subtarget->supportsMinMaxDenormModes() ||
15357 case AMDGPU::G_BUILD_VECTOR:
15362 case AMDGPU::G_INTRINSIC:
15363 case AMDGPU::G_INTRINSIC_CONVERGENT:
15365 case Intrinsic::amdgcn_fmul_legacy:
15366 case Intrinsic::amdgcn_fmad_ftz:
15367 case Intrinsic::amdgcn_sqrt:
15368 case Intrinsic::amdgcn_fmed3:
15369 case Intrinsic::amdgcn_sin:
15370 case Intrinsic::amdgcn_cos:
15371 case Intrinsic::amdgcn_log:
15372 case Intrinsic::amdgcn_exp2:
15373 case Intrinsic::amdgcn_log_clamp:
15374 case Intrinsic::amdgcn_rcp:
15375 case Intrinsic::amdgcn_rcp_legacy:
15376 case Intrinsic::amdgcn_rsq:
15377 case Intrinsic::amdgcn_rsq_clamp:
15378 case Intrinsic::amdgcn_rsq_legacy:
15379 case Intrinsic::amdgcn_div_scale:
15380 case Intrinsic::amdgcn_div_fmas:
15381 case Intrinsic::amdgcn_div_fixup:
15382 case Intrinsic::amdgcn_fract:
15383 case Intrinsic::amdgcn_cvt_pkrtz:
15384 case Intrinsic::amdgcn_cubeid:
15385 case Intrinsic::amdgcn_cubema:
15386 case Intrinsic::amdgcn_cubesc:
15387 case Intrinsic::amdgcn_cubetc:
15388 case Intrinsic::amdgcn_frexp_mant:
15389 case Intrinsic::amdgcn_fdot2:
15390 case Intrinsic::amdgcn_trig_preop:
15391 case Intrinsic::amdgcn_tanh:
15410 if (
C.isDenormal()) {
15424 if (
C.isSignaling()) {
15447SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
15448 DAGCombinerInfo &DCI)
const {
15449 SelectionDAG &DAG = DCI.DAG;
15451 EVT VT =
N->getValueType(0);
15460 EVT VT =
N->getValueType(0);
15461 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
15477 EVT EltVT =
Lo.getValueType();
15480 for (
unsigned I = 0;
I != 2; ++
I) {
15484 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15485 }
else if (
Op.isUndef()) {
15521 return AMDGPUISD::FMAX3;
15523 return AMDGPUISD::FMAXIMUM3;
15525 return AMDGPUISD::SMAX3;
15527 return AMDGPUISD::UMAX3;
15531 return AMDGPUISD::FMIN3;
15533 return AMDGPUISD::FMINIMUM3;
15535 return AMDGPUISD::SMIN3;
15537 return AMDGPUISD::UMIN3;
15558 if (!MinK || !MaxK)
15570 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15571 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15572 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15596 bool IsKnownNoNaNs)
const {
15632 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15638 if (
Info->getMode().DX10Clamp) {
15647 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15679 case AMDGPUISD::FMIN_LEGACY:
15680 case AMDGPUISD::FMAX_LEGACY:
15681 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15682 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15685 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15686 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15687 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15692 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15701 DAGCombinerInfo &DCI)
const {
15702 SelectionDAG &DAG = DCI.DAG;
15734 if (
SDValue Med3 = performIntMed3ImmCombine(
15739 if (
SDValue Med3 = performIntMed3ImmCombine(
15745 if (
SDValue Med3 = performIntMed3ImmCombine(
15750 if (
SDValue Med3 = performIntMed3ImmCombine(
15763 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15764 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15765 (VT == MVT::f32 || VT == MVT::f64 ||
15766 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15767 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15768 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15769 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15771 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1,
15772 N->getFlags().hasNoNaNs()))
15779 const SDNodeFlags
Flags =
N->getFlags();
15781 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15785 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15795 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15796 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15805 DAGCombinerInfo &DCI)
const {
15806 EVT VT =
N->getValueType(0);
15810 SelectionDAG &DAG = DCI.DAG;
15821 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15825 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15829 if (
Info->getMode().DX10Clamp) {
15842 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15849 DAGCombinerInfo &DCI)
const {
15853 return DCI.DAG.getUNDEF(
N->getValueType(0));
15861 bool IsDivergentIdx,
15866 unsigned VecSize = EltSize * NumElem;
15869 if (VecSize <= 64 && EltSize < 32)
15878 if (IsDivergentIdx)
15882 unsigned NumInsts = NumElem +
15883 ((EltSize + 31) / 32) * NumElem ;
15887 if (Subtarget->useVGPRIndexMode())
15888 return NumInsts <= 16;
15892 if (Subtarget->hasMovrel())
15893 return NumInsts <= 15;
15899 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15914SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15915 DAGCombinerInfo &DCI)
const {
15921 EVT ResVT =
N->getValueType(0);
15945 if (!
C ||
C->getZExtValue() != 0x1f)
15961 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15989 DCI.AddToWorklist(Elt0.
getNode());
15990 DCI.AddToWorklist(Elt1.
getNode());
16021 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16022 uint64_t KImmValue = KImm->getZExtValue();
16024 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16027 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16028 uint64_t KFPImmValue =
16029 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16030 return DAG.
getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16036 if (!DCI.isBeforeLegalize())
16043 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16046 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16047 unsigned EltIdx = BitIndex / 32;
16048 unsigned LeftoverBitIdx = BitIndex % 32;
16052 DCI.AddToWorklist(Cast.
getNode());
16056 DCI.AddToWorklist(Elt.
getNode());
16059 DCI.AddToWorklist(Srl.
getNode());
16063 DCI.AddToWorklist(Trunc.
getNode());
16065 if (VecEltVT == ResVT) {
16077SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
16078 DAGCombinerInfo &DCI)
const {
16089 SelectionDAG &DAG = DCI.DAG;
16109 Src.getOperand(0).getValueType() == MVT::f16) {
16110 return Src.getOperand(0);
16114 APFloat Val = CFP->getValueAPF();
16115 bool LosesInfo =
true;
16125 DAGCombinerInfo &DCI)
const {
16126 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16127 "combine only useful on gfx8");
16129 SDValue TruncSrc =
N->getOperand(0);
16130 EVT VT =
N->getValueType(0);
16131 if (VT != MVT::f16)
16134 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
16138 SelectionDAG &DAG = DCI.DAG;
16169unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
16171 const SDNode *N1)
const {
16176 if (((VT == MVT::f32 &&
16178 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16198 EVT VT =
N->getValueType(0);
16199 if (VT != MVT::i32 && VT != MVT::i64)
16205 unsigned Opc =
N->getOpcode();
16260 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
16279 DAGCombinerInfo &DCI)
const {
16282 SelectionDAG &DAG = DCI.DAG;
16283 EVT VT =
N->getValueType(0);
16293 if (!
N->isDivergent() && Subtarget->hasSMulHi())
16297 if (NumBits <= 32 || NumBits > 64)
16308 if (!Subtarget->hasFullRate64Ops()) {
16309 unsigned NumUsers = 0;
16310 for (SDNode *User :
LHS->
users()) {
16313 if (!
User->isAnyAdd())
16337 bool MulSignedLo =
false;
16338 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16347 if (VT != MVT::i64) {
16370 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16372 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16373 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16375 if (!MulLHSUnsigned32) {
16382 if (!MulRHSUnsigned32) {
16393 if (VT != MVT::i64)
16399SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
16400 DAGCombinerInfo &DCI)
const {
16410 SelectionDAG &DAG = DCI.DAG;
16425 unsigned Opcode =
N->getOpcode();
16429 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
16440static std::optional<ByteProvider<SDValue>>
16443 if (!Byte0 || Byte0->isConstantZero()) {
16444 return std::nullopt;
16447 if (Byte1 && !Byte1->isConstantZero()) {
16448 return std::nullopt;
16454 unsigned FirstCs =
First & 0x0c0c0c0c;
16455 unsigned SecondCs = Second & 0x0c0c0c0c;
16456 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
16457 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16459 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16460 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16461 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16462 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16464 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16488 for (
int BPI = 0; BPI < 2; BPI++) {
16491 BPP = {Src1, Src0};
16493 unsigned ZeroMask = 0x0c0c0c0c;
16494 unsigned FMask = 0xFF << (8 * (3 - Step));
16496 unsigned FirstMask =
16497 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16498 unsigned SecondMask =
16499 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16503 int FirstGroup = -1;
16504 for (
int I = 0;
I < 2;
I++) {
16506 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
16507 return IterElt.SrcOp == *BPP.first.Src &&
16508 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16512 if (Match != Srcs.
end()) {
16513 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
16518 if (FirstGroup != -1) {
16520 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
16521 return IterElt.SrcOp == *BPP.second.Src &&
16522 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16525 if (Match != Srcs.
end()) {
16526 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
16528 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16536 unsigned ZeroMask = 0x0c0c0c0c;
16537 unsigned FMask = 0xFF << (8 * (3 - Step));
16541 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16545 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16554 if (Srcs.
size() == 1) {
16555 auto *Elt = Srcs.
begin();
16559 if (Elt->PermMask == 0x3020100)
16562 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16566 auto *FirstElt = Srcs.
begin();
16567 auto *SecondElt = std::next(FirstElt);
16574 auto FirstMask = FirstElt->PermMask;
16575 auto SecondMask = SecondElt->PermMask;
16577 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16578 unsigned FirstPlusFour = FirstMask | 0x04040404;
16581 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16593 FirstElt = std::next(SecondElt);
16594 if (FirstElt == Srcs.
end())
16597 SecondElt = std::next(FirstElt);
16600 if (SecondElt == Srcs.
end()) {
16605 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16606 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16612 return Perms.
size() == 2
16618 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16619 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16620 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16621 EntryMask += ZeroMask;
16626 auto Opcode =
Op.getOpcode();
16628 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16629 Opcode == AMDGPUISD::MUL_I24);
16632static std::optional<bool>
16643 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16646 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16648 assert(!(S0IsUnsigned && S0IsSigned));
16649 assert(!(S1IsUnsigned && S1IsSigned));
16657 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16663 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16664 return std::nullopt;
16676 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16677 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16682 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16688 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16689 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16690 return std::nullopt;
16696 DAGCombinerInfo &DCI)
const {
16697 SelectionDAG &DAG = DCI.DAG;
16698 EVT VT =
N->getValueType(0);
16704 if (Subtarget->hasMad64_32()) {
16705 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16710 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16714 if (VT == MVT::i64) {
16715 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16720 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16722 std::optional<bool> IsSigned;
16728 int ChainLength = 0;
16729 for (
int I = 0;
I < 4;
I++) {
16733 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16736 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16741 TempNode->getOperand(MulIdx), *Src0, *Src1,
16742 TempNode->getOperand(MulIdx)->getOperand(0),
16743 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16747 IsSigned = *IterIsSigned;
16748 if (*IterIsSigned != *IsSigned)
16751 auto AddIdx = 1 - MulIdx;
16754 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16755 Src2s.
push_back(TempNode->getOperand(AddIdx));
16765 TempNode->getOperand(AddIdx), *Src0, *Src1,
16766 TempNode->getOperand(AddIdx)->getOperand(0),
16767 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16771 if (*IterIsSigned != *IsSigned)
16775 ChainLength =
I + 2;
16779 TempNode = TempNode->getOperand(AddIdx);
16781 ChainLength =
I + 1;
16782 if (TempNode->getNumOperands() < 2)
16784 LHS = TempNode->getOperand(0);
16785 RHS = TempNode->getOperand(1);
16788 if (ChainLength < 2)
16794 if (ChainLength < 4) {
16804 bool UseOriginalSrc =
false;
16805 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16806 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16807 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16808 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16809 SmallVector<unsigned, 4> SrcBytes;
16810 auto Src0Mask = Src0s.
begin()->PermMask;
16811 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16812 bool UniqueEntries =
true;
16813 for (
auto I = 1;
I < 4;
I++) {
16814 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16817 UniqueEntries =
false;
16823 if (UniqueEntries) {
16824 UseOriginalSrc =
true;
16826 auto *FirstElt = Src0s.
begin();
16830 auto *SecondElt = Src1s.
begin();
16832 SecondElt->DWordOffset);
16841 if (!UseOriginalSrc) {
16848 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16851 : Intrinsic::amdgcn_udot4,
16861 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16866 unsigned Opc =
LHS.getOpcode();
16878 auto Cond =
RHS.getOperand(0);
16883 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16900 DAGCombinerInfo &DCI)
const {
16901 SelectionDAG &DAG = DCI.DAG;
16903 EVT VT =
N->getValueType(0);
16916 SDNodeFlags ShlFlags = N1->
getFlags();
16920 SDNodeFlags NewShlFlags =
16925 DCI.AddToWorklist(Inner.
getNode());
16932 if (Subtarget->hasMad64_32()) {
16933 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16942 if (VT == MVT::i64) {
16943 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16956 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16957 Y->isDivergent() !=
Z->isDivergent()) {
16966 if (
Y->isDivergent())
16969 SDNodeFlags ReassocFlags =
16972 DCI.AddToWorklist(UniformInner.
getNode());
16980 DAGCombinerInfo &DCI)
const {
16981 SelectionDAG &DAG = DCI.DAG;
16982 EVT VT =
N->getValueType(0);
16984 if (VT == MVT::i64) {
16985 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16989 if (VT != MVT::i32)
16998 unsigned Opc =
RHS.getOpcode();
17005 auto Cond =
RHS.getOperand(0);
17010 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
17028SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
17029 DAGCombinerInfo &DCI)
const {
17031 if (
N->getValueType(0) != MVT::i32)
17037 SelectionDAG &DAG = DCI.DAG;
17042 unsigned LHSOpc =
LHS.getOpcode();
17043 unsigned Opc =
N->getOpcode();
17047 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
17053 DAGCombinerInfo &DCI)
const {
17057 SelectionDAG &DAG = DCI.DAG;
17058 EVT VT =
N->getValueType(0);
17070 if (
A ==
LHS.getOperand(1)) {
17071 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
17072 if (FusedOp != 0) {
17074 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
17082 if (
A ==
RHS.getOperand(1)) {
17083 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
17084 if (FusedOp != 0) {
17086 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
17095 DAGCombinerInfo &DCI)
const {
17099 SelectionDAG &DAG = DCI.DAG;
17101 EVT VT =
N->getValueType(0);
17114 if (
A ==
LHS.getOperand(1)) {
17115 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
17116 if (FusedOp != 0) {
17120 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
17129 if (
A ==
RHS.getOperand(1)) {
17130 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
17131 if (FusedOp != 0) {
17133 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
17142 DAGCombinerInfo &DCI)
const {
17143 SelectionDAG &DAG = DCI.DAG;
17145 EVT VT =
N->getValueType(0);
17154 SDNodeFlags
Flags =
N->getFlags();
17155 SDNodeFlags RHSFlags =
RHS->getFlags();
17161 bool IsNegative =
false;
17162 if (CLHS->isExactlyValue(1.0) ||
17163 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17169 DAG.
getNode(AMDGPUISD::RSQ, SL, VT,
RHS.getOperand(0), Flags);
17179 DAGCombinerInfo &DCI)
const {
17180 SelectionDAG &DAG = DCI.DAG;
17181 EVT VT =
N->getValueType(0);
17185 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
17186 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17201 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17206 const ConstantFPSDNode *FalseNode =
17216 if (ScalarVT == MVT::f32 &&
17222 if (TrueNodeExpVal == INT_MIN)
17225 if (FalseNodeExpVal == INT_MIN)
17245 DAGCombinerInfo &DCI)
const {
17246 SelectionDAG &DAG = DCI.DAG;
17247 EVT VT =
N->getValueType(0);
17250 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17268 (
N->getFlags().hasAllowContract() &&
17269 FMA->getFlags().hasAllowContract())) {
17303 if (Vec1 == Vec2 || Vec3 == Vec4)
17309 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17310 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17318 DAGCombinerInfo &DCI)
const {
17319 SelectionDAG &DAG = DCI.DAG;
17324 EVT VT =
LHS.getValueType();
17353 return LHS.getOperand(0);
17367 const APInt &CT =
LHS.getConstantOperandAPInt(1);
17368 const APInt &CF =
LHS.getConstantOperandAPInt(2);
17373 return DAG.
getNOT(SL,
LHS.getOperand(0), MVT::i1);
17376 return LHS.getOperand(0);
17397 if (VT == MVT::i64) {
17409 const std::optional<bool> KnownEq =
17417 const std::optional<bool> KnownEq =
17428 const std::optional<bool> KnownUge =
17448 const std::optional<bool> KnownUle =
17499 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17504 {Op0Hi, Op1Hi, CarryInHi});
17514 DCI.CombineTo(
LHS.getNode(), Result);
17518 if (VT != MVT::f32 && VT != MVT::f64 &&
17519 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17534 const unsigned IsInfMask =
17536 const unsigned IsFiniteMask =
17541 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
17550SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
17551 DAGCombinerInfo &DCI)
const {
17552 SelectionDAG &DAG = DCI.DAG;
17554 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17573 unsigned ShiftOffset = 8 *
Offset;
17575 ShiftOffset -=
C->getZExtValue();
17577 ShiftOffset +=
C->getZExtValue();
17579 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17580 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17581 MVT::f32, Shifted);
17592 DCI.AddToWorklist(
N);
17599 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
17605 DAGCombinerInfo &DCI)
const {
17610 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17614 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17615 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
17618 APFloat One(
F.getSemantics(),
"1.0");
17620 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
17626 DAGCombinerInfo &DCI)
const {
17647 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
17648 bool isInteger =
LHS.getValueType().isInteger();
17651 if (!isFloatingPoint && !isInteger)
17656 if (!isEquality && !isNonEquality)
17673 if (isFloatingPoint) {
17675 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17686 if (!(isEquality && TrueVal == ConstVal) &&
17687 !(isNonEquality && FalseVal == ConstVal))
17694 SelectLHS, SelectRHS);
17699 switch (
N->getOpcode()) {
17715 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
17725 switch (
N->getOpcode()) {
17727 return performAddCombine(
N, DCI);
17729 return performPtrAddCombine(
N, DCI);
17731 return performSubCombine(
N, DCI);
17734 return performAddCarrySubCarryCombine(
N, DCI);
17736 return performFAddCombine(
N, DCI);
17738 return performFSubCombine(
N, DCI);
17740 return performFDivCombine(
N, DCI);
17742 return performFMulCombine(
N, DCI);
17744 return performSetCCCombine(
N, DCI);
17746 if (
auto Res = performSelectCombine(
N, DCI))
17761 case AMDGPUISD::FMIN_LEGACY:
17762 case AMDGPUISD::FMAX_LEGACY:
17763 return performMinMaxCombine(
N, DCI);
17765 return performFMACombine(
N, DCI);
17767 return performAndCombine(
N, DCI);
17769 return performOrCombine(
N, DCI);
17772 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
17773 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17779 return performXorCombine(
N, DCI);
17782 return performZeroOrAnyExtendCombine(
N, DCI);
17784 return performSignExtendInRegCombine(
N, DCI);
17785 case AMDGPUISD::FP_CLASS:
17786 return performClassCombine(
N, DCI);
17788 return performFCanonicalizeCombine(
N, DCI);
17789 case AMDGPUISD::RCP:
17790 return performRcpCombine(
N, DCI);
17792 case AMDGPUISD::FRACT:
17793 case AMDGPUISD::RSQ:
17794 case AMDGPUISD::RCP_LEGACY:
17795 case AMDGPUISD::RCP_IFLAG:
17796 case AMDGPUISD::RSQ_CLAMP: {
17805 return performUCharToFloatCombine(
N, DCI);
17807 return performFCopySignCombine(
N, DCI);
17808 case AMDGPUISD::CVT_F32_UBYTE0:
17809 case AMDGPUISD::CVT_F32_UBYTE1:
17810 case AMDGPUISD::CVT_F32_UBYTE2:
17811 case AMDGPUISD::CVT_F32_UBYTE3:
17812 return performCvtF32UByteNCombine(
N, DCI);
17813 case AMDGPUISD::FMED3:
17814 return performFMed3Combine(
N, DCI);
17815 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17816 return performCvtPkRTZCombine(
N, DCI);
17817 case AMDGPUISD::CLAMP:
17818 return performClampCombine(
N, DCI);
17821 EVT VT =
N->getValueType(0);
17824 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17827 EVT EltVT = Src.getValueType();
17828 if (EltVT != MVT::i16)
17838 return performExtractVectorEltCombine(
N, DCI);
17840 return performInsertVectorEltCombine(
N, DCI);
17842 return performFPRoundCombine(
N, DCI);
17851 return performMemSDNodeCombine(MemNode, DCI);
17882 unsigned Opcode =
Node->getMachineOpcode();
17885 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17886 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
17889 SDNode *
Users[5] = {
nullptr};
17891 unsigned DmaskIdx =
17892 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17893 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
17894 unsigned NewDmask = 0;
17895 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17896 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17897 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
17898 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
17899 unsigned TFCLane = 0;
17900 bool HasChain =
Node->getNumValues() > 1;
17902 if (OldDmask == 0) {
17910 TFCLane = OldBitsSet;
17914 for (SDUse &Use :
Node->uses()) {
17917 if (
Use.getResNo() != 0)
17920 SDNode *
User =
Use.getUser();
17923 if (!
User->isMachineOpcode() ||
17924 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17936 if (UsesTFC && Lane == TFCLane) {
17941 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17943 Dmask &= ~(1 << Comp);
17951 NewDmask |= 1 << Comp;
17956 bool NoChannels = !NewDmask;
17963 if (OldBitsSet == 1)
17969 if (NewDmask == OldDmask)
17978 unsigned NewChannels = BitsSet + UsesTFC;
17982 assert(NewOpcode != -1 &&
17983 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
17984 "failed to find equivalent MIMG op");
17992 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
17994 MVT ResultVT = NewChannels == 1
17997 : NewChannels == 5 ? 8
17999 SDVTList NewVTList =
18002 MachineSDNode *NewNode =
18011 if (NewChannels == 1) {
18021 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18026 if (i || !NoChannels)
18031 if (NewUser != User) {
18041 Idx = AMDGPU::sub1;
18044 Idx = AMDGPU::sub2;
18047 Idx = AMDGPU::sub3;
18050 Idx = AMDGPU::sub4;
18061 Op =
Op.getOperand(0);
18086 Node->getOperand(0), SL, VReg, SrcVal,
18092 return ToResultReg.
getNode();
18097 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
18099 Ops.push_back(
Node->getOperand(i));
18105 Node->getOperand(i).getValueType(),
18106 Node->getOperand(i)),
18118 unsigned Opcode =
Node->getMachineOpcode();
18120 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
18121 !
TII->isGather4(Opcode) &&
18123 return adjustWritemask(
Node, DAG);
18126 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18132 case AMDGPU::V_DIV_SCALE_F32_e64:
18133 case AMDGPU::V_DIV_SCALE_F64_e64: {
18143 (Src0 == Src1 || Src0 == Src2))
18199 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
18200 unsigned InitIdx = 0;
18202 if (
TII->isImage(
MI)) {
18210 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
18211 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
18212 unsigned D16Val = D16 ? D16->getImm() : 0;
18214 if (!TFEVal && !LWEVal)
18225 assert(MO_Dmask &&
"Expected dmask operand in instruction");
18227 unsigned dmask = MO_Dmask->
getImm();
18232 bool Packed = !Subtarget->hasUnpackedD16VMem();
18234 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18241 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
18242 if (DstSize < InitIdx)
18246 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
18255 unsigned NewDst = 0;
18260 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18261 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18264 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18285 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
18297 if (
TII->isVOP3(
MI.getOpcode())) {
18299 TII->legalizeOperandsVOP3(MRI,
MI);
18301 if (
TII->isMAI(
MI)) {
18306 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18307 AMDGPU::OpName::scale_src0);
18308 if (Src0Idx != -1) {
18309 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18310 AMDGPU::OpName::scale_src1);
18311 if (
TII->usesConstantBus(MRI,
MI, Src0Idx) &&
18312 TII->usesConstantBus(MRI,
MI, Src1Idx))
18313 TII->legalizeOpWithMove(
MI, Src1Idx);
18320 if (
TII->isImage(
MI))
18321 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
18395std::pair<unsigned, const TargetRegisterClass *>
18402 if (Constraint.
size() == 1) {
18406 if (VT == MVT::Other)
18409 switch (Constraint[0]) {
18416 RC = &AMDGPU::SReg_32RegClass;
18419 RC = &AMDGPU::SGPR_64RegClass;
18424 return std::pair(0U,
nullptr);
18431 return std::pair(0U,
nullptr);
18433 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18434 : &AMDGPU::VGPR_32_Lo256RegClass;
18437 RC = Subtarget->has1024AddressableVGPRs()
18438 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
18441 return std::pair(0U,
nullptr);
18446 if (!Subtarget->hasMAIInsts())
18450 return std::pair(0U,
nullptr);
18452 RC = &AMDGPU::AGPR_32RegClass;
18457 return std::pair(0U,
nullptr);
18462 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
18466 RC = &AMDGPU::AV_32RegClass;
18469 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
18471 return std::pair(0U,
nullptr);
18480 return std::pair(0U, RC);
18483 if (Kind !=
'\0') {
18485 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18486 }
else if (Kind ==
's') {
18487 RC = &AMDGPU::SGPR_32RegClass;
18488 }
else if (Kind ==
'a') {
18489 RC = &AMDGPU::AGPR_32RegClass;
18495 return std::pair(0U,
nullptr);
18501 return std::pair(0U,
nullptr);
18505 RC =
TRI->getVGPRClassForBitWidth(Width);
18507 RC =
TRI->getSGPRClassForBitWidth(Width);
18509 RC =
TRI->getAGPRClassForBitWidth(Width);
18511 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18516 return std::pair(0U,
nullptr);
18518 return std::pair(Reg, RC);
18524 return std::pair(0U,
nullptr);
18525 if (Idx < RC->getNumRegs())
18527 return std::pair(0U,
nullptr);
18533 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
18539 if (Constraint.
size() == 1) {
18540 switch (Constraint[0]) {
18550 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
18558 if (Constraint.
size() == 1) {
18559 switch (Constraint[0]) {
18567 }
else if (Constraint.
size() == 2) {
18568 if (Constraint ==
"VA")
18586 std::vector<SDValue> &
Ops,
18601 unsigned Size =
Op.getScalarValueSizeInBits();
18605 if (
Size == 16 && !Subtarget->has16BitInsts())
18609 Val =
C->getSExtValue();
18613 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18617 if (
Size != 16 ||
Op.getNumOperands() != 2)
18619 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
18622 Val =
C->getSExtValue();
18626 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18636 if (Constraint.
size() == 1) {
18637 switch (Constraint[0]) {
18652 }
else if (Constraint.
size() == 2) {
18653 if (Constraint ==
"DA") {
18654 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
18655 int64_t LoBits =
static_cast<int32_t
>(Val);
18659 if (Constraint ==
"DB") {
18667 unsigned MaxSize)
const {
18668 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
18669 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18671 MVT VT =
Op.getSimpleValueType();
18696 switch (UnalignedClassID) {
18697 case AMDGPU::VReg_64RegClassID:
18698 return AMDGPU::VReg_64_Align2RegClassID;
18699 case AMDGPU::VReg_96RegClassID:
18700 return AMDGPU::VReg_96_Align2RegClassID;
18701 case AMDGPU::VReg_128RegClassID:
18702 return AMDGPU::VReg_128_Align2RegClassID;
18703 case AMDGPU::VReg_160RegClassID:
18704 return AMDGPU::VReg_160_Align2RegClassID;
18705 case AMDGPU::VReg_192RegClassID:
18706 return AMDGPU::VReg_192_Align2RegClassID;
18707 case AMDGPU::VReg_224RegClassID:
18708 return AMDGPU::VReg_224_Align2RegClassID;
18709 case AMDGPU::VReg_256RegClassID:
18710 return AMDGPU::VReg_256_Align2RegClassID;
18711 case AMDGPU::VReg_288RegClassID:
18712 return AMDGPU::VReg_288_Align2RegClassID;
18713 case AMDGPU::VReg_320RegClassID:
18714 return AMDGPU::VReg_320_Align2RegClassID;
18715 case AMDGPU::VReg_352RegClassID:
18716 return AMDGPU::VReg_352_Align2RegClassID;
18717 case AMDGPU::VReg_384RegClassID:
18718 return AMDGPU::VReg_384_Align2RegClassID;
18719 case AMDGPU::VReg_512RegClassID:
18720 return AMDGPU::VReg_512_Align2RegClassID;
18721 case AMDGPU::VReg_1024RegClassID:
18722 return AMDGPU::VReg_1024_Align2RegClassID;
18723 case AMDGPU::AReg_64RegClassID:
18724 return AMDGPU::AReg_64_Align2RegClassID;
18725 case AMDGPU::AReg_96RegClassID:
18726 return AMDGPU::AReg_96_Align2RegClassID;
18727 case AMDGPU::AReg_128RegClassID:
18728 return AMDGPU::AReg_128_Align2RegClassID;
18729 case AMDGPU::AReg_160RegClassID:
18730 return AMDGPU::AReg_160_Align2RegClassID;
18731 case AMDGPU::AReg_192RegClassID:
18732 return AMDGPU::AReg_192_Align2RegClassID;
18733 case AMDGPU::AReg_256RegClassID:
18734 return AMDGPU::AReg_256_Align2RegClassID;
18735 case AMDGPU::AReg_512RegClassID:
18736 return AMDGPU::AReg_512_Align2RegClassID;
18737 case AMDGPU::AReg_1024RegClassID:
18738 return AMDGPU::AReg_1024_Align2RegClassID;
18754 if (Info->isEntryFunction()) {
18761 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18763 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18764 :
TRI->getAlignedHighSGPRForRC(MF, 2,
18765 &AMDGPU::SGPR_64RegClass);
18766 Info->setSGPRForEXECCopy(SReg);
18768 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
18769 Info->getStackPtrOffsetReg()));
18770 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18771 MRI.
replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18775 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18776 MRI.
replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18778 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18781 Info->limitOccupancy(MF);
18783 if (ST.isWave32() && !MF.
empty()) {
18784 for (
auto &
MBB : MF) {
18785 for (
auto &
MI :
MBB) {
18786 TII->fixImplicitOperands(
MI);
18796 if (ST.needsAlignedVGPRs()) {
18803 if (NewClassID != -1)
18813 const APInt &DemandedElts,
18815 unsigned Depth)
const {
18817 unsigned Opc =
Op.getOpcode();
18820 unsigned IID =
Op.getConstantOperandVal(0);
18822 case Intrinsic::amdgcn_mbcnt_lo:
18823 case Intrinsic::amdgcn_mbcnt_hi: {
18829 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18839 Op, Known, DemandedElts, DAG,
Depth);
18855 unsigned MaxValue =
18862 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
18866 unsigned Src1Cst = 0;
18867 if (Src1.
isImm()) {
18868 Src1Cst = Src1.
getImm();
18869 }
else if (Src1.
isReg()) {
18873 Src1Cst = Cst->Value.getZExtValue();
18884 if (Width >= BFEWidth)
18893 Known = Known.
sext(BFEWidth);
18895 Known = Known.
zext(BFEWidth);
18901 unsigned Depth)
const {
18904 switch (
MI->getOpcode()) {
18905 case AMDGPU::S_BFE_I32:
18908 case AMDGPU::S_BFE_U32:
18911 case AMDGPU::S_BFE_I64:
18914 case AMDGPU::S_BFE_U64:
18917 case AMDGPU::G_INTRINSIC:
18918 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18921 case Intrinsic::amdgcn_workitem_id_x:
18924 case Intrinsic::amdgcn_workitem_id_y:
18927 case Intrinsic::amdgcn_workitem_id_z:
18930 case Intrinsic::amdgcn_mbcnt_lo:
18931 case Intrinsic::amdgcn_mbcnt_hi: {
18943 case Intrinsic::amdgcn_groupstaticsize: {
18954 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18957 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18960 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18965 case AMDGPU::G_AMDGPU_SMED3:
18966 case AMDGPU::G_AMDGPU_UMED3: {
18967 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
18994 unsigned Depth)
const {
19001 AttributeList Attrs =
19003 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
19021 if (Header->getAlignment() != PrefAlign)
19022 return Header->getAlignment();
19023 if (needsFetchWindowAlignment(*Header))
19044 if (Header->getAlignment() != PrefAlign)
19045 return Header->getAlignment();
19047 unsigned LoopSize = 0;
19052 LoopSize +=
MBB->getAlignment().value() / 2;
19055 LoopSize +=
TII->getInstSizeInBytes(
MI);
19056 if (LoopSize > 192)
19061 if (LoopSize <= 64)
19064 if (LoopSize <= 128)
19065 return CacheLineAlign;
19071 auto I = Exit->getFirstNonDebugInstr();
19072 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19073 return CacheLineAlign;
19082 if (PreTerm == Pre->
begin() ||
19083 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19087 auto ExitHead = Exit->getFirstNonDebugInstr();
19088 if (ExitHead == Exit->end() ||
19089 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19094 return CacheLineAlign;
19102 if (needsFetchWindowAlignment(*
MBB))
19107bool SITargetLowering::needsFetchWindowAlignment(
19109 if (!
getSubtarget()->hasLoopHeadInstSplitSensitivity())
19113 if (
MI.isMetaInstruction())
19116 return TII->getInstSizeInBytes(
MI) > 4;
19126 N =
N->getOperand(0).getNode();
19136 switch (
N->getOpcode()) {
19144 if (Reg.isPhysical() || MRI.
isLiveIn(Reg))
19145 return !
TRI->isSGPRReg(MRI, Reg);
19151 return !
TRI->isSGPRReg(MRI, Reg);
19155 unsigned AS = L->getAddressSpace();
19165 case AMDGPUISD::ATOMIC_CMP_SWAP:
19166 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19167 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19168 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19169 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19170 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19171 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19172 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19173 case AMDGPUISD::BUFFER_ATOMIC_AND:
19174 case AMDGPUISD::BUFFER_ATOMIC_OR:
19175 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19176 case AMDGPUISD::BUFFER_ATOMIC_INC:
19177 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19178 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19179 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19180 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19181 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19187 return A->readMem() &&
A->writeMem();
19208 switch (Ty.getScalarSizeInBits()) {
19220 const APInt &DemandedElts,
19223 unsigned Depth)
const {
19224 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
19228 if (Info->getMode().DX10Clamp)
19240 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
19260 <<
"Hardware instruction generated for atomic "
19262 <<
" operation at memory scope " << MemScope;
19267 Type *EltTy = VT->getElementType();
19268 return VT->getNumElements() == 2 &&
19288 unsigned BW =
IT->getBitWidth();
19289 return BW == 32 || BW == 64;
19303 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
19304 return BW == 32 || BW == 64;
19307 if (Ty->isFloatTy() || Ty->isDoubleTy())
19311 return VT->getNumElements() == 2 &&
19312 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19322 bool HasSystemScope) {
19329 if (HasSystemScope) {
19330 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19333 if (Subtarget.hasEmulatedSystemScopeAtomics())
19335 }
else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19338 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
19351 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
19359 return STI.hasGloballyAddressableScratch()
19377 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
19390 bool HasSystemScope =
19422 if (!
IT ||
IT->getBitWidth() != 32)
19428 if (Subtarget->hasEmulatedSystemScopeAtomics())
19444 if (!HasSystemScope &&
19445 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19457 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
19465 ConstVal && ConstVal->isNullValue())
19503 if (Ty->isFloatTy()) {
19508 if (Ty->isDoubleTy()) {
19529 if (Ty->isFloatTy() &&
19530 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19543 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19547 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
19551 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19556 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
19561 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19565 if (Ty->isFloatTy()) {
19568 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19571 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19576 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19584 if (Subtarget->hasFlatAtomicFaddF32Inst())
19593 if (Subtarget->hasLDSFPAtomicAddF32()) {
19594 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19596 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19624 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19626 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19630 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19632 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19686 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19687 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19688 : &AMDGPU::SReg_32RegClass;
19689 if (!
TRI->isSGPRClass(RC) && !isDivergent)
19690 return TRI->getEquivalentSGPRClass(RC);
19691 if (
TRI->isSGPRClass(RC) && isDivergent) {
19692 if (Subtarget->hasGFX90AInsts())
19693 return TRI->getEquivalentAVClass(RC);
19694 return TRI->getEquivalentVGPRClass(RC);
19707 unsigned WaveSize) {
19712 if (!
IT ||
IT->getBitWidth() != WaveSize)
19717 if (!Visited.
insert(V).second)
19719 bool Result =
false;
19720 for (
const auto *U : V->users()) {
19722 if (V == U->getOperand(1)) {
19727 case Intrinsic::amdgcn_if_break:
19728 case Intrinsic::amdgcn_if:
19729 case Intrinsic::amdgcn_else:
19734 if (V == U->getOperand(0)) {
19739 case Intrinsic::amdgcn_end_cf:
19740 case Intrinsic::amdgcn_loop:
19746 Result =
hasCFUser(U, Visited, WaveSize);
19755 const Value *V)
const {
19757 if (CI->isInlineAsm()) {
19766 for (
auto &TC : TargetConstraints) {
19780 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19815 if (
I.getMetadata(
"amdgpu.noclobber"))
19817 if (
I.getMetadata(
"amdgpu.last.use"))
19881 Alignment = RMW->getAlign();
19894 bool FullFlatEmulation =
19896 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19897 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19898 RMW->getType()->isDoubleTy()));
19901 bool ReturnValueIsUsed = !AI->
use_empty();
19910 if (FullFlatEmulation) {
19921 std::prev(BB->
end())->eraseFromParent();
19922 Builder.SetInsertPoint(BB);
19924 Value *LoadedShared =
nullptr;
19925 if (FullFlatEmulation) {
19926 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19927 {Addr},
nullptr,
"is.shared");
19928 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19929 Builder.SetInsertPoint(SharedBB);
19930 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19936 LoadedShared = Clone;
19938 Builder.CreateBr(PhiBB);
19939 Builder.SetInsertPoint(CheckPrivateBB);
19942 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19943 {Addr},
nullptr,
"is.private");
19944 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19946 Builder.SetInsertPoint(PrivateBB);
19948 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19951 Value *LoadedPrivate;
19953 LoadedPrivate = Builder.CreateAlignedLoad(
19954 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19957 LoadedPrivate, RMW->getValOperand());
19959 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19961 auto [ResultLoad, Equal] =
19967 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19970 Builder.CreateBr(PhiBB);
19972 Builder.SetInsertPoint(GlobalBB);
19976 if (FullFlatEmulation) {
19977 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19986 if (!FullFlatEmulation) {
19991 MDNode *RangeNotPrivate =
19994 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
19998 Builder.CreateBr(PhiBB);
20000 Builder.SetInsertPoint(PhiBB);
20002 if (ReturnValueIsUsed) {
20005 if (FullFlatEmulation)
20006 Loaded->addIncoming(LoadedShared, SharedBB);
20007 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20008 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20009 Loaded->takeName(AI);
20012 Builder.CreateBr(ExitBB);
20016 unsigned PtrOpIdx) {
20017 Value *PtrOp =
I->getOperand(PtrOpIdx);
20024 I->setOperand(PtrOpIdx, ASCast);
20036 ConstVal && ConstVal->isNullValue()) {
20066 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20074 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20089 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metrics from passes.
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
uint32_t getLDSSize() const
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering.
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set, or Regs.size() if they are all allocated.
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly not equal, like -0.0 and 0.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node N can be combined with other operations to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a `BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its APInt value and def register.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing formal return value.
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
unsigned getBitWidth() const
Get the bit width of this value.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const