43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
55#define DEBUG_TYPE "si-lower"
61 cl::desc(
"Do not align and prefetch loops"),
65 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
66 cl::desc(
"Use indirect register addressing for divergent indexes"),
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
83 return AMDGPU::SGPR0 +
Reg;
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
208 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
209 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
210 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
211 MVT::i1, MVT::v32i32},
215 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
216 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
217 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
218 MVT::i1, MVT::v32i32},
286 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
293 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
294 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
295 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
298 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
299 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
300 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
305 MVT::v3i16, MVT::v4i16, MVT::Other},
310 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
326 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
327 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
328 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
329 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
330 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
331 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
332 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
333 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
365 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
379 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
393 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
407 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
421 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
436 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
437 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
440 if (Subtarget->hasPkMovB32()) {
461 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
462 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
467 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
471 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
472 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
473 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
474 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
498 if (Subtarget->hasSMemRealTime() ||
503 if (Subtarget->has16BitInsts()) {
513 if (Subtarget->hasMadMacF32Insts())
530 if (Subtarget->hasIntClamp())
533 if (Subtarget->hasAddNoCarryInsts())
539 {MVT::f32, MVT::f64},
Custom);
545 {MVT::f32, MVT::f64},
Legal);
547 if (Subtarget->haveRoundOpsF64())
577 if (Subtarget->has16BitInsts()) {
630 if (Subtarget->hasBF16TransInsts())
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
790 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
791 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
792 MVT::v32f16, MVT::v32bf16},
802 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
806 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
810 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
811 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
819 if (Subtarget->hasVOP3PInsts()) {
830 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
833 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
834 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
835 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
838 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
846 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
853 {MVT::v2f16, MVT::v4f16},
Custom);
859 if (Subtarget->hasBF16PackedInsts()) {
860 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
866 if (Subtarget->hasPackedFP32Ops()) {
870 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
877 if (Subtarget->has16BitInsts()) {
890 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
891 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
892 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
893 MVT::v32f16, MVT::v32bf16},
898 if (Subtarget->hasVectorMulU64())
900 else if (Subtarget->hasScalarSMulU64())
903 if (Subtarget->hasMad64_32())
906 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
909 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
911 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
914 if (Subtarget->hasMinimum3Maximum3F32())
917 if (Subtarget->hasMinimum3Maximum3PKF16()) {
921 if (!Subtarget->hasMinimum3Maximum3F16())
926 if (Subtarget->hasVOP3PInsts()) {
929 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
933 if (Subtarget->hasIntMinMax64())
938 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
939 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
944 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
945 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
946 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
947 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
951 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
952 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
953 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
954 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
969 if (Subtarget->hasBF16ConversionInsts()) {
974 if (Subtarget->hasBF16PackedInsts()) {
980 if (Subtarget->hasBF16TransInsts()) {
984 if (Subtarget->hasCvtPkF16F32Inst()) {
986 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1037 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1078 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1091 EVT DestVT,
EVT SrcVT)
const {
1093 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1094 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1096 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1103 LLT DestTy,
LLT SrcTy)
const {
1104 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1105 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1107 SrcTy.getScalarSizeInBits() == 16 &&
1128 return Subtarget->has16BitInsts()
1134 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1138 if (!Subtarget->has16BitInsts() && VT.
getSizeInBits() == 16)
1160 return (NumElts + 1) / 2;
1166 return NumElts * ((
Size + 31) / 32);
1175 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1184 MVT SimpleIntermediateVT =
1186 IntermediateVT = SimpleIntermediateVT;
1187 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1188 NumIntermediates = (NumElts + 1) / 2;
1189 return (NumElts + 1) / 2;
1194 IntermediateVT = RegisterVT;
1195 NumIntermediates = NumElts;
1196 return NumIntermediates;
1201 RegisterVT = MVT::i16;
1202 IntermediateVT = ScalarVT;
1203 NumIntermediates = NumElts;
1204 return NumIntermediates;
1208 RegisterVT = MVT::i32;
1209 IntermediateVT = ScalarVT;
1210 NumIntermediates = NumElts;
1211 return NumIntermediates;
1215 RegisterVT = MVT::i32;
1216 IntermediateVT = RegisterVT;
1217 NumIntermediates = NumElts * ((
Size + 31) / 32);
1218 return NumIntermediates;
1223 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1228 unsigned MaxNumLanes) {
1229 assert(MaxNumLanes != 0);
1233 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1244 unsigned MaxNumLanes) {
1250 assert(ST->getNumContainedTypes() == 2 &&
1251 ST->getContainedType(1)->isIntegerTy(32));
1265 return MVT::amdgpuBufferFatPointer;
1267 DL.getPointerSizeInBits(AS) == 192)
1268 return MVT::amdgpuBufferStridedPointer;
1277 DL.getPointerSizeInBits(AS) == 160) ||
1279 DL.getPointerSizeInBits(AS) == 192))
1286 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1287 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1288 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1290 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1291 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1292 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1293 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1294 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1295 case Intrinsic::amdgcn_flat_load_monitor_b32:
1296 case Intrinsic::amdgcn_global_load_monitor_b32:
1298 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1299 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1300 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1301 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1302 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1303 case Intrinsic::amdgcn_flat_load_monitor_b64:
1304 case Intrinsic::amdgcn_global_load_monitor_b64:
1306 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1307 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1308 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1309 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1310 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1311 case Intrinsic::amdgcn_flat_load_monitor_b128:
1312 case Intrinsic::amdgcn_global_load_monitor_b128:
1348 unsigned IntrID)
const {
1350 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1377 if (RsrcIntr->IsImage) {
1392 Info.ptrVal = RsrcArg;
1396 if (RsrcIntr->IsImage) {
1397 unsigned MaxNumLanes = 4;
1412 std::numeric_limits<unsigned>::max());
1422 if (RsrcIntr->IsImage) {
1442 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1444 Info.memVT = MVT::i32;
1451 case Intrinsic::amdgcn_raw_buffer_load_lds:
1452 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1453 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1454 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1455 case Intrinsic::amdgcn_struct_buffer_load_lds:
1456 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1457 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1458 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1472 CI.
getContext(), Width * 8 * Subtarget->getWavefrontSize());
1481 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1482 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1483 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1484 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1487 std::numeric_limits<unsigned>::max());
1500 case Intrinsic::amdgcn_ds_ordered_add:
1501 case Intrinsic::amdgcn_ds_ordered_swap: {
1515 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1516 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1519 Info.ptrVal =
nullptr;
1525 case Intrinsic::amdgcn_ds_append:
1526 case Intrinsic::amdgcn_ds_consume: {
1540 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1541 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1542 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1547 Info.memVT = MVT::i64;
1554 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1555 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1556 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1559 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1562 ->getElementType(0));
1571 case Intrinsic::amdgcn_global_atomic_fmin_num:
1572 case Intrinsic::amdgcn_global_atomic_fmax_num:
1573 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1574 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1575 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1586 case Intrinsic::amdgcn_cluster_load_b32:
1587 case Intrinsic::amdgcn_cluster_load_b64:
1588 case Intrinsic::amdgcn_cluster_load_b128:
1589 case Intrinsic::amdgcn_ds_load_tr6_b96:
1590 case Intrinsic::amdgcn_ds_load_tr4_b64:
1591 case Intrinsic::amdgcn_ds_load_tr8_b64:
1592 case Intrinsic::amdgcn_ds_load_tr16_b128:
1593 case Intrinsic::amdgcn_global_load_tr6_b96:
1594 case Intrinsic::amdgcn_global_load_tr4_b64:
1595 case Intrinsic::amdgcn_global_load_tr_b64:
1596 case Intrinsic::amdgcn_global_load_tr_b128:
1597 case Intrinsic::amdgcn_ds_read_tr4_b64:
1598 case Intrinsic::amdgcn_ds_read_tr6_b96:
1599 case Intrinsic::amdgcn_ds_read_tr8_b64:
1600 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1609 case Intrinsic::amdgcn_flat_load_monitor_b32:
1610 case Intrinsic::amdgcn_flat_load_monitor_b64:
1611 case Intrinsic::amdgcn_flat_load_monitor_b128:
1612 case Intrinsic::amdgcn_global_load_monitor_b32:
1613 case Intrinsic::amdgcn_global_load_monitor_b64:
1614 case Intrinsic::amdgcn_global_load_monitor_b128: {
1625 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1626 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1627 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1638 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1639 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1640 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1651 case Intrinsic::amdgcn_ds_gws_init:
1652 case Intrinsic::amdgcn_ds_gws_barrier:
1653 case Intrinsic::amdgcn_ds_gws_sema_v:
1654 case Intrinsic::amdgcn_ds_gws_sema_br:
1655 case Intrinsic::amdgcn_ds_gws_sema_p:
1656 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1666 Info.memVT = MVT::i32;
1668 Info.align =
Align(4);
1670 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1677 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1678 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1679 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1680 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1681 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1682 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1683 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1684 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1699 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1700 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1701 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1702 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1717 case Intrinsic::amdgcn_load_to_lds:
1718 case Intrinsic::amdgcn_load_async_to_lds:
1719 case Intrinsic::amdgcn_global_load_lds:
1720 case Intrinsic::amdgcn_global_load_async_lds: {
1739 Width * 8 * Subtarget->getWavefrontSize());
1745 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1746 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1747 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1748 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1758 Info.memVT = MVT::i32;
1760 Info.align =
Align(4);
1766 case Intrinsic::amdgcn_s_prefetch_data:
1767 case Intrinsic::amdgcn_flat_prefetch:
1768 case Intrinsic::amdgcn_global_prefetch: {
1784 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1787 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1788 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1800 Type *&AccessTy)
const {
1801 Value *Ptr =
nullptr;
1802 switch (
II->getIntrinsicID()) {
1803 case Intrinsic::amdgcn_cluster_load_b128:
1804 case Intrinsic::amdgcn_cluster_load_b64:
1805 case Intrinsic::amdgcn_cluster_load_b32:
1806 case Intrinsic::amdgcn_ds_append:
1807 case Intrinsic::amdgcn_ds_consume:
1808 case Intrinsic::amdgcn_ds_load_tr8_b64:
1809 case Intrinsic::amdgcn_ds_load_tr16_b128:
1810 case Intrinsic::amdgcn_ds_load_tr4_b64:
1811 case Intrinsic::amdgcn_ds_load_tr6_b96:
1812 case Intrinsic::amdgcn_ds_read_tr4_b64:
1813 case Intrinsic::amdgcn_ds_read_tr6_b96:
1814 case Intrinsic::amdgcn_ds_read_tr8_b64:
1815 case Intrinsic::amdgcn_ds_read_tr16_b64:
1816 case Intrinsic::amdgcn_ds_ordered_add:
1817 case Intrinsic::amdgcn_ds_ordered_swap:
1818 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1819 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1820 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1821 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1822 case Intrinsic::amdgcn_global_atomic_fmax_num:
1823 case Intrinsic::amdgcn_global_atomic_fmin_num:
1824 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1825 case Intrinsic::amdgcn_global_load_tr_b64:
1826 case Intrinsic::amdgcn_global_load_tr_b128:
1827 case Intrinsic::amdgcn_global_load_tr4_b64:
1828 case Intrinsic::amdgcn_global_load_tr6_b96:
1829 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1830 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1831 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1832 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1833 Ptr =
II->getArgOperand(0);
1835 case Intrinsic::amdgcn_load_to_lds:
1836 case Intrinsic::amdgcn_load_async_to_lds:
1837 case Intrinsic::amdgcn_global_load_lds:
1838 case Intrinsic::amdgcn_global_load_async_lds:
1839 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1840 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1841 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1842 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1843 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1844 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1845 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1846 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1847 Ptr =
II->getArgOperand(1);
1852 AccessTy =
II->getType();
1858 unsigned AddrSpace)
const {
1859 if (!Subtarget->hasFlatInstOffsets()) {
1870 return AM.
Scale == 0 &&
1871 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1872 AM.
BaseOffs, AddrSpace, FlatVariant));
1876 if (Subtarget->hasFlatGlobalInsts())
1879 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1892 return isLegalMUBUFAddressingMode(AM);
1895bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1906 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1918 if (AM.HasBaseReg) {
1950 return isLegalMUBUFAddressingMode(AM);
1952 if (!Subtarget->hasScalarSubwordLoads()) {
1957 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
2005 return Subtarget->hasFlatScratchEnabled()
2007 : isLegalMUBUFAddressingMode(AM);
2054 unsigned Size,
unsigned AddrSpace,
Align Alignment,
2063 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
2066 Align RequiredAlignment(
2068 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
Size > 32 &&
2069 Alignment < RequiredAlignment)
2084 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2090 RequiredAlignment =
Align(4);
2092 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2108 *IsFast = (Alignment >= RequiredAlignment) ? 64
2109 : (Alignment <
Align(4)) ? 32
2116 if (!Subtarget->hasDS96AndDS128())
2122 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2131 *IsFast = (Alignment >= RequiredAlignment) ? 96
2132 : (Alignment <
Align(4)) ? 32
2139 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2145 RequiredAlignment =
Align(8);
2147 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2156 *IsFast = (Alignment >= RequiredAlignment) ? 128
2157 : (Alignment <
Align(4)) ? 32
2174 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2176 return Alignment >= RequiredAlignment ||
2177 Subtarget->hasUnalignedDSAccessEnabled();
2185 bool AlignedBy4 = Alignment >=
Align(4);
2186 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2188 *IsFast = AlignedBy4 ?
Size : 1;
2193 *IsFast = AlignedBy4;
2204 return Alignment >=
Align(4) ||
2205 Subtarget->hasUnalignedBufferAccessEnabled();
2217 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2232 return Size >= 32 && Alignment >=
Align(4);
2237 unsigned *IsFast)
const {
2239 Alignment, Flags, IsFast);
2244 const AttributeList &FuncAttributes)
const {
2250 if (
Op.size() >= 16 &&
2254 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2272 unsigned DestAS)
const {
2275 Subtarget->hasGloballyAddressableScratch()) {
2305 unsigned Index)
const {
2317 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2322 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2357 auto [InputPtrReg, RC, ArgTy] =
2367 Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2373 const SDLoc &SL)
const {
2380 const SDLoc &SL)
const {
2383 std::optional<uint32_t> KnownSize =
2385 if (KnownSize.has_value())
2412 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2427SDValue SITargetLowering::lowerKernargMemParameter(
2432 MachinePointerInfo PtrInfo =
2441 int64_t OffsetDiff =
Offset - AlignDownOffset;
2447 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2458 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2463 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2468 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2477 const SDLoc &SL)
const {
2546 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2549 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2550 if (ConvertedVal == ArgValue)
2551 return ConvertedVal;
2556SDValue SITargetLowering::lowerWorkGroupId(
2561 if (!Subtarget->hasClusters())
2562 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2570 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2571 SDLoc SL(ClusterIdXYZ);
2572 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2575 SDValue ClusterWorkGroupIdXYZ =
2576 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2586 return ClusterIdXYZ;
2588 using namespace AMDGPU::Hwreg;
2592 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2603SDValue SITargetLowering::getPreloadedValue(
2606 const ArgDescriptor *
Reg =
nullptr;
2607 const TargetRegisterClass *RC;
2611 const ArgDescriptor WorkGroupIDX =
2619 const ArgDescriptor WorkGroupIDZ =
2621 const ArgDescriptor ClusterWorkGroupIDX =
2623 const ArgDescriptor ClusterWorkGroupIDY =
2625 const ArgDescriptor ClusterWorkGroupIDZ =
2627 const ArgDescriptor ClusterWorkGroupMaxIDX =
2629 const ArgDescriptor ClusterWorkGroupMaxIDY =
2631 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2633 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2636 auto LoadConstant = [&](
unsigned N) {
2640 if (Subtarget->hasArchitectedSGPRs() &&
2647 Reg = &WorkGroupIDX;
2648 RC = &AMDGPU::SReg_32RegClass;
2652 Reg = &WorkGroupIDY;
2653 RC = &AMDGPU::SReg_32RegClass;
2657 Reg = &WorkGroupIDZ;
2658 RC = &AMDGPU::SReg_32RegClass;
2662 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2663 return LoadConstant(0);
2664 Reg = &ClusterWorkGroupIDX;
2665 RC = &AMDGPU::SReg_32RegClass;
2669 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2670 return LoadConstant(0);
2671 Reg = &ClusterWorkGroupIDY;
2672 RC = &AMDGPU::SReg_32RegClass;
2676 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2677 return LoadConstant(0);
2678 Reg = &ClusterWorkGroupIDZ;
2679 RC = &AMDGPU::SReg_32RegClass;
2684 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2685 Reg = &ClusterWorkGroupMaxIDX;
2686 RC = &AMDGPU::SReg_32RegClass;
2691 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2692 Reg = &ClusterWorkGroupMaxIDY;
2693 RC = &AMDGPU::SReg_32RegClass;
2698 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2699 Reg = &ClusterWorkGroupMaxIDZ;
2700 RC = &AMDGPU::SReg_32RegClass;
2704 Reg = &ClusterWorkGroupMaxFlatID;
2705 RC = &AMDGPU::SReg_32RegClass;
2736 for (
unsigned I = 0,
E = Ins.
size(), PSInputNum = 0;
I !=
E; ++
I) {
2740 "vector type argument should have been split");
2745 bool SkipArg = !Arg->
Used && !Info->isPSInputAllocated(PSInputNum);
2753 "unexpected vector split in ps argument type");
2767 Info->markPSInputAllocated(PSInputNum);
2769 Info->markPSInputEnabled(PSInputNum);
2785 if (Info.hasWorkItemIDX()) {
2791 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2795 if (Info.hasWorkItemIDY()) {
2796 assert(Info.hasWorkItemIDX());
2797 if (Subtarget->hasPackedTID()) {
2798 Info.setWorkItemIDY(
2801 unsigned Reg = AMDGPU::VGPR1;
2809 if (Info.hasWorkItemIDZ()) {
2810 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2811 if (Subtarget->hasPackedTID()) {
2812 Info.setWorkItemIDZ(
2815 unsigned Reg = AMDGPU::VGPR2;
2835 if (RegIdx == ArgVGPRs.
size()) {
2842 unsigned Reg = ArgVGPRs[RegIdx];
2854 unsigned NumArgRegs) {
2857 if (RegIdx == ArgSGPRs.
size())
2860 unsigned Reg = ArgSGPRs[RegIdx];
2902 const unsigned Mask = 0x3ff;
2905 if (Info.hasWorkItemIDX()) {
2907 Info.setWorkItemIDX(Arg);
2910 if (Info.hasWorkItemIDY()) {
2912 Info.setWorkItemIDY(Arg);
2915 if (Info.hasWorkItemIDZ())
2927 const unsigned Mask = 0x3ff;
2936 auto &
ArgInfo = Info.getArgInfo();
2948 if (Info.hasImplicitArgPtr())
2956 if (Info.hasWorkGroupIDX())
2959 if (Info.hasWorkGroupIDY())
2962 if (Info.hasWorkGroupIDZ())
2965 if (Info.hasLDSKernelId())
2976 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
2977 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2983 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
2984 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2989 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
2990 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2996 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3002 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
3011 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3016 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
3017 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3022 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
3023 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3038 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3040 bool InPreloadSequence =
true;
3042 bool AlignedForImplictArgs =
false;
3043 unsigned ImplicitArgOffset = 0;
3044 for (
auto &Arg :
F.args()) {
3045 if (!InPreloadSequence || !Arg.hasInRegAttr())
3048 unsigned ArgIdx = Arg.getArgNo();
3051 if (InIdx < Ins.
size() &&
3052 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3055 for (; InIdx < Ins.
size() && Ins[InIdx].isOrigArg() &&
3056 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3058 assert(ArgLocs[ArgIdx].isMemLoc());
3059 auto &ArgLoc = ArgLocs[InIdx];
3061 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3063 unsigned NumAllocSGPRs =
3064 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3067 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
3068 if (!AlignedForImplictArgs) {
3070 alignTo(LastExplicitArgOffset,
3071 Subtarget->getAlignmentForImplicitArgPtr()) -
3072 LastExplicitArgOffset;
3073 AlignedForImplictArgs =
true;
3075 ArgOffset += ImplicitArgOffset;
3079 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3080 assert(InIdx >= 1 &&
"No previous SGPR");
3081 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3082 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3086 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3087 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
3090 InPreloadSequence =
false;
3096 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3098 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3100 if (PreloadRegs->
size() > 1)
3101 RC = &AMDGPU::SGPR_32RegClass;
3102 for (
auto &Reg : *PreloadRegs) {
3108 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3117 if (Info.hasLDSKernelId()) {
3118 Register Reg = Info.addLDSKernelId();
3119 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3128 bool IsShader)
const {
3129 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3130 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3136 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3138 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3142 unsigned NumRequiredSystemSGPRs =
3143 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3144 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3145 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3146 Register Reg = Info.addReservedUserSGPR();
3147 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3152 if (!HasArchitectedSGPRs) {
3153 if (Info.hasWorkGroupIDX()) {
3154 Register Reg = Info.addWorkGroupIDX();
3155 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3159 if (Info.hasWorkGroupIDY()) {
3160 Register Reg = Info.addWorkGroupIDY();
3161 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3165 if (Info.hasWorkGroupIDZ()) {
3166 Register Reg = Info.addWorkGroupIDZ();
3167 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3172 if (Info.hasWorkGroupInfo()) {
3173 Register Reg = Info.addWorkGroupInfo();
3174 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3178 if (Info.hasPrivateSegmentWaveByteOffset()) {
3180 unsigned PrivateSegmentWaveByteOffsetReg;
3183 PrivateSegmentWaveByteOffsetReg =
3184 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3188 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3190 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3193 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3195 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3199 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3200 Info.getNumPreloadedSGPRs() >= 16);
3215 if (HasStackObjects)
3216 Info.setHasNonSpillStackObjects(
true);
3221 HasStackObjects =
true;
3225 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3227 if (!ST.hasFlatScratchEnabled()) {
3228 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3235 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3237 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3247 Info.setScratchRSrcReg(ReservedBufferReg);
3266 if (!
MRI.isLiveIn(AMDGPU::SGPR32)) {
3267 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3274 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3275 if (!
MRI.isLiveIn(
Reg)) {
3276 Info.setStackPtrOffsetReg(
Reg);
3281 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3288 if (ST.getFrameLowering()->hasFP(MF)) {
3289 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3305 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3314 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3315 RC = &AMDGPU::SGPR_64RegClass;
3316 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3317 RC = &AMDGPU::SGPR_32RegClass;
3323 Entry->addLiveIn(*
I);
3328 for (
auto *Exit : Exits)
3330 TII->get(TargetOpcode::COPY), *
I)
3345 bool IsError =
false;
3349 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3367 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3368 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3370 if (!Subtarget->hasFlatScratchEnabled())
3375 !Subtarget->hasArchitectedSGPRs())
3376 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3377 !Info->hasWorkGroupIDZ());
3380 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3398 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3399 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3402 Info->markPSInputAllocated(0);
3403 Info->markPSInputEnabled(0);
3405 if (Subtarget->isAmdPalOS()) {
3414 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3415 if ((PsInputBits & 0x7F) == 0 ||
3416 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3419 }
else if (IsKernel) {
3420 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3432 if (IsKernel && Subtarget->hasKernargPreload())
3436 }
else if (!IsGraphics) {
3441 if (!Subtarget->hasFlatScratchEnabled())
3453 Info->setNumWaveDispatchSGPRs(
3455 Info->setNumWaveDispatchVGPRs(
3457 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3458 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3463 if (IsWholeWaveFunc) {
3465 {MVT::i1, MVT::Other}, Chain);
3477 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3488 if (IsEntryFunc && VA.
isMemLoc()) {
3511 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3515 int64_t OffsetDiff =
Offset - AlignDownOffset;
3522 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3533 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3534 Ins[i].Flags.isSExt(), &Ins[i]);
3542 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3545 if (PreloadRegs.
size() == 1) {
3546 Register VReg =
MRI.getLiveInVirtReg(PreloadRegs[0]);
3551 TRI->getRegSizeInBits(*RC)));
3559 for (
auto Reg : PreloadRegs) {
3566 PreloadRegs.size()),
3583 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3584 Ins[i].Flags.isSExt(), &Ins[i]);
3596 "hidden argument in kernel signature was not preloaded",
3602 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3603 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3623 if (!IsEntryFunc && VA.
isMemLoc()) {
3624 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3635 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3636 RC = &AMDGPU::VGPR_32RegClass;
3637 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3638 RC = &AMDGPU::SGPR_32RegClass;
3658 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3667 Info->setBytesInStackArgArea(StackArgSize);
3669 return Chains.
empty() ? Chain
3678 const Type *RetTy)
const {
3686 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3691 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3692 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3693 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3694 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3717 Info->setIfReturnsVoid(Outs.
empty());
3718 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3737 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3738 ++
I, ++RealRVLocIdx) {
3742 SDValue Arg = OutVals[RealRVLocIdx];
3765 ReadFirstLane, Arg);
3772 if (!Info->isEntryFunction()) {
3778 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3780 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3793 unsigned Opc = AMDGPUISD::ENDPGM;
3795 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3796 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3797 : AMDGPUISD::RET_GLUE;
3902 const auto [OutgoingArg, ArgRC, ArgTy] =
3907 const auto [IncomingArg, IncomingArgRC, Ty] =
3909 assert(IncomingArgRC == ArgRC);
3912 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3920 InputReg = getImplicitArgPtr(DAG,
DL);
3922 std::optional<uint32_t> Id =
3924 if (Id.has_value()) {
3935 if (OutgoingArg->isRegister()) {
3936 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3937 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
3940 unsigned SpecialArgOffset =
3951 auto [OutgoingArg, ArgRC, Ty] =
3954 std::tie(OutgoingArg, ArgRC, Ty) =
3957 std::tie(OutgoingArg, ArgRC, Ty) =
3972 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
3973 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
3974 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
3979 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
3987 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
3997 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
4006 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4007 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4018 : IncomingArgY ? *IncomingArgY
4025 if (OutgoingArg->isRegister()) {
4027 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4053 if (Callee->isDivergent())
4060 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
4064 if (!CallerPreserved)
4067 bool CCMatch = CallerCC == CalleeCC;
4080 if (Arg.hasByValAttr())
4094 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4095 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4104 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4117 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4119 if (!CCVA.isRegLoc())
4124 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4126 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4150enum ChainCallArgIdx {
4172 bool UsesDynamicVGPRs =
false;
4173 if (IsChainCallConv) {
4178 auto RequestedExecIt =
4180 return Arg.OrigArgIndex == 2;
4182 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4184 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4187 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4190 "Haven't popped all the special args");
4193 CLI.
Args[ChainCallArgIdx::Exec];
4194 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4202 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4204 ChainCallSpecialArgs.
push_back(Arg.Node);
4207 PushNodeOrTargetConstant(RequestedExecArg);
4213 if (FlagsValue.
isZero()) {
4214 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4216 "no additional args allowed if flags == 0");
4218 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4222 if (!Subtarget->isWave32()) {
4224 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4227 UsesDynamicVGPRs =
true;
4228 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4229 CLI.
Args.end(), PushNodeOrTargetConstant);
4238 bool IsSibCall =
false;
4252 "unsupported call to variadic function ");
4260 "unsupported required tail call to function ");
4265 Outs, OutVals, Ins, DAG);
4269 "site marked musttail or on llvm.amdgcn.cs.chain");
4276 if (!TailCallOpt && IsTailCall)
4300 if (!Subtarget->hasFlatScratchEnabled())
4321 auto *
TRI = Subtarget->getRegisterInfo();
4328 if (!IsSibCall || IsChainCallConv) {
4329 if (!Subtarget->hasFlatScratchEnabled()) {
4335 RegsToPass.emplace_back(IsChainCallConv
4336 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4337 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4344 const unsigned NumSpecialInputs = RegsToPass.size();
4346 MVT PtrVT = MVT::i32;
4349 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4377 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4385 int32_t
Offset = LocMemOffset;
4392 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4398 ? Flags.getNonZeroByValAlign()
4425 if (Outs[i].Flags.isByVal()) {
4427 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4430 Outs[i].Flags.getNonZeroByValAlign(),
4432 nullptr, std::nullopt, DstInfo,
4438 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4444 if (!MemOpChains.
empty())
4460 unsigned ArgIdx = 0;
4461 for (
auto [Reg, Val] : RegsToPass) {
4462 if (ArgIdx++ >= NumSpecialInputs &&
4463 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4489 if (IsTailCall && !IsSibCall) {
4494 std::vector<SDValue>
Ops({Chain});
4500 Ops.push_back(Callee);
4517 Ops.push_back(Callee);
4528 if (IsChainCallConv)
4533 for (
auto &[Reg, Val] : RegsToPass)
4537 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4538 assert(Mask &&
"Missing call preserved mask for calling convention");
4548 MVT::Glue, GlueOps),
4553 Ops.push_back(InGlue);
4559 unsigned OPC = AMDGPUISD::TC_RETURN;
4562 OPC = AMDGPUISD::TC_RETURN_GFX;
4566 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4567 : AMDGPUISD::TC_RETURN_CHAIN;
4573 if (Info->isWholeWaveFunction())
4574 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4581 Chain =
Call.getValue(0);
4582 InGlue =
Call.getValue(1);
4584 uint64_t CalleePopBytes = NumBytes;
4605 EVT VT =
Op.getValueType();
4619 "Stack grows upwards for AMDGPU");
4621 Chain = BaseAddr.getValue(1);
4623 if (Alignment > StackAlign) {
4625 << Subtarget->getWavefrontSizeLog2();
4626 uint64_t StackAlignMask = ScaledAlignment - 1;
4633 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4639 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4650 DAG.
getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4666 if (
Op.getValueType() != MVT::i32)
4685 assert(
Op.getValueType() == MVT::i32);
4694 Op.getOperand(0), IntrinID, GetRoundBothImm);
4728 SDValue RoundModeTimesNumBits =
4748 TableEntry, EnumOffset);
4764 static_cast<uint32_t>(ConstMode->getZExtValue()),
4776 if (UseReducedTable) {
4782 SDValue RoundModeTimesNumBits =
4802 SDValue RoundModeTimesNumBits =
4811 NewMode = TruncTable;
4820 ReadFirstLaneID, NewMode);
4833 IntrinID, RoundBothImm, NewMode);
4839 if (
Op->isDivergent() &&
4840 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4850 if (Subtarget->hasSafeSmemPrefetch())
4858 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4867 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4868 EVT SrcVT = Src.getValueType();
4877 EVT DstVT =
Op.getValueType();
4886 if (
Op.getValueType() != MVT::i64)
4900 Op.getOperand(0), IntrinID, ModeHwRegImm);
4902 Op.getOperand(0), IntrinID, TrapHwRegImm);
4916 if (
Op.getOperand(1).getValueType() != MVT::i64)
4928 ReadFirstLaneID, NewModeReg);
4930 ReadFirstLaneID, NewTrapReg);
4932 unsigned ModeHwReg =
4935 unsigned TrapHwReg =
4943 IntrinID, ModeHwRegImm, NewModeReg);
4946 IntrinID, TrapHwRegImm, NewTrapReg);
4955 .
Case(
"m0", AMDGPU::M0)
4956 .
Case(
"exec", AMDGPU::EXEC)
4957 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
4958 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
4959 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
4960 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4961 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4966 if (!Subtarget->hasFlatScrRegister() &&
4967 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4969 "\" for subtarget."));
4974 case AMDGPU::EXEC_LO:
4975 case AMDGPU::EXEC_HI:
4976 case AMDGPU::FLAT_SCR_LO:
4977 case AMDGPU::FLAT_SCR_HI:
4982 case AMDGPU::FLAT_SCR:
5001 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
5010static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5032 auto Next = std::next(
I);
5043 MBB.addSuccessor(LoopBB);
5045 return std::pair(LoopBB, RemainderBB);
5052 auto I =
MI.getIterator();
5053 auto E = std::next(
I);
5075 Src->setIsKill(
false);
5085 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5091 Register Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5094 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5118 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5119 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5129 Register PhiExec =
MRI.createVirtualRegister(BoolRC);
5130 Register NewExec =
MRI.createVirtualRegister(BoolRC);
5132 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5133 Register CondReg =
MRI.createVirtualRegister(BoolRC);
5141 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5148 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5152 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5160 MRI.setSimpleHint(NewExec, CondReg);
5162 if (UseGPRIdxMode) {
5164 SGPRIdxReg = CurrentIdxReg;
5166 SGPRIdxReg =
MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5167 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5177 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5208 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5209 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5217 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5219 Register SaveExec =
MRI.createVirtualRegister(BoolXExecRC);
5220 Register TmpExec =
MRI.createVirtualRegister(BoolXExecRC);
5236 InitResultReg, DstReg, PhiReg, TmpExec,
5237 Offset, UseGPRIdxMode, SGPRIdxReg);
5243 LoopBB->removeSuccessor(RemainderBB);
5245 LoopBB->addSuccessor(LandingPad);
5256static std::pair<unsigned, int>
5260 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5265 return std::pair(AMDGPU::sub0,
Offset);
5305 Register Tmp =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5322 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5323 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5332 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5335 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5339 if (UseGPRIdxMode) {
5346 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5359 MI.eraseFromParent();
5368 Register PhiReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5369 Register InitReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5375 UseGPRIdxMode, SGPRIdxReg);
5379 if (UseGPRIdxMode) {
5381 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5383 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5388 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5393 MI.eraseFromParent();
5410 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5420 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5422 if (Idx->
getReg() == AMDGPU::NoRegister) {
5433 MI.eraseFromParent();
5438 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5442 if (UseGPRIdxMode) {
5446 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5455 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5456 TRI.getRegSizeInBits(*VecRC), 32,
false);
5462 MI.eraseFromParent();
5472 Register PhiReg =
MRI.createVirtualRegister(VecRC);
5476 UseGPRIdxMode, SGPRIdxReg);
5479 if (UseGPRIdxMode) {
5481 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5483 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5489 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5490 TRI.getRegSizeInBits(*VecRC), 32,
false);
5491 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5497 MI.eraseFromParent();
5513 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5514 if (ST.hasScalarAddSub64()) {
5515 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5525 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5526 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5529 MI,
MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5531 MI,
MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5534 MI,
MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5536 MI,
MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5538 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5539 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5548 MI.eraseFromParent();
5554 case AMDGPU::S_MIN_U32:
5555 return std::numeric_limits<uint32_t>::max();
5556 case AMDGPU::S_MIN_I32:
5557 return std::numeric_limits<int32_t>::max();
5558 case AMDGPU::S_MAX_U32:
5559 return std::numeric_limits<uint32_t>::min();
5560 case AMDGPU::S_MAX_I32:
5561 return std::numeric_limits<int32_t>::min();
5562 case AMDGPU::V_ADD_F32_e64:
5564 case AMDGPU::V_SUB_F32_e64:
5566 case AMDGPU::S_ADD_I32:
5567 case AMDGPU::S_SUB_I32:
5568 case AMDGPU::S_OR_B32:
5569 case AMDGPU::S_XOR_B32:
5570 return std::numeric_limits<uint32_t>::min();
5571 case AMDGPU::S_AND_B32:
5572 return std::numeric_limits<uint32_t>::max();
5573 case AMDGPU::V_MIN_F32_e64:
5574 case AMDGPU::V_MAX_F32_e64:
5578 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5584 case AMDGPU::V_CMP_LT_U64_e64:
5585 return std::numeric_limits<uint64_t>::max();
5586 case AMDGPU::V_CMP_LT_I64_e64:
5587 return std::numeric_limits<int64_t>::max();
5588 case AMDGPU::V_CMP_GT_U64_e64:
5589 return std::numeric_limits<uint64_t>::min();
5590 case AMDGPU::V_CMP_GT_I64_e64:
5591 return std::numeric_limits<int64_t>::min();
5592 case AMDGPU::V_MIN_F64_e64:
5593 case AMDGPU::V_MAX_F64_e64:
5594 case AMDGPU::V_MIN_NUM_F64_e64:
5595 case AMDGPU::V_MAX_NUM_F64_e64:
5596 return 0x7FF8000000000000;
5597 case AMDGPU::S_ADD_U64_PSEUDO:
5598 case AMDGPU::S_SUB_U64_PSEUDO:
5599 case AMDGPU::S_OR_B64:
5600 case AMDGPU::S_XOR_B64:
5601 return std::numeric_limits<uint64_t>::min();
5602 case AMDGPU::S_AND_B64:
5603 return std::numeric_limits<uint64_t>::max();
5604 case AMDGPU::V_ADD_F64_e64:
5605 case AMDGPU::V_ADD_F64_pseudo_e64:
5606 return 0x8000000000000000;
5609 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5614 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5615 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5616 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5617 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5618 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5619 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5620 Opc == AMDGPU::V_SUB_F32_e64;
5624 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5625 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64 ||
5626 Opc == AMDGPU::V_MIN_F64_e64 ||
Opc == AMDGPU::V_MAX_F64_e64 ||
5627 Opc == AMDGPU::V_MIN_NUM_F64_e64 ||
Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5628 Opc == AMDGPU::V_ADD_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5642 bool isSGPR =
TRI->isSGPRClass(
MRI.getRegClass(SrcReg));
5647 case AMDGPU::S_MIN_U32:
5648 case AMDGPU::S_MIN_I32:
5649 case AMDGPU::V_MIN_F32_e64:
5650 case AMDGPU::S_MAX_U32:
5651 case AMDGPU::S_MAX_I32:
5652 case AMDGPU::V_MAX_F32_e64:
5653 case AMDGPU::S_AND_B32:
5654 case AMDGPU::S_OR_B32: {
5660 case AMDGPU::V_CMP_LT_U64_e64:
5661 case AMDGPU::V_CMP_LT_I64_e64:
5662 case AMDGPU::V_CMP_GT_U64_e64:
5663 case AMDGPU::V_CMP_GT_I64_e64:
5664 case AMDGPU::V_MIN_F64_e64:
5665 case AMDGPU::V_MIN_NUM_F64_e64:
5666 case AMDGPU::V_MAX_F64_e64:
5667 case AMDGPU::V_MAX_NUM_F64_e64:
5668 case AMDGPU::S_AND_B64:
5669 case AMDGPU::S_OR_B64: {
5675 case AMDGPU::S_XOR_B32:
5676 case AMDGPU::S_XOR_B64:
5677 case AMDGPU::S_ADD_I32:
5678 case AMDGPU::S_ADD_U64_PSEUDO:
5679 case AMDGPU::V_ADD_F32_e64:
5680 case AMDGPU::V_ADD_F64_e64:
5681 case AMDGPU::V_ADD_F64_pseudo_e64:
5682 case AMDGPU::S_SUB_I32:
5683 case AMDGPU::S_SUB_U64_PSEUDO:
5684 case AMDGPU::V_SUB_F32_e64: {
5687 Register ExecMask =
MRI.createVirtualRegister(WaveMaskRegClass);
5689 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5691 bool IsWave32 = ST.isWave32();
5692 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5693 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5694 unsigned BitCountOpc =
5695 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5699 auto NewAccumulator =
5704 case AMDGPU::S_XOR_B32:
5705 case AMDGPU::S_XOR_B64: {
5711 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5714 .
addReg(NewAccumulator->getOperand(0).getReg())
5717 if (
Opc == AMDGPU::S_XOR_B32) {
5723 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5725 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5729 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5732 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5734 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5744 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5752 case AMDGPU::S_SUB_I32: {
5753 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5761 .
addReg(NewAccumulator->getOperand(0).getReg());
5764 case AMDGPU::S_ADD_I32: {
5767 .
addReg(NewAccumulator->getOperand(0).getReg());
5770 case AMDGPU::S_ADD_U64_PSEUDO:
5771 case AMDGPU::S_SUB_U64_PSEUDO: {
5772 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5773 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5775 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5777 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5778 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5779 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5781 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5783 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5787 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5790 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5792 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5794 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5797 .
addReg(NewAccumulator->getOperand(0).getReg())
5807 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5809 : NewAccumulator->getOperand(0).getReg();
5820 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5826 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5832 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5839 case AMDGPU::V_ADD_F32_e64:
5840 case AMDGPU::V_ADD_F64_e64:
5841 case AMDGPU::V_ADD_F64_pseudo_e64:
5842 case AMDGPU::V_SUB_F32_e64: {
5845 Register ActiveLanesVreg =
MRI.createVirtualRegister(VregRC);
5846 Register DstVreg =
MRI.createVirtualRegister(VregRC);
5849 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
5850 : AMDGPU::V_CVT_F64_I32_e64),
5852 .
addReg(NewAccumulator->getOperand(0).getReg())
5858 (
Opc == AMDGPU::V_SUB_F32_e64 ||
5859 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
5862 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
5864 ? AMDGPU::V_MUL_F64_pseudo_e64
5865 : AMDGPU::V_MUL_F64_e64;
5875 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5879 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5881 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5883 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
5885 TII->buildExtractSubRegOrImm(
MI,
MRI, DestVregInst->getOperand(0),
5886 VregRC, AMDGPU::sub0, VregSubRC);
5888 TII->buildExtractSubRegOrImm(
MI,
MRI, DestVregInst->getOperand(0),
5889 VregRC, AMDGPU::sub1, VregSubRC);
5898 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5931 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5932 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5933 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5934 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5935 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5936 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5937 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5939 bool IsWave32 = ST.isWave32();
5940 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5941 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5948 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5952 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
5955 BuildMI(BB,
I,
DL,
TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5964 I = ComputeLoop->begin();
5966 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
5970 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
5974 I = ComputeLoop->end();
5977 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5981 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
5987 MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5988 Register DstVreg =
MRI.createVirtualRegister(
MRI.getRegClass(SrcReg));
5990 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_MOV_B32_e32),
6000 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6001 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6010 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6012 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6013 Register LaneValReg =
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6016 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
6018 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
6020 MI,
MRI,
MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
6022 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6026 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6030 auto LaneValue =
BuildMI(*ComputeLoop,
I,
DL,
6031 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
6037 case AMDGPU::S_OR_B64:
6038 case AMDGPU::S_AND_B64:
6039 case AMDGPU::S_XOR_B64: {
6042 .
addReg(LaneValue->getOperand(0).getReg())
6046 case AMDGPU::V_CMP_GT_I64_e64:
6047 case AMDGPU::V_CMP_GT_U64_e64:
6048 case AMDGPU::V_CMP_LT_I64_e64:
6049 case AMDGPU::V_CMP_LT_U64_e64: {
6050 Register LaneMaskReg =
MRI.createVirtualRegister(WaveMaskRegClass);
6052 MRI.createVirtualRegister(WaveMaskRegClass);
6054 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6056 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6058 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
6059 Register AccumulatorVReg =
MRI.createVirtualRegister(VregClass);
6062 VregClass, AMDGPU::sub0, VSubRegClass);
6065 VregClass, AMDGPU::sub1, VSubRegClass);
6066 BuildMI(*ComputeLoop,
I,
DL,
TII->get(TargetOpcode::REG_SEQUENCE),
6073 .
addReg(LaneValue->getOperand(0).getReg())
6074 .
addReg(AccumulatorVReg);
6076 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6077 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6081 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6082 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6083 .
addReg(LaneValue->getOperand(0).getReg())
6087 case AMDGPU::V_MIN_F64_e64:
6088 case AMDGPU::V_MIN_NUM_F64_e64:
6089 case AMDGPU::V_MAX_F64_e64:
6090 case AMDGPU::V_MAX_NUM_F64_e64:
6091 case AMDGPU::V_ADD_F64_e64:
6092 case AMDGPU::V_ADD_F64_pseudo_e64: {
6094 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6096 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6098 TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
6099 Register AccumulatorVReg =
MRI.createVirtualRegister(VregRC);
6100 Register DstVreg =
MRI.createVirtualRegister(VregRC);
6102 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6104 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6105 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::COPY), AccumulatorVReg)
6108 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6113 .
addReg(LaneValue->getOperand(0).getReg())
6119 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32),
6122 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32),
6126 TII->buildExtractSubRegOrImm(Iters,
MRI, DstVregInst->getOperand(0),
6127 VregRC, AMDGPU::sub0, VregSubRC);
6129 TII->buildExtractSubRegOrImm(Iters,
MRI, DstVregInst->getOperand(0),
6130 VregRC, AMDGPU::sub1, VregSubRC);
6131 ReadLaneLo.add(Op1L);
6132 ReadLaneHi.add(Op1H);
6133 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6134 TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
6141 case AMDGPU::S_ADD_U64_PSEUDO:
6142 case AMDGPU::S_SUB_U64_PSEUDO: {
6145 .
addReg(LaneValue->getOperand(0).getReg());
6152 unsigned BITSETOpc =
6153 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6154 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
6160 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6163 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6165 .
addReg(NewActiveBitsReg)
6167 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
6172 MI.eraseFromParent();
6187 switch (
MI.getOpcode()) {
6188 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6190 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6192 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6194 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6196 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6198 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6201 ? AMDGPU::V_MIN_NUM_F64_e64
6202 : AMDGPU::V_MIN_F64_e64);
6203 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6205 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6207 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6209 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6211 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6213 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6216 ? AMDGPU::V_MAX_NUM_F64_e64
6217 : AMDGPU::V_MAX_F64_e64);
6218 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6220 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6222 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6224 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6227 ? AMDGPU::V_ADD_F64_pseudo_e64
6228 : AMDGPU::V_ADD_F64_e64);
6229 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6231 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6233 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6235 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6240 ? AMDGPU::V_ADD_F64_pseudo_e64
6241 : AMDGPU::V_ADD_F64_e64);
6242 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6244 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6246 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6248 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6250 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6252 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6254 case AMDGPU::S_UADDO_PSEUDO:
6255 case AMDGPU::S_USUBO_PSEUDO: {
6261 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6263 : AMDGPU::S_SUB_U32;
6271 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6274 MI.eraseFromParent();
6277 case AMDGPU::S_ADD_U64_PSEUDO:
6278 case AMDGPU::S_SUB_U64_PSEUDO: {
6281 case AMDGPU::V_ADD_U64_PSEUDO:
6282 case AMDGPU::V_SUB_U64_PSEUDO: {
6283 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6289 if (ST.hasAddSubU64Insts()) {
6291 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6292 : AMDGPU::V_SUB_U64_e64),
6297 TII->legalizeOperands(*
I);
6298 MI.eraseFromParent();
6302 if (IsAdd && ST.hasLshlAddU64Inst()) {
6308 TII->legalizeOperands(*
Add);
6309 MI.eraseFromParent();
6313 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6315 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6316 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6318 Register CarryReg =
MRI.createVirtualRegister(CarryRC);
6319 Register DeadCarryReg =
MRI.createVirtualRegister(CarryRC);
6323 : &AMDGPU::VReg_64RegClass;
6326 : &AMDGPU::VReg_64RegClass;
6329 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6331 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6334 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6336 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6339 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6341 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6344 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6351 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6365 TII->legalizeOperands(*LoHalf);
6366 TII->legalizeOperands(*HiHalf);
6367 MI.eraseFromParent();
6370 case AMDGPU::S_ADD_CO_PSEUDO:
6371 case AMDGPU::S_SUB_CO_PSEUDO: {
6382 Register RegOp0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6383 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6388 Register RegOp1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6389 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6393 Register RegOp2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6395 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6400 if (ST.isWave64()) {
6401 if (ST.hasScalarCompareEq64()) {
6408 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6410 MII,
MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6412 MII,
MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6413 Register Src2_32 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6415 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
6429 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6430 ? AMDGPU::S_ADDC_U32
6431 : AMDGPU::S_SUBB_U32;
6436 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6442 MI.eraseFromParent();
6445 case AMDGPU::SI_INIT_M0: {
6448 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6451 MI.eraseFromParent();
6454 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6457 TII->get(AMDGPU::S_CMP_EQ_U32))
6462 case AMDGPU::GET_GROUPSTATICSIZE: {
6466 .
add(
MI.getOperand(0))
6468 MI.eraseFromParent();
6471 case AMDGPU::GET_SHADERCYCLESHILO: {
6484 Register RegHi1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6486 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6487 Register RegLo1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6489 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
6490 Register RegHi2 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6492 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6496 Register RegLo =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6501 .
add(
MI.getOperand(0))
6506 MI.eraseFromParent();
6509 case AMDGPU::SI_INDIRECT_SRC_V1:
6510 case AMDGPU::SI_INDIRECT_SRC_V2:
6511 case AMDGPU::SI_INDIRECT_SRC_V3:
6512 case AMDGPU::SI_INDIRECT_SRC_V4:
6513 case AMDGPU::SI_INDIRECT_SRC_V5:
6514 case AMDGPU::SI_INDIRECT_SRC_V6:
6515 case AMDGPU::SI_INDIRECT_SRC_V7:
6516 case AMDGPU::SI_INDIRECT_SRC_V8:
6517 case AMDGPU::SI_INDIRECT_SRC_V9:
6518 case AMDGPU::SI_INDIRECT_SRC_V10:
6519 case AMDGPU::SI_INDIRECT_SRC_V11:
6520 case AMDGPU::SI_INDIRECT_SRC_V12:
6521 case AMDGPU::SI_INDIRECT_SRC_V16:
6522 case AMDGPU::SI_INDIRECT_SRC_V32:
6524 case AMDGPU::SI_INDIRECT_DST_V1:
6525 case AMDGPU::SI_INDIRECT_DST_V2:
6526 case AMDGPU::SI_INDIRECT_DST_V3:
6527 case AMDGPU::SI_INDIRECT_DST_V4:
6528 case AMDGPU::SI_INDIRECT_DST_V5:
6529 case AMDGPU::SI_INDIRECT_DST_V6:
6530 case AMDGPU::SI_INDIRECT_DST_V7:
6531 case AMDGPU::SI_INDIRECT_DST_V8:
6532 case AMDGPU::SI_INDIRECT_DST_V9:
6533 case AMDGPU::SI_INDIRECT_DST_V10:
6534 case AMDGPU::SI_INDIRECT_DST_V11:
6535 case AMDGPU::SI_INDIRECT_DST_V12:
6536 case AMDGPU::SI_INDIRECT_DST_V16:
6537 case AMDGPU::SI_INDIRECT_DST_V32:
6539 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
6540 case AMDGPU::SI_KILL_I1_PSEUDO:
6542 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
6546 Register SrcCond =
MI.getOperand(3).getReg();
6548 Register DstLo =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6549 Register DstHi =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6550 const auto *CondRC =
TRI->getWaveMaskRegClass();
6551 Register SrcCondCopy =
MRI.createVirtualRegister(CondRC);
6555 : &AMDGPU::VReg_64RegClass;
6558 : &AMDGPU::VReg_64RegClass;
6561 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6563 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6566 MI,
MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6568 MI,
MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6571 MI,
MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6573 MI,
MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6594 MI.eraseFromParent();
6597 case AMDGPU::SI_BR_UNDEF: {
6599 .
add(
MI.getOperand(0));
6601 MI.eraseFromParent();
6604 case AMDGPU::ADJCALLSTACKUP:
6605 case AMDGPU::ADJCALLSTACKDOWN: {
6612 case AMDGPU::SI_CALL_ISEL: {
6613 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
6616 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
6622 MI.eraseFromParent();
6625 case AMDGPU::V_ADD_CO_U32_e32:
6626 case AMDGPU::V_SUB_CO_U32_e32:
6627 case AMDGPU::V_SUBREV_CO_U32_e32: {
6629 unsigned Opc =
MI.getOpcode();
6631 bool NeedClampOperand =
false;
6632 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
6634 NeedClampOperand =
true;
6638 if (
TII->isVOP3(*
I)) {
6641 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
6642 if (NeedClampOperand)
6645 TII->legalizeOperands(*
I);
6647 MI.eraseFromParent();
6650 case AMDGPU::V_ADDC_U32_e32:
6651 case AMDGPU::V_SUBB_U32_e32:
6652 case AMDGPU::V_SUBBREV_U32_e32:
6655 TII->legalizeOperands(
MI);
6657 case AMDGPU::DS_GWS_INIT:
6658 case AMDGPU::DS_GWS_SEMA_BR:
6659 case AMDGPU::DS_GWS_BARRIER:
6660 case AMDGPU::DS_GWS_SEMA_V:
6661 case AMDGPU::DS_GWS_SEMA_P:
6662 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
6670 case AMDGPU::S_SETREG_B32: {
6686 const unsigned SetMask = WidthMask <<
Offset;
6689 unsigned SetDenormOp = 0;
6690 unsigned SetRoundOp = 0;
6698 SetRoundOp = AMDGPU::S_ROUND_MODE;
6699 SetDenormOp = AMDGPU::S_DENORM_MODE;
6701 SetRoundOp = AMDGPU::S_ROUND_MODE;
6703 SetDenormOp = AMDGPU::S_DENORM_MODE;
6706 if (SetRoundOp || SetDenormOp) {
6708 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
6709 unsigned ImmVal = Def->getOperand(1).getImm();
6723 MI.eraseFromParent();
6732 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
6736 case AMDGPU::S_INVERSE_BALLOT_U32:
6737 case AMDGPU::S_INVERSE_BALLOT_U64:
6740 MI.setDesc(
TII->get(AMDGPU::COPY));
6742 case AMDGPU::ENDPGM_TRAP: {
6744 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
6764 MI.eraseFromParent();
6767 case AMDGPU::SIMULATED_TRAP: {
6768 assert(Subtarget->hasPrivEnabledTrap2NopBug());
6770 TII->insertSimulatedTrap(
MRI, *BB,
MI,
MI.getDebugLoc());
6771 MI.eraseFromParent();
6774 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
6775 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
6781 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
6782 Register OriginalExec = Setup->getOperand(0).getReg();
6784 MI.getOperand(0).setReg(OriginalExec);
6821 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6825 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6852 if (!Subtarget->hasMadMacF32Insts())
6853 return Subtarget->hasFastFMAF32();
6859 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6862 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6878 switch (Ty.getScalarSizeInBits()) {
6896 if (Ty.getScalarSizeInBits() == 16)
6898 if (Ty.getScalarSizeInBits() == 32)
6899 return Subtarget->hasMadMacF32Insts() &&
6909 EVT VT =
N->getValueType(0);
6911 return Subtarget->hasMadMacF32Insts() &&
6913 if (VT == MVT::f16) {
6914 return Subtarget->hasMadF16() &&
6929 unsigned Opc =
Op.getOpcode();
6930 EVT VT =
Op.getValueType();
6931 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6932 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6933 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6934 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6935 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6936 VT == MVT::v32bf16);
6952 [[maybe_unused]]
EVT VT =
Op.getValueType();
6954 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6955 VT == MVT::v16i32) &&
6956 "Unexpected ValueType.");
6965 unsigned Opc =
Op.getOpcode();
6966 EVT VT =
Op.getValueType();
6967 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6968 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6969 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6970 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6971 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6972 VT == MVT::v32bf16);
6980 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6982 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6989 unsigned Opc =
Op.getOpcode();
6990 EVT VT =
Op.getValueType();
6991 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6992 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6993 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6994 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6995 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6996 VT == MVT::v32bf16);
7001 : std::pair(Op0, Op0);
7010 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
7012 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
7018 switch (
Op.getOpcode()) {
7022 return LowerBRCOND(
Op, DAG);
7024 return LowerRETURNADDR(
Op, DAG);
7026 return LowerSPONENTRY(
Op, DAG);
7029 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7030 "Load should return a value and a chain");
7034 EVT VT =
Op.getValueType();
7036 return lowerFSQRTF32(
Op, DAG);
7038 return lowerFSQRTF64(
Op, DAG);
7043 return LowerTrig(
Op, DAG);
7045 return LowerSELECT(
Op, DAG);
7047 return LowerFDIV(
Op, DAG);
7049 return LowerFFREXP(
Op, DAG);
7051 return LowerATOMIC_CMP_SWAP(
Op, DAG);
7053 return LowerSTORE(
Op, DAG);
7057 return LowerGlobalAddress(MFI,
Op, DAG);
7060 return LowerExternalSymbol(
Op, DAG);
7062 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
7064 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
7066 return LowerINTRINSIC_VOID(
Op, DAG);
7068 return lowerADDRSPACECAST(
Op, DAG);
7070 return lowerINSERT_SUBVECTOR(
Op, DAG);
7072 return lowerINSERT_VECTOR_ELT(
Op, DAG);
7074 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
7076 return lowerVECTOR_SHUFFLE(
Op, DAG);
7078 return lowerSCALAR_TO_VECTOR(
Op, DAG);
7080 return lowerBUILD_VECTOR(
Op, DAG);
7083 return lowerFP_ROUND(
Op, DAG);
7085 return lowerTRAP(
Op, DAG);
7087 return lowerDEBUGTRAP(
Op, DAG);
7096 return lowerFMINNUM_FMAXNUM(
Op, DAG);
7099 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
7102 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
7105 return lowerFLDEXP(
Op, DAG);
7111 Op.getValueType() == MVT::i16 &&
7112 Op.getOperand(0).getValueType() == MVT::f32) {
7136 return lowerFCOPYSIGN(
Op, DAG);
7138 return lowerMUL(
Op, DAG);
7141 return lowerXMULO(
Op, DAG);
7144 return lowerXMUL_LOHI(
Op, DAG);
7179 EVT FittingLoadVT = LoadVT;
7211SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7214 bool IsIntrinsic)
const {
7217 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7218 EVT LoadVT =
M->getValueType(0);
7220 EVT EquivLoadVT = LoadVT;
7234 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7238 M->getMemoryVT(),
M->getMemOperand());
7249 EVT LoadVT =
M->getValueType(0);
7255 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7256 bool IsTFE =
M->getNumValues() == 3;
7258 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7259 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7260 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7261 : AMDGPUISD::BUFFER_LOAD;
7264 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7269 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7273 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7274 M->getMemOperand(), DAG);
7278 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7280 M->getMemOperand(), DAG);
7288 EVT VT =
N->getValueType(0);
7289 unsigned CondCode =
N->getConstantOperandVal(3);
7300 EVT CmpVT =
LHS.getValueType();
7301 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7302 unsigned PromoteOp =
7322 EVT VT =
N->getValueType(0);
7324 unsigned CondCode =
N->getConstantOperandVal(3);
7333 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7342 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7351 EVT VT =
N->getValueType(0);
7375 Exec = AMDGPU::EXEC_LO;
7377 Exec = AMDGPU::EXEC;
7394 EVT VT =
N->getValueType(0);
7396 unsigned IID =
N->getConstantOperandVal(0);
7397 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7398 IID == Intrinsic::amdgcn_permlanex16;
7399 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7400 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7404 unsigned SplitSize = 32;
7405 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7406 ST->hasDPALU_DPP() &&
7414 case Intrinsic::amdgcn_permlane16:
7415 case Intrinsic::amdgcn_permlanex16:
7416 case Intrinsic::amdgcn_update_dpp:
7421 case Intrinsic::amdgcn_writelane:
7424 case Intrinsic::amdgcn_readlane:
7425 case Intrinsic::amdgcn_set_inactive:
7426 case Intrinsic::amdgcn_set_inactive_chain_arg:
7427 case Intrinsic::amdgcn_mov_dpp8:
7430 case Intrinsic::amdgcn_readfirstlane:
7431 case Intrinsic::amdgcn_permlane64:
7439 std::reverse(Operands.
begin(), Operands.
end());
7441 if (
SDNode *GL =
N->getGluedNode()) {
7443 GL = GL->getOperand(0).getNode();
7453 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7454 IID == Intrinsic::amdgcn_mov_dpp8 ||
7455 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7456 Src1 =
N->getOperand(2);
7457 if (IID == Intrinsic::amdgcn_writelane ||
7458 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7459 Src2 =
N->getOperand(3);
7462 if (ValSize == SplitSize) {
7472 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7477 if (IID == Intrinsic::amdgcn_writelane) {
7482 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7484 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7487 if (ValSize % SplitSize != 0)
7491 EVT VT =
N->getValueType(0);
7495 unsigned NumOperands =
N->getNumOperands();
7497 SDNode *GL =
N->getGluedNode();
7502 for (
unsigned i = 0; i != NE; ++i) {
7503 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7505 SDValue Operand =
N->getOperand(j);
7514 Operands[j] = Operand;
7519 Operands[NumOperands - 1] =
7535 if (SplitSize == 32) {
7537 return unrollLaneOp(LaneOp.
getNode());
7543 unsigned SubVecNumElt =
7547 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7548 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7552 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7557 if (IID == Intrinsic::amdgcn_writelane)
7562 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7563 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7564 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7565 EltIdx += SubVecNumElt;
7579 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7582 if (IID == Intrinsic::amdgcn_writelane)
7585 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7592 EVT VT =
N->getValueType(0);
7610 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
7614 Operands.
append(IntrinArgs);
7620 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7621 {ShiftedIndex, ValueI32});
7631 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7632 {ValueI32, PoisonVal});
7633 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
7634 {ShiftedIndex, PoisonVal});
7637 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
7640 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
7641 {WWMIndex, WWMValue});
7642 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
7643 MVT::i32, {WWMIndex, Swapped});
7645 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
7653 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
7661 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
7671 switch (
N->getOpcode()) {
7683 unsigned IID =
N->getConstantOperandVal(0);
7685 case Intrinsic::amdgcn_make_buffer_rsrc:
7686 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7688 case Intrinsic::amdgcn_cvt_pkrtz: {
7693 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7697 case Intrinsic::amdgcn_cvt_pknorm_i16:
7698 case Intrinsic::amdgcn_cvt_pknorm_u16:
7699 case Intrinsic::amdgcn_cvt_pk_i16:
7700 case Intrinsic::amdgcn_cvt_pk_u16: {
7706 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7707 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7708 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7709 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7710 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7711 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7713 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7715 EVT VT =
N->getValueType(0);
7724 case Intrinsic::amdgcn_s_buffer_load: {
7730 if (!Subtarget->hasScalarSubwordLoads())
7736 EVT VT =
Op.getValueType();
7737 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7749 if (!
Offset->isDivergent()) {
7768 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7773 case Intrinsic::amdgcn_dead: {
7774 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7785 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7786 Results.push_back(Res.getOperand(
I));
7790 Results.push_back(Res.getValue(1));
7799 EVT VT =
N->getValueType(0);
7804 EVT SelectVT = NewVT;
7805 if (NewVT.
bitsLT(MVT::i32)) {
7808 SelectVT = MVT::i32;
7814 if (NewVT != SelectVT)
7820 if (
N->getValueType(0) != MVT::v2f16)
7832 if (
N->getValueType(0) != MVT::v2f16)
7844 if (
N->getValueType(0) != MVT::f16)
7859 if (U.get() !=
Value)
7862 if (U.getUser()->getOpcode() == Opcode)
7868unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7871 case Intrinsic::amdgcn_if:
7872 return AMDGPUISD::IF;
7873 case Intrinsic::amdgcn_else:
7874 return AMDGPUISD::ELSE;
7875 case Intrinsic::amdgcn_loop:
7876 return AMDGPUISD::LOOP;
7877 case Intrinsic::amdgcn_end_cf:
7897 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7924 SDNode *Intr = BRCOND.getOperand(1).getNode();
7941 Intr =
LHS.getNode();
7949 assert(BR &&
"brcond missing unconditional branch user");
7954 unsigned CFNode = isCFIntrinsic(Intr);
7974 Ops.push_back(Target);
7997 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
8016 MVT VT =
Op.getSimpleValueType();
8019 if (
Op.getConstantOperandVal(0) != 0)
8023 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8025 if (
Info->isEntryFunction())
8042 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
8056 return Op.getValueType().bitsLE(VT)
8064 EVT DstVT =
Op.getValueType();
8071 unsigned Opc =
Op.getOpcode();
8083 EVT SrcVT = Src.getValueType();
8084 EVT DstVT =
Op.getValueType();
8087 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8090 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8097 if (DstVT == MVT::f16) {
8102 if (!Subtarget->has16BitInsts()) {
8107 if (
Op->getFlags().hasApproximateFuncs()) {
8118 "custom lower FP_ROUND for f16 or bf16");
8119 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8131 EVT VT =
Op.getValueType();
8133 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8134 bool IsIEEEMode =
Info->getMode().IEEE;
8143 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8150SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8152 EVT VT =
Op.getValueType();
8154 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8155 bool IsIEEEMode =
Info->getMode().IEEE;
8160 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8168 EVT VT =
Op.getValueType();
8172 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8173 !Subtarget->hasMinimum3Maximum3F16() &&
8174 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8175 "should not need to widen f16 minimum/maximum to v2f16");
8189 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8197 EVT VT =
Op.getValueType();
8201 EVT ExpVT =
Exp.getValueType();
8202 if (ExpVT == MVT::i16)
8223 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8230 switch (
Op->getOpcode()) {
8260 DAGCombinerInfo &DCI)
const {
8261 const unsigned Opc =
Op.getOpcode();
8269 :
Op->getOperand(0).getValueType();
8270 auto &DAG = DCI.DAG;
8273 if (DCI.isBeforeLegalizeOps() ||
8281 LHS =
Op->getOperand(1);
8282 RHS =
Op->getOperand(2);
8284 LHS =
Op->getOperand(0);
8285 RHS =
Op->getOperand(1);
8324 if (MagVT == SignVT)
8341 EVT VT =
Op.getValueType();
8347 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8374 if (
Op->isDivergent())
8387 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8389 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8392 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8394 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8400 EVT VT =
Op.getValueType();
8407 const APInt &
C = RHSC->getAPIntValue();
8409 if (
C.isPowerOf2()) {
8411 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8438 if (
Op->isDivergent()) {
8442 if (Subtarget->hasSMulHi()) {
8453 if (!Subtarget->hasTrapHandler() ||
8455 return lowerTrapEndpgm(
Op, DAG);
8457 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8458 : lowerTrapHsaQueuePtr(
Op, DAG);
8464 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8468SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8470 ImplicitParameter Param)
const {
8474 MachinePointerInfo PtrInfo =
8491 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8494 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8497 if (UserSGPR == AMDGPU::NoRegister) {
8514 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8523 if (Subtarget->hasPrivEnabledTrap2NopBug())
8524 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8528 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8536 if (!Subtarget->hasTrapHandler() ||
8540 "debugtrap handler not supported",
8548 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8551SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8553 if (Subtarget->hasApertureRegs()) {
8555 ? AMDGPU::SRC_SHARED_BASE
8556 : AMDGPU::SRC_PRIVATE_BASE;
8557 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8558 !Subtarget->hasGloballyAddressableScratch()) &&
8559 "Cannot use src_private_base with globally addressable scratch!");
8580 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8584 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8586 if (UserSGPR == AMDGPU::NoRegister) {
8631 const AMDGPUTargetMachine &TM =
8634 unsigned DestAS, SrcAS;
8636 bool IsNonNull =
false;
8638 SrcAS = ASC->getSrcAddressSpace();
8639 Src = ASC->getOperand(0);
8640 DestAS = ASC->getDestAddressSpace();
8643 Op.getConstantOperandVal(0) ==
8644 Intrinsic::amdgcn_addrspacecast_nonnull);
8645 Src =
Op->getOperand(1);
8646 SrcAS =
Op->getConstantOperandVal(2);
8647 DestAS =
Op->getConstantOperandVal(3);
8660 Subtarget->hasGloballyAddressableScratch()) {
8665 AMDGPU::S_MOV_B32, SL, MVT::i32,
8666 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8674 unsigned NullVal = TM.getNullPointerValue(DestAS);
8689 Subtarget->hasGloballyAddressableScratch()) {
8698 if (Subtarget->isWave64())
8704 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8712 AMDGPU::S_MOV_B64, SL, MVT::i64,
8713 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8715 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8717 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8725 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8737 Op.getValueType() == MVT::i64) {
8738 const SIMachineFunctionInfo *
Info =
8740 if (
Info->get32BitAddressHighBits() == 0)
8749 Src.getValueType() == MVT::i64)
8777 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8782 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8784 MVT::i32, InsNumElts / 2);
8789 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8791 if (InsNumElts == 2) {
8804 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8827 if (NumElts == 4 && EltSize == 16 && KIdx) {
8838 unsigned Idx = KIdx->getZExtValue();
8839 bool InsertLo = Idx < 2;
8843 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8849 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8862 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8897 EVT ResultVT =
Op.getValueType();
8910 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8913 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8917 if (VecSize == 128) {
8925 }
else if (VecSize == 256) {
8928 for (
unsigned P = 0;
P < 4; ++
P) {
8934 Parts[0], Parts[1]));
8936 Parts[2], Parts[3]));
8942 for (
unsigned P = 0;
P < 8; ++
P) {
8949 Parts[0], Parts[1], Parts[2], Parts[3]));
8952 Parts[4], Parts[5], Parts[6], Parts[7]));
8972 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8987 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8997 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9002 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9003 !(Mask[Elt + 1] & 1);
9009 EVT ResultVT =
Op.getValueType();
9012 const int NewSrcNumElts = 2;
9014 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
9030 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
9052 if (ShouldUseConsecutiveExtract &&
9055 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9056 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9068 if (Idx0 >= SrcNumElts) {
9073 if (Idx1 >= SrcNumElts) {
9078 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9079 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9087 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9088 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9093 if (SubVec0 != SubVec1) {
9094 NewMaskIdx1 += NewSrcNumElts;
9101 {NewMaskIdx0, NewMaskIdx1});
9106 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9107 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9108 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9109 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9128 EVT ResultVT =
Op.getValueType();
9144 EVT VT =
Op.getValueType();
9146 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9147 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9181 for (
unsigned P = 0;
P < NumParts; ++
P) {
9183 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9202 if (!Subtarget->isAmdHsaOS())
9245 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9254 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9262 EVT PtrVT =
Op.getValueType();
9264 const GlobalValue *GV = GSD->
getGlobal();
9278 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9293 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9296 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9297 if (Subtarget->has64BitLiterals()) {
9328 MachinePointerInfo PtrInfo =
9341 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9365 SDValue Param = lowerKernargMemParameter(
9376 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9384 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9392 unsigned NumElts = Elts.
size();
9394 if (NumElts <= 12) {
9403 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9409 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9419 EVT SrcVT = Src.getValueType();
9440 bool Unpacked,
bool IsD16,
int DMaskPop,
9441 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9445 EVT ReqRetVT = ResultTypes[0];
9447 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9448 ? (ReqRetNumElts + 1) / 2
9451 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9462 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9473 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9475 NumDataDwords - MaskPopDwords);
9480 EVT LegalReqRetVT = ReqRetVT;
9482 if (!
Data.getValueType().isInteger())
9484 Data.getValueType().changeTypeToInteger(),
Data);
9505 if (Result->getNumValues() == 1)
9512 SDValue *LWE,
bool &IsTexFail) {
9532 unsigned DimIdx,
unsigned EndIdx,
9533 unsigned NumGradients) {
9535 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9543 if (((
I + 1) >= EndIdx) ||
9544 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9545 I == DimIdx + NumGradients - 1))) {
9567 !
Op.getNode()->hasAnyUseOfValue(0))
9569 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9579 ResultTypes.erase(&ResultTypes[0]);
9585 int NumVDataDwords = 0;
9586 bool AdjustRetType =
false;
9587 bool IsAtomicPacked16Bit =
false;
9590 const unsigned ArgOffset = WithChain ? 2 : 1;
9593 unsigned DMaskLanes = 0;
9595 if (BaseOpcode->
Atomic) {
9596 VData =
Op.getOperand(2);
9598 IsAtomicPacked16Bit =
9599 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9600 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9601 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9602 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9613 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9615 DMask = Is64Bit ? 0xf : 0x3;
9616 NumVDataDwords = Is64Bit ? 4 : 2;
9618 DMask = Is64Bit ? 0x3 : 0x1;
9619 NumVDataDwords = Is64Bit ? 2 : 1;
9622 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9625 if (BaseOpcode->
Store) {
9626 VData =
Op.getOperand(2);
9630 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9634 VData = handleD16VData(VData, DAG,
true);
9637 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9638 }
else if (!BaseOpcode->
NoReturn) {
9643 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9651 (!LoadVT.
isVector() && DMaskLanes > 1))
9657 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9658 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9659 NumVDataDwords = (DMaskLanes + 1) / 2;
9661 NumVDataDwords = DMaskLanes;
9663 AdjustRetType =
true;
9667 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9674 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9675 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9677 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9679 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9680 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9684 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9690 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9694 "Bias needs to be converted to 16 bit in A16 mode");
9699 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9703 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9704 "require 16 bit args for both gradients and addresses");
9709 if (!
ST->hasA16()) {
9710 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9711 "support 16 bit addresses\n");
9721 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9723 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9725 IntrOpcode = G16MappingInfo->
G16;
9748 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9766 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9767 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9768 const bool UseNSA =
ST->hasNSAEncoding() &&
9769 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9770 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9771 const bool UsePartialNSA =
9772 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9775 if (UsePartialNSA) {
9777 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9778 }
else if (!UseNSA) {
9788 uint64_t UnormConst =
9789 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9791 Unorm = UnormConst ? True : False;
9797 bool IsTexFail =
false;
9798 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9809 NumVDataDwords += 1;
9810 AdjustRetType =
true;
9815 if (AdjustRetType) {
9818 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9827 MVT::i32, NumVDataDwords)
9830 ResultTypes[0] = NewVT;
9831 if (ResultTypes.size() == 3) {
9835 ResultTypes.erase(&ResultTypes[1]);
9849 Ops.push_back(VData);
9850 if (UsePartialNSA) {
9852 Ops.push_back(VAddr);
9856 Ops.push_back(VAddr);
9859 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9861 Ops.push_back(Rsrc);
9866 Ops.push_back(Samp);
9871 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9872 Ops.push_back(Unorm);
9874 Ops.push_back(IsA16 &&
9875 ST->hasFeature(AMDGPU::FeatureR128A16)
9879 Ops.push_back(IsA16 ? True : False);
9881 if (!Subtarget->hasGFX90AInsts())
9886 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9889 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9892 Ops.push_back(DimInfo->
DA ? True : False);
9894 Ops.push_back(IsD16 ? True : False);
9896 Ops.push_back(
Op.getOperand(0));
9898 int NumVAddrDwords =
9904 NumVDataDwords, NumVAddrDwords);
9905 }
else if (IsGFX11Plus) {
9907 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9908 : AMDGPU::MIMGEncGfx11Default,
9909 NumVDataDwords, NumVAddrDwords);
9910 }
else if (IsGFX10Plus) {
9912 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9913 : AMDGPU::MIMGEncGfx10Default,
9914 NumVDataDwords, NumVAddrDwords);
9916 if (Subtarget->hasGFX90AInsts()) {
9918 NumVDataDwords, NumVAddrDwords);
9922 "requested image instruction is not supported on this GPU",
9927 for (EVT VT : OrigResultTypes) {
9928 if (VT == MVT::Other)
9929 RetValues[Idx++] =
Op.getOperand(0);
9940 NumVDataDwords, NumVAddrDwords);
9943 NumVDataDwords, NumVAddrDwords);
9950 MachineMemOperand *MemRef = MemOp->getMemOperand();
9969 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9970 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9983 MachinePointerInfo(),
9988 if (!
Offset->isDivergent()) {
9995 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10004 !Subtarget->hasScalarDwordx3Loads()) {
10008 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
10031 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10033 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
10037 unsigned NumLoads = 1;
10043 if (NumElts == 8 || NumElts == 16) {
10044 NumLoads = NumElts / 4;
10048 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
10053 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
10055 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
10056 for (
unsigned i = 0; i < NumLoads; ++i) {
10058 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
10059 LoadVT, MMO, DAG));
10062 if (NumElts == 8 || NumElts == 16)
10070 if (!Subtarget->hasArchitectedSGPRs())
10075 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10082 unsigned Width)
const {
10084 using namespace AMDGPU::Hwreg;
10086 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10125 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10127 EVT VT =
Op.getValueType();
10129 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
10133 switch (IntrinsicID) {
10134 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10137 return getPreloadedValue(DAG, *MFI, VT,
10140 case Intrinsic::amdgcn_dispatch_ptr:
10141 case Intrinsic::amdgcn_queue_ptr: {
10142 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10144 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10145 DL.getDebugLoc()));
10149 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10152 return getPreloadedValue(DAG, *MFI, VT, RegID);
10154 case Intrinsic::amdgcn_implicitarg_ptr: {
10156 return getImplicitArgPtr(DAG,
DL);
10157 return getPreloadedValue(DAG, *MFI, VT,
10160 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10166 return getPreloadedValue(DAG, *MFI, VT,
10169 case Intrinsic::amdgcn_dispatch_id: {
10172 case Intrinsic::amdgcn_rcp:
10173 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
10174 case Intrinsic::amdgcn_rsq:
10175 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
10176 case Intrinsic::amdgcn_rsq_legacy:
10180 case Intrinsic::amdgcn_rcp_legacy:
10183 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
10184 case Intrinsic::amdgcn_rsq_clamp: {
10186 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
10198 case Intrinsic::r600_read_ngroups_x:
10199 if (Subtarget->isAmdHsaOS())
10202 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10205 case Intrinsic::r600_read_ngroups_y:
10206 if (Subtarget->isAmdHsaOS())
10209 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10212 case Intrinsic::r600_read_ngroups_z:
10213 if (Subtarget->isAmdHsaOS())
10216 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10219 case Intrinsic::r600_read_local_size_x:
10220 if (Subtarget->isAmdHsaOS())
10223 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10225 case Intrinsic::r600_read_local_size_y:
10226 if (Subtarget->isAmdHsaOS())
10229 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10231 case Intrinsic::r600_read_local_size_z:
10232 if (Subtarget->isAmdHsaOS())
10235 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10237 case Intrinsic::amdgcn_workgroup_id_x:
10238 return lowerWorkGroupId(DAG, *MFI, VT,
10242 case Intrinsic::amdgcn_workgroup_id_y:
10243 return lowerWorkGroupId(DAG, *MFI, VT,
10247 case Intrinsic::amdgcn_workgroup_id_z:
10248 return lowerWorkGroupId(DAG, *MFI, VT,
10252 case Intrinsic::amdgcn_cluster_id_x:
10253 return Subtarget->hasClusters()
10254 ? getPreloadedValue(DAG, *MFI, VT,
10256 : DAG.getPOISON(VT);
10257 case Intrinsic::amdgcn_cluster_id_y:
10258 return Subtarget->hasClusters()
10259 ? getPreloadedValue(DAG, *MFI, VT,
10262 case Intrinsic::amdgcn_cluster_id_z:
10263 return Subtarget->hasClusters()
10264 ? getPreloadedValue(DAG, *MFI, VT,
10267 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10268 return Subtarget->hasClusters()
10269 ? getPreloadedValue(
10273 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10274 return Subtarget->hasClusters()
10275 ? getPreloadedValue(
10279 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10280 return Subtarget->hasClusters()
10281 ? getPreloadedValue(
10285 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10286 return Subtarget->hasClusters()
10289 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10290 return Subtarget->hasClusters()
10291 ? getPreloadedValue(
10295 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10296 return Subtarget->hasClusters()
10297 ? getPreloadedValue(
10301 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10302 return Subtarget->hasClusters()
10303 ? getPreloadedValue(
10307 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10308 return Subtarget->hasClusters()
10309 ? getPreloadedValue(
10313 case Intrinsic::amdgcn_wave_id:
10314 return lowerWaveID(DAG,
Op);
10315 case Intrinsic::amdgcn_lds_kernel_id: {
10317 return getLDSKernelId(DAG,
DL);
10318 return getPreloadedValue(DAG, *MFI, VT,
10321 case Intrinsic::amdgcn_workitem_id_x:
10322 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10323 case Intrinsic::amdgcn_workitem_id_y:
10324 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10325 case Intrinsic::amdgcn_workitem_id_z:
10326 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10327 case Intrinsic::amdgcn_wavefrontsize:
10329 SDLoc(
Op), MVT::i32);
10330 case Intrinsic::amdgcn_s_buffer_load: {
10331 unsigned CPol =
Op.getConstantOperandVal(3);
10338 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10339 Op.getOperand(3), DAG);
10341 case Intrinsic::amdgcn_fdiv_fast:
10342 return lowerFDIV_FAST(
Op, DAG);
10343 case Intrinsic::amdgcn_sin:
10344 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10346 case Intrinsic::amdgcn_cos:
10347 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10349 case Intrinsic::amdgcn_mul_u24:
10350 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10352 case Intrinsic::amdgcn_mul_i24:
10353 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10356 case Intrinsic::amdgcn_log_clamp: {
10362 case Intrinsic::amdgcn_fract:
10363 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10365 case Intrinsic::amdgcn_class:
10366 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10368 case Intrinsic::amdgcn_div_fmas:
10369 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10370 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10372 case Intrinsic::amdgcn_div_fixup:
10373 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10374 Op.getOperand(2),
Op.getOperand(3));
10376 case Intrinsic::amdgcn_div_scale: {
10382 SDValue Denominator =
Op.getOperand(2);
10389 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10391 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10392 Denominator, Numerator);
10394 case Intrinsic::amdgcn_icmp: {
10396 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10397 Op.getConstantOperandVal(2) == 0 &&
10402 case Intrinsic::amdgcn_fcmp: {
10405 case Intrinsic::amdgcn_ballot:
10407 case Intrinsic::amdgcn_fmed3:
10408 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10409 Op.getOperand(2),
Op.getOperand(3));
10410 case Intrinsic::amdgcn_fdot2:
10411 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10412 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10413 case Intrinsic::amdgcn_fmul_legacy:
10414 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10416 case Intrinsic::amdgcn_sffbh:
10417 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10418 case Intrinsic::amdgcn_sbfe:
10419 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10420 Op.getOperand(2),
Op.getOperand(3));
10421 case Intrinsic::amdgcn_ubfe:
10422 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10423 Op.getOperand(2),
Op.getOperand(3));
10424 case Intrinsic::amdgcn_cvt_pkrtz:
10425 case Intrinsic::amdgcn_cvt_pknorm_i16:
10426 case Intrinsic::amdgcn_cvt_pknorm_u16:
10427 case Intrinsic::amdgcn_cvt_pk_i16:
10428 case Intrinsic::amdgcn_cvt_pk_u16: {
10430 EVT VT =
Op.getValueType();
10433 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10434 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10435 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10436 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10437 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10438 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10439 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10440 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10442 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10445 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10448 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10451 case Intrinsic::amdgcn_fmad_ftz:
10452 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10453 Op.getOperand(2),
Op.getOperand(3));
10455 case Intrinsic::amdgcn_if_break:
10457 Op->getOperand(1),
Op->getOperand(2)),
10460 case Intrinsic::amdgcn_groupstaticsize: {
10466 const GlobalValue *GV =
10472 case Intrinsic::amdgcn_is_shared:
10473 case Intrinsic::amdgcn_is_private: {
10480 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10484 Subtarget->hasGloballyAddressableScratch()) {
10487 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10488 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10497 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10500 case Intrinsic::amdgcn_perm:
10501 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10502 Op.getOperand(2),
Op.getOperand(3));
10503 case Intrinsic::amdgcn_reloc_constant: {
10513 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10514 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10515 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10516 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10517 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10518 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10519 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10520 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10521 if (
Op.getOperand(4).getValueType() == MVT::i32)
10527 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10528 Op.getOperand(3), IndexKeyi32);
10530 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10531 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10532 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10533 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10534 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10535 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10536 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10537 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10538 if (
Op.getOperand(4).getValueType() == MVT::i64)
10543 Op.getOperand(4).getValueType() == MVT::v2i32
10547 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10548 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10549 Op.getOperand(6)});
10551 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10552 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10553 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10554 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10555 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10556 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10557 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10560 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10565 Op.getOperand(6).getValueType().isVector()
10569 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10570 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10571 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10572 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10573 Args.push_back(
Op.getOperand(9));
10576 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10577 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10578 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10579 if (
Op.getOperand(6).getValueType() == MVT::i32)
10585 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10586 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10587 IndexKeyi32, Op.getOperand(7)});
10589 case Intrinsic::amdgcn_addrspacecast_nonnull:
10590 return lowerADDRSPACECAST(
Op, DAG);
10591 case Intrinsic::amdgcn_readlane:
10592 case Intrinsic::amdgcn_readfirstlane:
10593 case Intrinsic::amdgcn_writelane:
10594 case Intrinsic::amdgcn_permlane16:
10595 case Intrinsic::amdgcn_permlanex16:
10596 case Intrinsic::amdgcn_permlane64:
10597 case Intrinsic::amdgcn_set_inactive:
10598 case Intrinsic::amdgcn_set_inactive_chain_arg:
10599 case Intrinsic::amdgcn_mov_dpp8:
10600 case Intrinsic::amdgcn_update_dpp:
10602 case Intrinsic::amdgcn_dead: {
10604 for (
const EVT ValTy :
Op.getNode()->values())
10608 case Intrinsic::amdgcn_wave_shuffle:
10611 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10613 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10623 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
10624 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10630 unsigned NewOpcode)
const {
10634 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10635 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10653 M->getMemOperand());
10658 unsigned NewOpcode)
const {
10662 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10663 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10681 M->getMemOperand());
10686 unsigned IntrID =
Op.getConstantOperandVal(1);
10690 case Intrinsic::amdgcn_ds_ordered_add:
10691 case Intrinsic::amdgcn_ds_ordered_swap: {
10696 unsigned IndexOperand =
M->getConstantOperandVal(7);
10697 unsigned WaveRelease =
M->getConstantOperandVal(8);
10698 unsigned WaveDone =
M->getConstantOperandVal(9);
10700 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10701 IndexOperand &= ~0x3f;
10702 unsigned CountDw = 0;
10705 CountDw = (IndexOperand >> 24) & 0xf;
10706 IndexOperand &= ~(0xf << 24);
10708 if (CountDw < 1 || CountDw > 4) {
10711 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10712 DL.getDebugLoc()));
10717 if (IndexOperand) {
10720 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10723 if (WaveDone && !WaveRelease) {
10727 Fn,
"ds_ordered_count: wave_done requires wave_release",
10728 DL.getDebugLoc()));
10731 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10732 unsigned ShaderType =
10734 unsigned Offset0 = OrderedCountIndex << 2;
10735 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10738 Offset1 |= (CountDw - 1) << 6;
10741 Offset1 |= ShaderType << 2;
10743 unsigned Offset = Offset0 | (Offset1 << 8);
10750 M->getVTList(),
Ops,
M->getMemoryVT(),
10751 M->getMemOperand());
10753 case Intrinsic::amdgcn_raw_buffer_load:
10754 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10755 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10756 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10757 case Intrinsic::amdgcn_raw_buffer_load_format:
10758 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10759 const bool IsFormat =
10760 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10761 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10763 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10764 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10778 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10780 case Intrinsic::amdgcn_struct_buffer_load:
10781 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10782 case Intrinsic::amdgcn_struct_buffer_load_format:
10783 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10784 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10785 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10786 const bool IsFormat =
10787 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10788 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10790 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10791 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10806 case Intrinsic::amdgcn_raw_tbuffer_load:
10807 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10809 EVT LoadVT =
Op.getValueType();
10810 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10811 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10827 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10829 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10830 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10833 case Intrinsic::amdgcn_struct_tbuffer_load:
10834 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10836 EVT LoadVT =
Op.getValueType();
10837 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10838 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10854 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10856 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10857 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10860 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10861 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10862 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10863 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10864 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10865 return lowerStructBufferAtomicIntrin(
Op, DAG,
10866 AMDGPUISD::BUFFER_ATOMIC_FADD);
10867 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10868 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10869 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10870 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10871 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10872 return lowerStructBufferAtomicIntrin(
Op, DAG,
10873 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10874 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10875 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10876 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10877 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10878 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10879 return lowerStructBufferAtomicIntrin(
Op, DAG,
10880 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10881 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10883 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10884 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10885 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10886 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10887 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10889 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10890 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10891 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10892 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10893 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10894 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10895 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10896 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10898 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10899 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10900 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10901 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10902 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10903 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10904 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10905 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10906 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10907 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10908 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10909 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10910 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10911 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10913 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10914 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10915 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10916 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10917 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10918 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10919 return lowerStructBufferAtomicIntrin(
Op, DAG,
10920 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10921 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10922 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10923 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10924 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10925 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10926 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10927 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10928 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10929 return lowerStructBufferAtomicIntrin(
Op, DAG,
10930 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10931 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10932 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10933 return lowerStructBufferAtomicIntrin(
Op, DAG,
10934 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10935 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10936 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10937 return lowerStructBufferAtomicIntrin(
Op, DAG,
10938 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10939 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10940 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10941 return lowerStructBufferAtomicIntrin(
Op, DAG,
10942 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10943 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10945 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10946 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10947 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10948 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10949 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10950 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10951 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10952 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10953 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10954 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10955 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10956 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10957 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10958 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10959 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10960 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10961 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10962 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10963 return lowerStructBufferAtomicIntrin(
Op, DAG,
10964 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10965 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10966 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10967 return lowerRawBufferAtomicIntrin(
Op, DAG,
10968 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10969 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10970 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10971 return lowerStructBufferAtomicIntrin(
Op, DAG,
10972 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10973 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10974 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10975 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
10976 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10990 EVT VT =
Op.getValueType();
10994 Op->getVTList(),
Ops, VT,
10995 M->getMemOperand());
10997 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10998 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10999 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
11000 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
11014 EVT VT =
Op.getValueType();
11018 Op->getVTList(),
Ops, VT,
11019 M->getMemOperand());
11021 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11022 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11024 SDValue NodePtr =
M->getOperand(2);
11025 SDValue RayExtent =
M->getOperand(3);
11026 SDValue InstanceMask =
M->getOperand(4);
11027 SDValue RayOrigin =
M->getOperand(5);
11028 SDValue RayDir =
M->getOperand(6);
11030 SDValue TDescr =
M->getOperand(8);
11035 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11040 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11041 const unsigned NumVDataDwords = 10;
11042 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11044 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11045 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11046 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11050 Ops.push_back(NodePtr);
11053 {DAG.getBitcast(MVT::i32, RayExtent),
11054 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11055 Ops.push_back(RayOrigin);
11056 Ops.push_back(RayDir);
11057 Ops.push_back(Offsets);
11058 Ops.push_back(TDescr);
11059 Ops.push_back(
M->getChain());
11062 MachineMemOperand *MemRef =
M->getMemOperand();
11066 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11068 SDValue NodePtr =
M->getOperand(2);
11069 SDValue RayExtent =
M->getOperand(3);
11070 SDValue RayOrigin =
M->getOperand(4);
11071 SDValue RayDir =
M->getOperand(5);
11072 SDValue RayInvDir =
M->getOperand(6);
11073 SDValue TDescr =
M->getOperand(7);
11080 if (!Subtarget->hasGFX10_AEncoding()) {
11090 const unsigned NumVDataDwords = 4;
11091 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11092 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11093 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11096 const unsigned BaseOpcodes[2][2] = {
11097 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11098 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11099 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11103 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11104 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11105 : AMDGPU::MIMGEncGfx10NSA,
11106 NumVDataDwords, NumVAddrDwords);
11110 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11111 : AMDGPU::MIMGEncGfx10Default,
11112 NumVDataDwords, NumVAddrDwords);
11118 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
11121 if (Lanes[0].getValueSizeInBits() == 32) {
11122 for (
unsigned I = 0;
I < 3; ++
I)
11129 Ops.push_back(Lanes[2]);
11141 if (UseNSA && IsGFX11Plus) {
11142 Ops.push_back(NodePtr);
11144 Ops.push_back(RayOrigin);
11149 for (
unsigned I = 0;
I < 3; ++
I) {
11152 {DirLanes[I], InvDirLanes[I]})));
11156 Ops.push_back(RayDir);
11157 Ops.push_back(RayInvDir);
11164 Ops.push_back(NodePtr);
11167 packLanes(RayOrigin,
true);
11168 packLanes(RayDir,
true);
11169 packLanes(RayInvDir,
false);
11174 if (NumVAddrDwords > 12) {
11182 Ops.push_back(MergedOps);
11185 Ops.push_back(TDescr);
11187 Ops.push_back(
M->getChain());
11190 MachineMemOperand *MemRef =
M->getMemOperand();
11194 case Intrinsic::amdgcn_global_atomic_fmin_num:
11195 case Intrinsic::amdgcn_global_atomic_fmax_num:
11196 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11197 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11204 unsigned Opcode = 0;
11206 case Intrinsic::amdgcn_global_atomic_fmin_num:
11207 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11211 case Intrinsic::amdgcn_global_atomic_fmax_num:
11212 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11219 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
11220 Ops,
M->getMemOperand());
11222 case Intrinsic::amdgcn_s_alloc_vgpr: {
11230 ReadFirstLaneID, NumVGPRs);
11233 Op.getOperand(0),
Op.getOperand(1), NumVGPRs);
11235 case Intrinsic::amdgcn_s_get_barrier_state:
11236 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11243 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11244 BarID = (BarID >> 4) & 0x3F;
11245 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11248 Ops.push_back(Chain);
11250 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11251 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11259 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11267 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11268 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11269 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11273 EVT VT =
Op->getValueType(0);
11277 case Intrinsic::amdgcn_flat_load_monitor_b32:
11278 case Intrinsic::amdgcn_flat_load_monitor_b64:
11279 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11284 Op->getVTList(), {Chain, Ptr},
11287 case Intrinsic::amdgcn_global_load_monitor_b32:
11288 case Intrinsic::amdgcn_global_load_monitor_b64:
11289 case Intrinsic::amdgcn_global_load_monitor_b128: {
11294 Op->getVTList(), {Chain, Ptr},
11299 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11301 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11309SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
11316 EVT VT = VTList.
VTs[0];
11319 bool IsTFE = VTList.
NumVTs == 3;
11322 unsigned NumOpDWords = NumValueDWords + 1;
11324 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
11325 MachineMemOperand *OpDWordsMMO =
11327 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
11328 OpDWordsVT, OpDWordsMMO, DAG);
11333 NumValueDWords == 1
11342 if (!Subtarget->hasDwordx3LoadStores() &&
11343 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11347 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
11349 WidenedMemVT, WidenedMMO);
11359 bool ImageStore)
const {
11369 if (Subtarget->hasUnpackedD16VMem()) {
11383 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11394 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
11400 if ((NumElements % 2) == 1) {
11402 unsigned I = Elts.
size() / 2;
11418 if (NumElements == 3) {
11437 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11438 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11439 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11440 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
11441 case Intrinsic::amdgcn_load_async_to_lds:
11442 case Intrinsic::amdgcn_global_load_async_lds:
11452 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
11454 switch (IntrinsicID) {
11455 case Intrinsic::amdgcn_exp_compr: {
11456 if (!Subtarget->hasCompressedExport()) {
11459 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11481 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11485 case Intrinsic::amdgcn_struct_tbuffer_store:
11486 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11488 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11490 VData = handleD16VData(VData, DAG);
11491 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11492 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11506 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11507 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11510 M->getMemoryVT(),
M->getMemOperand());
11513 case Intrinsic::amdgcn_raw_tbuffer_store:
11514 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11516 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
11518 VData = handleD16VData(VData, DAG);
11519 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11520 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11534 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11535 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11538 M->getMemoryVT(),
M->getMemOperand());
11541 case Intrinsic::amdgcn_raw_buffer_store:
11542 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11543 case Intrinsic::amdgcn_raw_buffer_store_format:
11544 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11545 const bool IsFormat =
11546 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11547 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11554 VData = handleD16VData(VData, DAG);
11564 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11565 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11579 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11580 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11585 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
11588 M->getMemoryVT(),
M->getMemOperand());
11591 case Intrinsic::amdgcn_struct_buffer_store:
11592 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11593 case Intrinsic::amdgcn_struct_buffer_store_format:
11594 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11595 const bool IsFormat =
11596 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11597 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11605 VData = handleD16VData(VData, DAG);
11615 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11616 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11630 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11631 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
11635 EVT VDataType = VData.getValueType().getScalarType();
11637 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
11640 M->getMemoryVT(),
M->getMemOperand());
11642 case Intrinsic::amdgcn_raw_buffer_load_lds:
11643 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
11644 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11645 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
11646 case Intrinsic::amdgcn_struct_buffer_load_lds:
11647 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
11648 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
11649 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
11650 if (!Subtarget->hasVMemToLDSLoad())
11654 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11655 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
11656 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
11657 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
11658 unsigned OpOffset = HasVIndex ? 1 : 0;
11659 SDValue VOffset =
Op.getOperand(5 + OpOffset);
11661 unsigned Size =
Op->getConstantOperandVal(4);
11667 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11668 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11669 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11670 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11673 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11674 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11675 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11676 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11679 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11680 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11681 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11682 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11685 if (!Subtarget->hasLDSLoadB96_B128())
11687 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11688 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11689 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11690 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11693 if (!Subtarget->hasLDSLoadB96_B128())
11695 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11696 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11697 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11698 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
11706 if (HasVIndex && HasVOffset)
11710 else if (HasVIndex)
11711 Ops.push_back(
Op.getOperand(5));
11712 else if (HasVOffset)
11713 Ops.push_back(VOffset);
11715 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11716 Ops.push_back(Rsrc);
11717 Ops.push_back(
Op.getOperand(6 + OpOffset));
11718 Ops.push_back(
Op.getOperand(7 + OpOffset));
11720 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
11743 case Intrinsic::amdgcn_load_to_lds:
11744 case Intrinsic::amdgcn_load_async_to_lds:
11745 case Intrinsic::amdgcn_global_load_lds:
11746 case Intrinsic::amdgcn_global_load_async_lds: {
11747 if (!Subtarget->hasVMemToLDSLoad())
11751 unsigned Size =
Op->getConstantOperandVal(4);
11756 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11759 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11762 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11765 if (!Subtarget->hasLDSLoadB96_B128())
11767 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11770 if (!Subtarget->hasLDSLoadB96_B128())
11772 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11788 if (
LHS->isDivergent())
11792 RHS.getOperand(0).getValueType() == MVT::i32) {
11795 VOffset =
RHS.getOperand(0);
11799 Ops.push_back(Addr);
11807 Ops.push_back(VOffset);
11810 Ops.push_back(
Op.getOperand(5));
11812 unsigned Aux =
Op.getConstantOperandVal(6);
11827 case Intrinsic::amdgcn_end_cf:
11829 Op->getOperand(2), Chain),
11831 case Intrinsic::amdgcn_s_barrier_init:
11832 case Intrinsic::amdgcn_s_barrier_signal_var: {
11839 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11840 ? AMDGPU::S_BARRIER_INIT_M0
11841 : AMDGPU::S_BARRIER_SIGNAL_M0;
11856 constexpr unsigned ShAmt = 16;
11863 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11868 case Intrinsic::amdgcn_s_wakeup_barrier: {
11869 if (!Subtarget->hasSWakeupBarrier())
11873 case Intrinsic::amdgcn_s_barrier_join: {
11882 switch (IntrinsicID) {
11885 case Intrinsic::amdgcn_s_barrier_join:
11886 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11888 case Intrinsic::amdgcn_s_wakeup_barrier:
11889 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11893 unsigned BarID = (BarVal >> 4) & 0x3F;
11896 Ops.push_back(Chain);
11898 switch (IntrinsicID) {
11901 case Intrinsic::amdgcn_s_barrier_join:
11902 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11904 case Intrinsic::amdgcn_s_wakeup_barrier:
11905 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11916 Ops.push_back(
copyToM0(DAG, Chain,
DL, M0Val).getValue(0));
11922 case Intrinsic::amdgcn_s_prefetch_data: {
11925 return Op.getOperand(0);
11928 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11930 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
11937 Op->getVTList(),
Ops,
M->getMemoryVT(),
11938 M->getMemOperand());
11940 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11941 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11942 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11951 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11953 return lowerImage(
Op, ImageDimIntr, DAG,
true);
11969 return PtrVT == MVT::i64;
11983std::pair<SDValue, SDValue>
11996 bool CheckNUW = Subtarget->hasGFX1250Insts();
12013 unsigned Overflow = ImmOffset & ~MaxImm;
12014 ImmOffset -= Overflow;
12015 if ((int32_t)Overflow < 0) {
12016 Overflow += ImmOffset;
12021 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
12040void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
12042 Align Alignment)
const {
12044 SDLoc
DL(CombinedOffset);
12046 uint32_t
Imm =
C->getZExtValue();
12047 uint32_t SOffset, ImmOffset;
12048 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12059 bool CheckNUW = Subtarget->hasGFX1250Insts();
12062 uint32_t SOffset, ImmOffset;
12065 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
12073 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12082SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
12085 return MaybePointer;
12099 SDValue NumRecords =
Op->getOperand(3);
12105 if (Subtarget->has45BitNumRecordsBufferResource()) {
12124 SDValue ExtShiftedStrideVec =
12136 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12138 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12143 auto [LowHalf, HighHalf] =
12144 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12154 NumRecords, Flags);
12166 bool IsTFE)
const {
12171 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12172 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12175 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
12187 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12188 : AMDGPUISD::BUFFER_LOAD_USHORT;
12190 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
12204 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12208 Ops[1] = BufferStoreExt;
12209 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12210 : AMDGPUISD::BUFFER_STORE_SHORT;
12213 M->getMemOperand());
12238 DAGCombinerInfo &DCI)
const {
12239 SelectionDAG &DAG = DCI.DAG;
12254 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
12261 "unexpected vector extload");
12274 "unexpected fp extload");
12292 DCI.AddToWorklist(Cvt.
getNode());
12297 DCI.AddToWorklist(Cvt.
getNode());
12308 if (Info.isEntryFunction())
12309 return Info.getUserSGPRInfo().hasFlatScratchInit();
12317 EVT MemVT =
Load->getMemoryVT();
12318 MachineMemOperand *MMO =
Load->getMemOperand();
12330 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12358 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12359 "Custom lowering for non-i32 vectors hasn't been implemented.");
12362 unsigned AS =
Load->getAddressSpace();
12363 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12370 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12374 !Subtarget->hasMultiDwordFlatScratchAddressing())
12384 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12387 Alignment >=
Align(4) && NumElements < 32) {
12389 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12401 if (NumElements > 4)
12404 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12414 switch (Subtarget->getMaxPrivateElementSize()) {
12420 if (NumElements > 2)
12425 if (NumElements > 4)
12428 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12437 auto Flags =
Load->getMemOperand()->getFlags();
12439 Load->getAlign(), Flags, &
Fast) &&
12448 MemVT, *
Load->getMemOperand())) {
12457 EVT VT =
Op.getValueType();
12494 EVT VT =
Op.getValueType();
12495 const SDNodeFlags
Flags =
Op->getFlags();
12497 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12503 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12506 if (CLHS->isExactlyValue(1.0)) {
12519 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12523 if (CLHS->isExactlyValue(-1.0)) {
12526 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12532 if (!AllowInaccurateRcp &&
12533 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12547 EVT VT =
Op.getValueType();
12548 const SDNodeFlags
Flags =
Op->getFlags();
12550 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12551 if (!AllowInaccurateDiv)
12572 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12582 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12586 return DAG.
getNode(Opcode, SL, VTList,
12595 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12605 Opcode = AMDGPUISD::FMA_W_CHAIN;
12609 return DAG.
getNode(Opcode, SL, VTList,
12615 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12616 return FastLowered;
12619 EVT VT =
Op.getValueType();
12626 if (VT == MVT::bf16) {
12649 unsigned FMADOpCode =
12653 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12656 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12658 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12659 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12669 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12675 SDNodeFlags
Flags =
Op->getFlags();
12685 const APFloat K0Val(0x1p+96f);
12688 const APFloat K1Val(0x1p-32f);
12715 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12716 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12717 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12722 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12723 return FastLowered;
12729 SDNodeFlags
Flags =
Op->getFlags();
12730 Flags.setNoFPExcept(
true);
12738 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12747 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12751 using namespace AMDGPU::Hwreg;
12752 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12756 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12757 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12760 const bool HasDynamicDenormals =
12766 if (!PreservesDenormals) {
12771 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12774 if (HasDynamicDenormals) {
12778 SavedDenormMode =
SDValue(GetReg, 0);
12784 SDNode *EnableDenorm;
12785 if (Subtarget->hasDenormModeInst()) {
12786 const SDValue EnableDenormValue =
12789 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12793 const SDValue EnableDenormValue =
12795 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12796 {EnableDenormValue,
BitField, Glue});
12806 ApproxRcp, One, NegDivScale0, Flags);
12809 ApproxRcp, Fma0, Flags);
12815 NumeratorScaled,
Mul, Flags);
12821 NumeratorScaled, Fma3, Flags);
12823 if (!PreservesDenormals) {
12824 SDNode *DisableDenorm;
12825 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12829 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12831 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12835 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12836 const SDValue DisableDenormValue =
12837 HasDynamicDenormals
12842 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12853 {Fma4, Fma1, Fma3, Scale},
Flags);
12855 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
12859 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12860 return FastLowered;
12868 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12874 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12892 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12922 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12924 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
12928 EVT VT =
Op.getValueType();
12930 if (VT == MVT::f32)
12931 return LowerFDIV32(
Op, DAG);
12933 if (VT == MVT::f64)
12934 return LowerFDIV64(
Op, DAG);
12936 if (VT == MVT::f16 || VT == MVT::bf16)
12937 return LowerFDIV16(
Op, DAG);
12946 EVT ResultExpVT =
Op->getValueType(1);
12947 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12957 if (Subtarget->hasFractBug()) {
12975 EVT VT =
Store->getMemoryVT();
12977 if (VT == MVT::i1) {
12981 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12985 Store->getValue().getValueType().getScalarType() == MVT::i32);
12987 unsigned AS =
Store->getAddressSpace();
12988 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12996 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
13000 !Subtarget->hasMultiDwordFlatScratchAddressing())
13007 if (NumElements > 4)
13010 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13014 VT, *
Store->getMemOperand()))
13020 switch (Subtarget->getMaxPrivateElementSize()) {
13024 if (NumElements > 2)
13028 if (NumElements > 4 ||
13029 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13037 auto Flags =
Store->getMemOperand()->getFlags();
13056 assert(!Subtarget->has16BitInsts());
13057 SDNodeFlags
Flags =
Op->getFlags();
13071 SDNodeFlags
Flags =
Op->getFlags();
13072 MVT VT =
Op.getValueType().getSimpleVT();
13180 SDNodeFlags
Flags =
Op->getFlags();
13243 EVT VT =
Op.getValueType();
13254 if (!
V.getValueType().isVector())
13262 if (Subtarget->hasTrigReducedRange()) {
13264 TrigVal = UnrollIfVec(DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags));
13269 switch (
Op.getOpcode()) {
13271 TrigVal = DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
13274 TrigVal = DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
13280 return UnrollIfVec(TrigVal);
13300 EVT VT =
Op.getValueType();
13308 Op->getVTList(),
Ops, VT,
13317SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13318 DAGCombinerInfo &DCI)
const {
13319 EVT VT =
N->getValueType(0);
13321 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13324 SelectionDAG &DAG = DCI.DAG;
13328 EVT SrcVT = Src.getValueType();
13334 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13337 DCI.AddToWorklist(Cvt.
getNode());
13340 if (ScalarVT != MVT::f32) {
13352 DAGCombinerInfo &DCI)
const {
13363 SelectionDAG &DAG = DCI.DAG;
13382 for (
unsigned I = 0;
I != NumElts; ++
I) {
13406 if (NewElts.
size() == 1)
13428 for (
unsigned I = 0;
I != NumElts; ++
I) {
13463SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13465 DAGCombinerInfo &DCI)
const {
13482 SelectionDAG &DAG = DCI.DAG;
13495 AM.BaseOffs =
Offset.getSExtValue();
13500 EVT VT =
N->getValueType(0);
13506 Flags.setNoUnsignedWrap(
13507 N->getFlags().hasNoUnsignedWrap() &&
13519 switch (
N->getOpcode()) {
13530 DAGCombinerInfo &DCI)
const {
13531 SelectionDAG &DAG = DCI.DAG;
13538 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13539 N->getMemoryVT(), DCI);
13543 NewOps[PtrIdx] = NewPtr;
13552 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13553 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13562SDValue SITargetLowering::splitBinaryBitConstantOp(
13566 uint32_t ValLo =
Lo_32(Val);
13567 uint32_t ValHi =
Hi_32(Val);
13574 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13588 if (V.getValueType() != MVT::i1)
13590 switch (V.getOpcode()) {
13595 case AMDGPUISD::FP_CLASS:
13607 return V.getResNo() == 1;
13609 unsigned IntrinsicID = V.getConstantOperandVal(0);
13610 switch (IntrinsicID) {
13611 case Intrinsic::amdgcn_is_shared:
13612 case Intrinsic::amdgcn_is_private:
13629 if (!(
C & 0x000000ff))
13630 ZeroByteMask |= 0x000000ff;
13631 if (!(
C & 0x0000ff00))
13632 ZeroByteMask |= 0x0000ff00;
13633 if (!(
C & 0x00ff0000))
13634 ZeroByteMask |= 0x00ff0000;
13635 if (!(
C & 0xff000000))
13636 ZeroByteMask |= 0xff000000;
13637 uint32_t NonZeroByteMask = ~ZeroByteMask;
13638 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13651 assert(V.getValueSizeInBits() == 32);
13653 if (V.getNumOperands() != 2)
13662 switch (V.getOpcode()) {
13667 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13672 return (0x03020100 & ~ConstMask) | ConstMask;
13679 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13685 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13692 DAGCombinerInfo &DCI)
const {
13693 if (DCI.isBeforeLegalize())
13696 SelectionDAG &DAG = DCI.DAG;
13697 EVT VT =
N->getValueType(0);
13702 if (VT == MVT::i64 && CRHS) {
13704 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13708 if (CRHS && VT == MVT::i32) {
13718 unsigned Shift = CShift->getZExtValue();
13720 unsigned Offset = NB + Shift;
13721 if ((
Offset & (Bits - 1)) == 0) {
13724 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13745 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13747 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13760 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13765 if (
X !=
LHS.getOperand(1))
13769 const ConstantFPSDNode *C1 =
13786 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13792 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13795 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13803 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13804 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13806 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13807 :
Mask->getZExtValue() & OrdMask;
13810 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13828 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13831 if (LHSMask != ~0u && RHSMask != ~0u) {
13834 if (LHSMask > RHSMask) {
13841 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13842 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13845 if (!(LHSUsedLanes & RHSUsedLanes) &&
13848 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13854 uint32_t
Mask = LHSMask & RHSMask;
13855 for (
unsigned I = 0;
I < 32;
I += 8) {
13856 uint32_t ByteSel = 0xff <<
I;
13857 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13858 Mask &= (0x0c <<
I) & 0xffffffff;
13863 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13866 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13916static const std::optional<ByteProvider<SDValue>>
13918 unsigned Depth = 0) {
13921 return std::nullopt;
13923 if (
Op.getValueSizeInBits() < 8)
13924 return std::nullopt;
13926 if (
Op.getValueType().isVector())
13929 switch (
Op->getOpcode()) {
13942 NarrowVT = VTSign->getVT();
13945 return std::nullopt;
13948 if (SrcIndex >= NarrowByteWidth)
13949 return std::nullopt;
13957 return std::nullopt;
13959 uint64_t BitShift = ShiftOp->getZExtValue();
13961 if (BitShift % 8 != 0)
13962 return std::nullopt;
13964 SrcIndex += BitShift / 8;
13982static const std::optional<ByteProvider<SDValue>>
13984 unsigned StartingIndex = 0) {
13988 return std::nullopt;
13990 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13992 return std::nullopt;
13994 return std::nullopt;
13996 bool IsVec =
Op.getValueType().isVector();
13997 switch (
Op.getOpcode()) {
14000 return std::nullopt;
14005 return std::nullopt;
14009 return std::nullopt;
14012 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
14013 return std::nullopt;
14014 if (!
LHS ||
LHS->isConstantZero())
14016 if (!
RHS ||
RHS->isConstantZero())
14018 return std::nullopt;
14023 return std::nullopt;
14027 return std::nullopt;
14029 uint32_t BitMask = BitMaskOp->getZExtValue();
14031 uint32_t IndexMask = 0xFF << (Index * 8);
14033 if ((IndexMask & BitMask) != IndexMask) {
14036 if (IndexMask & BitMask)
14037 return std::nullopt;
14046 return std::nullopt;
14050 if (!ShiftOp ||
Op.getValueType().isVector())
14051 return std::nullopt;
14053 uint64_t BitsProvided =
Op.getValueSizeInBits();
14054 if (BitsProvided % 8 != 0)
14055 return std::nullopt;
14057 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14059 return std::nullopt;
14061 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14062 uint64_t ByteShift = BitShift / 8;
14064 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14065 uint64_t BytesProvided = BitsProvided / 8;
14066 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14067 NewIndex %= BytesProvided;
14074 return std::nullopt;
14078 return std::nullopt;
14080 uint64_t BitShift = ShiftOp->getZExtValue();
14082 return std::nullopt;
14084 auto BitsProvided =
Op.getScalarValueSizeInBits();
14085 if (BitsProvided % 8 != 0)
14086 return std::nullopt;
14088 uint64_t BytesProvided = BitsProvided / 8;
14089 uint64_t ByteShift = BitShift / 8;
14094 return BytesProvided - ByteShift > Index
14102 return std::nullopt;
14106 return std::nullopt;
14108 uint64_t BitShift = ShiftOp->getZExtValue();
14109 if (BitShift % 8 != 0)
14110 return std::nullopt;
14111 uint64_t ByteShift = BitShift / 8;
14117 return Index < ByteShift
14120 Depth + 1, StartingIndex);
14129 return std::nullopt;
14137 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14139 if (NarrowBitWidth % 8 != 0)
14140 return std::nullopt;
14141 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14143 if (Index >= NarrowByteWidth)
14145 ? std::optional<ByteProvider<SDValue>>(
14153 return std::nullopt;
14157 if (NarrowByteWidth >= Index) {
14162 return std::nullopt;
14169 return std::nullopt;
14175 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14176 if (NarrowBitWidth % 8 != 0)
14177 return std::nullopt;
14178 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14183 if (Index >= NarrowByteWidth) {
14185 ? std::optional<ByteProvider<SDValue>>(
14190 if (NarrowByteWidth > Index) {
14194 return std::nullopt;
14199 return std::nullopt;
14202 Depth + 1, StartingIndex);
14208 return std::nullopt;
14209 auto VecIdx = IdxOp->getZExtValue();
14210 auto ScalarSize =
Op.getScalarValueSizeInBits();
14211 if (ScalarSize < 32)
14212 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14214 StartingIndex, Index);
14217 case AMDGPUISD::PERM: {
14219 return std::nullopt;
14223 return std::nullopt;
14226 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14227 if (IdxMask > 0x07 && IdxMask != 0x0c)
14228 return std::nullopt;
14230 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14231 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14233 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
14239 return std::nullopt;
14254 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
14261 auto MemVT = L->getMemoryVT();
14264 return L->getMemoryVT().getSizeInBits() == 16;
14274 int Low8 = Mask & 0xff;
14275 int Hi8 = (Mask & 0xff00) >> 8;
14277 assert(Low8 < 8 && Hi8 < 8);
14279 bool IsConsecutive = (Hi8 - Low8 == 1);
14284 bool Is16Aligned = !(Low8 % 2);
14286 return IsConsecutive && Is16Aligned;
14294 int Low16 = PermMask & 0xffff;
14295 int Hi16 = (PermMask & 0xffff0000) >> 16;
14305 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14307 if (!OtherOpIs16Bit)
14315 unsigned DWordOffset) {
14320 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14325 if (Src.getValueType().isVector()) {
14326 auto ScalarTySize = Src.getScalarValueSizeInBits();
14327 auto ScalarTy = Src.getValueType().getScalarType();
14328 if (ScalarTySize == 32) {
14332 if (ScalarTySize > 32) {
14335 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14336 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14343 assert(ScalarTySize < 32);
14344 auto NumElements =
TypeSize / ScalarTySize;
14345 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14346 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14347 auto NumElementsIn32 = 32 / ScalarTySize;
14348 auto NumAvailElements = DWordOffset < Trunc32Elements
14350 : NumElements - NormalizedTrunc;
14363 auto ShiftVal = 32 * DWordOffset;
14371 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14376 for (
int i = 0; i < 4; i++) {
14378 std::optional<ByteProvider<SDValue>>
P =
14381 if (!
P ||
P->isConstantZero())
14386 if (PermNodes.
size() != 4)
14389 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14390 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14392 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14393 auto PermOp = PermNodes[i];
14396 int SrcByteAdjust = 4;
14400 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14401 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14403 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14404 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14408 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14409 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14412 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14414 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14417 SDValue Op = *PermNodes[FirstSrc.first].Src;
14419 assert(
Op.getValueSizeInBits() == 32);
14423 int Low16 = PermMask & 0xffff;
14424 int Hi16 = (PermMask & 0xffff0000) >> 16;
14426 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14427 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14430 if (WellFormedLow && WellFormedHi)
14434 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14443 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14444 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14449 assert(
Op.getValueType().isByteSized() &&
14460 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14467 DAGCombinerInfo &DCI)
const {
14468 SelectionDAG &DAG = DCI.DAG;
14472 EVT VT =
N->getValueType(0);
14473 if (VT == MVT::i1) {
14475 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14476 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14478 if (Src !=
RHS.getOperand(0))
14483 if (!CLHS || !CRHS)
14487 static const uint32_t MaxMask = 0x3ff;
14492 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14501 LHS.getOpcode() == AMDGPUISD::PERM &&
14507 Sel |=
LHS.getConstantOperandVal(2);
14509 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14516 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14520 auto usesCombinedOperand = [](SDNode *OrUse) {
14523 !OrUse->getValueType(0).isVector())
14527 for (
auto *VUser : OrUse->users()) {
14528 if (!VUser->getValueType(0).isVector())
14535 if (VUser->getOpcode() == VectorwiseOp)
14541 if (!
any_of(
N->users(), usesCombinedOperand))
14547 if (LHSMask != ~0u && RHSMask != ~0u) {
14550 if (LHSMask > RHSMask) {
14557 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14558 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14561 if (!(LHSUsedLanes & RHSUsedLanes) &&
14564 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14566 LHSMask &= ~RHSUsedLanes;
14567 RHSMask &= ~LHSUsedLanes;
14569 LHSMask |= LHSUsedLanes & 0x04040404;
14571 uint32_t Sel = LHSMask | RHSMask;
14574 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14579 if (LHSMask == ~0u || RHSMask == ~0u) {
14620 return IdentitySrc;
14626 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14641 if (SrcVT == MVT::i32) {
14646 DCI.AddToWorklist(LowOr.
getNode());
14647 DCI.AddToWorklist(HiBits.getNode());
14658 N->getOperand(0), CRHS))
14666 DAGCombinerInfo &DCI)
const {
14667 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14674 SelectionDAG &DAG = DCI.DAG;
14676 EVT VT =
N->getValueType(0);
14677 if (CRHS && VT == MVT::i64) {
14679 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14686 unsigned Opc =
LHS.getOpcode();
14716 LHS->getOperand(0), FNegLHS, FNegRHS);
14725SITargetLowering::performZeroOrAnyExtendCombine(
SDNode *
N,
14726 DAGCombinerInfo &DCI)
const {
14727 if (!Subtarget->has16BitInsts() ||
14731 EVT VT =
N->getValueType(0);
14732 if (VT != MVT::i32)
14736 if (Src.getValueType() != MVT::i16)
14739 if (!Src->hasOneUse())
14746 std::optional<ByteProvider<SDValue>> BP0 =
14748 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14752 std::optional<ByteProvider<SDValue>> BP1 =
14754 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14762 SelectionDAG &DAG = DCI.DAG;
14764 uint32_t PermMask = 0x0c0c0c0c;
14767 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14772 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14775 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32, V0, V1,
14780SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14781 DAGCombinerInfo &DCI)
const {
14787 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14788 VTSign->getVT() == MVT::i8) ||
14789 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14790 VTSign->getVT() == MVT::i16))) {
14791 assert(Subtarget->hasScalarSubwordLoads() &&
14792 "s_buffer_load_{u8, i8} are supported "
14793 "in GFX12 (or newer) architectures.");
14794 EVT VT = Src.getValueType();
14795 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14796 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14797 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14799 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14806 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14807 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14811 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14812 VTSign->getVT() == MVT::i8) ||
14813 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14814 VTSign->getVT() == MVT::i16)) &&
14823 Src.getOperand(6), Src.getOperand(7)};
14826 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14827 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14828 ? AMDGPUISD::BUFFER_LOAD_BYTE
14829 : AMDGPUISD::BUFFER_LOAD_SHORT;
14830 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14831 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14832 return DCI.DAG.getMergeValues(
14833 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14839 DAGCombinerInfo &DCI)
const {
14840 SelectionDAG &DAG = DCI.DAG;
14847 if (
N->getOperand(0).isUndef())
14854 DAGCombinerInfo &DCI)
const {
14855 EVT VT =
N->getValueType(0);
14865 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14872 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14881 unsigned MaxDepth)
const {
14882 unsigned Opcode =
Op.getOpcode();
14887 const auto &
F = CFP->getValueAPF();
14888 if (
F.isNaN() &&
F.isSignaling())
14890 if (!
F.isDenormal())
14922 case AMDGPUISD::FMUL_LEGACY:
14923 case AMDGPUISD::FMAD_FTZ:
14924 case AMDGPUISD::RCP:
14925 case AMDGPUISD::RSQ:
14926 case AMDGPUISD::RSQ_CLAMP:
14927 case AMDGPUISD::RCP_LEGACY:
14928 case AMDGPUISD::RCP_IFLAG:
14929 case AMDGPUISD::LOG:
14930 case AMDGPUISD::EXP:
14931 case AMDGPUISD::DIV_SCALE:
14932 case AMDGPUISD::DIV_FMAS:
14933 case AMDGPUISD::DIV_FIXUP:
14934 case AMDGPUISD::FRACT:
14935 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14936 case AMDGPUISD::CVT_F32_UBYTE0:
14937 case AMDGPUISD::CVT_F32_UBYTE1:
14938 case AMDGPUISD::CVT_F32_UBYTE2:
14939 case AMDGPUISD::CVT_F32_UBYTE3:
14940 case AMDGPUISD::FP_TO_FP16:
14941 case AMDGPUISD::SIN_HW:
14942 case AMDGPUISD::COS_HW:
14953 if (
Op.getValueType() == MVT::i32) {
14959 if (RHS->getZExtValue() == 0xffff0000) {
14969 return Op.getValueType().getScalarType() != MVT::f16;
14979 case AMDGPUISD::CLAMP:
14980 case AMDGPUISD::FMED3:
14981 case AMDGPUISD::FMAX3:
14982 case AMDGPUISD::FMIN3:
14983 case AMDGPUISD::FMAXIMUM3:
14984 case AMDGPUISD::FMINIMUM3: {
14990 if (Subtarget->supportsMinMaxDenormModes() ||
15000 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
15012 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
15039 if (
Op.getValueType() == MVT::i16) {
15050 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
15052 switch (IntrinsicID) {
15053 case Intrinsic::amdgcn_cvt_pkrtz:
15054 case Intrinsic::amdgcn_cubeid:
15055 case Intrinsic::amdgcn_frexp_mant:
15056 case Intrinsic::amdgcn_fdot2:
15057 case Intrinsic::amdgcn_rcp:
15058 case Intrinsic::amdgcn_rsq:
15059 case Intrinsic::amdgcn_rsq_clamp:
15060 case Intrinsic::amdgcn_rcp_legacy:
15061 case Intrinsic::amdgcn_rsq_legacy:
15062 case Intrinsic::amdgcn_trig_preop:
15063 case Intrinsic::amdgcn_tanh:
15064 case Intrinsic::amdgcn_log:
15065 case Intrinsic::amdgcn_exp2:
15066 case Intrinsic::amdgcn_sqrt:
15084 unsigned MaxDepth)
const {
15087 unsigned Opcode =
MI->getOpcode();
15089 if (Opcode == AMDGPU::G_FCANONICALIZE)
15092 std::optional<FPValueAndVReg> FCR;
15095 if (FCR->Value.isSignaling())
15097 if (!FCR->Value.isDenormal())
15108 case AMDGPU::G_FADD:
15109 case AMDGPU::G_FSUB:
15110 case AMDGPU::G_FMUL:
15111 case AMDGPU::G_FCEIL:
15112 case AMDGPU::G_FFLOOR:
15113 case AMDGPU::G_FRINT:
15114 case AMDGPU::G_FNEARBYINT:
15115 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15116 case AMDGPU::G_INTRINSIC_TRUNC:
15117 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15118 case AMDGPU::G_FMA:
15119 case AMDGPU::G_FMAD:
15120 case AMDGPU::G_FSQRT:
15121 case AMDGPU::G_FDIV:
15122 case AMDGPU::G_FREM:
15123 case AMDGPU::G_FPOW:
15124 case AMDGPU::G_FPEXT:
15125 case AMDGPU::G_FLOG:
15126 case AMDGPU::G_FLOG2:
15127 case AMDGPU::G_FLOG10:
15128 case AMDGPU::G_FPTRUNC:
15129 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15130 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15131 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15132 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15133 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15135 case AMDGPU::G_FNEG:
15136 case AMDGPU::G_FABS:
15137 case AMDGPU::G_FCOPYSIGN:
15139 case AMDGPU::G_FMINNUM:
15140 case AMDGPU::G_FMAXNUM:
15141 case AMDGPU::G_FMINNUM_IEEE:
15142 case AMDGPU::G_FMAXNUM_IEEE:
15143 case AMDGPU::G_FMINIMUM:
15144 case AMDGPU::G_FMAXIMUM:
15145 case AMDGPU::G_FMINIMUMNUM:
15146 case AMDGPU::G_FMAXIMUMNUM: {
15147 if (Subtarget->supportsMinMaxDenormModes() ||
15154 case AMDGPU::G_BUILD_VECTOR:
15159 case AMDGPU::G_INTRINSIC:
15160 case AMDGPU::G_INTRINSIC_CONVERGENT:
15162 case Intrinsic::amdgcn_fmul_legacy:
15163 case Intrinsic::amdgcn_fmad_ftz:
15164 case Intrinsic::amdgcn_sqrt:
15165 case Intrinsic::amdgcn_fmed3:
15166 case Intrinsic::amdgcn_sin:
15167 case Intrinsic::amdgcn_cos:
15168 case Intrinsic::amdgcn_log:
15169 case Intrinsic::amdgcn_exp2:
15170 case Intrinsic::amdgcn_log_clamp:
15171 case Intrinsic::amdgcn_rcp:
15172 case Intrinsic::amdgcn_rcp_legacy:
15173 case Intrinsic::amdgcn_rsq:
15174 case Intrinsic::amdgcn_rsq_clamp:
15175 case Intrinsic::amdgcn_rsq_legacy:
15176 case Intrinsic::amdgcn_div_scale:
15177 case Intrinsic::amdgcn_div_fmas:
15178 case Intrinsic::amdgcn_div_fixup:
15179 case Intrinsic::amdgcn_fract:
15180 case Intrinsic::amdgcn_cvt_pkrtz:
15181 case Intrinsic::amdgcn_cubeid:
15182 case Intrinsic::amdgcn_cubema:
15183 case Intrinsic::amdgcn_cubesc:
15184 case Intrinsic::amdgcn_cubetc:
15185 case Intrinsic::amdgcn_frexp_mant:
15186 case Intrinsic::amdgcn_fdot2:
15187 case Intrinsic::amdgcn_trig_preop:
15188 case Intrinsic::amdgcn_tanh:
15207 if (
C.isDenormal()) {
15221 if (
C.isSignaling()) {
15244SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
15245 DAGCombinerInfo &DCI)
const {
15246 SelectionDAG &DAG = DCI.DAG;
15248 EVT VT =
N->getValueType(0);
15257 EVT VT =
N->getValueType(0);
15258 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
15274 EVT EltVT =
Lo.getValueType();
15277 for (
unsigned I = 0;
I != 2; ++
I) {
15281 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15282 }
else if (
Op.isUndef()) {
15318 return AMDGPUISD::FMAX3;
15320 return AMDGPUISD::FMAXIMUM3;
15322 return AMDGPUISD::SMAX3;
15324 return AMDGPUISD::UMAX3;
15328 return AMDGPUISD::FMIN3;
15330 return AMDGPUISD::FMINIMUM3;
15332 return AMDGPUISD::SMIN3;
15334 return AMDGPUISD::UMIN3;
15355 if (!MinK || !MaxK)
15367 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15368 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15369 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15428 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15434 if (
Info->getMode().DX10Clamp) {
15443 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15475 case AMDGPUISD::FMIN_LEGACY:
15476 case AMDGPUISD::FMAX_LEGACY:
15477 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15478 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15481 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15482 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15483 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15488 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15497 DAGCombinerInfo &DCI)
const {
15498 SelectionDAG &DAG = DCI.DAG;
15530 if (
SDValue Med3 = performIntMed3ImmCombine(
15535 if (
SDValue Med3 = performIntMed3ImmCombine(
15541 if (
SDValue Med3 = performIntMed3ImmCombine(
15546 if (
SDValue Med3 = performIntMed3ImmCombine(
15559 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15560 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15561 (VT == MVT::f32 || VT == MVT::f64 ||
15562 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15563 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15564 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15565 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15567 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15574 const SDNodeFlags
Flags =
N->getFlags();
15576 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15580 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15590 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15591 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15600 DAGCombinerInfo &DCI)
const {
15601 EVT VT =
N->getValueType(0);
15605 SelectionDAG &DAG = DCI.DAG;
15616 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15620 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15624 if (
Info->getMode().DX10Clamp) {
15637 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15644 DAGCombinerInfo &DCI)
const {
15648 return DCI.DAG.getUNDEF(
N->getValueType(0));
15656 bool IsDivergentIdx,
15661 unsigned VecSize = EltSize * NumElem;
15664 if (VecSize <= 64 && EltSize < 32)
15673 if (IsDivergentIdx)
15677 unsigned NumInsts = NumElem +
15678 ((EltSize + 31) / 32) * NumElem ;
15682 if (Subtarget->useVGPRIndexMode())
15683 return NumInsts <= 16;
15687 if (Subtarget->hasMovrel())
15688 return NumInsts <= 15;
15694 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15709SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15710 DAGCombinerInfo &DCI)
const {
15716 EVT ResVT =
N->getValueType(0);
15740 if (!
C ||
C->getZExtValue() != 0x1f)
15756 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15784 DCI.AddToWorklist(Elt0.
getNode());
15785 DCI.AddToWorklist(Elt1.
getNode());
15807 if (!DCI.isBeforeLegalize())
15815 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15818 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15819 unsigned EltIdx = BitIndex / 32;
15820 unsigned LeftoverBitIdx = BitIndex % 32;
15824 DCI.AddToWorklist(Cast.
getNode());
15828 DCI.AddToWorklist(Elt.
getNode());
15831 DCI.AddToWorklist(Srl.
getNode());
15835 DCI.AddToWorklist(Trunc.
getNode());
15837 if (VecEltVT == ResVT) {
15849SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15850 DAGCombinerInfo &DCI)
const {
15861 SelectionDAG &DAG = DCI.DAG;
15881 Src.getOperand(0).getValueType() == MVT::f16) {
15882 return Src.getOperand(0);
15886 APFloat Val = CFP->getValueAPF();
15887 bool LosesInfo =
true;
15897 DAGCombinerInfo &DCI)
const {
15898 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15899 "combine only useful on gfx8");
15901 SDValue TruncSrc =
N->getOperand(0);
15902 EVT VT =
N->getValueType(0);
15903 if (VT != MVT::f16)
15906 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15910 SelectionDAG &DAG = DCI.DAG;
15941unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15943 const SDNode *N1)
const {
15948 if (((VT == MVT::f32 &&
15950 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15970 EVT VT =
N->getValueType(0);
15971 if (VT != MVT::i32 && VT != MVT::i64)
15977 unsigned Opc =
N->getOpcode();
16032 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
16051 DAGCombinerInfo &DCI)
const {
16054 SelectionDAG &DAG = DCI.DAG;
16055 EVT VT =
N->getValueType(0);
16065 if (!
N->isDivergent() && Subtarget->hasSMulHi())
16069 if (NumBits <= 32 || NumBits > 64)
16080 if (!Subtarget->hasFullRate64Ops()) {
16081 unsigned NumUsers = 0;
16082 for (SDNode *User :
LHS->
users()) {
16085 if (!
User->isAnyAdd())
16109 bool MulSignedLo =
false;
16110 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16119 if (VT != MVT::i64) {
16142 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16144 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16145 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16147 if (!MulLHSUnsigned32) {
16154 if (!MulRHSUnsigned32) {
16165 if (VT != MVT::i64)
16171SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
16172 DAGCombinerInfo &DCI)
const {
16182 SelectionDAG &DAG = DCI.DAG;
16197 unsigned Opcode =
N->getOpcode();
16201 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
16212static std::optional<ByteProvider<SDValue>>
16215 if (!Byte0 || Byte0->isConstantZero()) {
16216 return std::nullopt;
16219 if (Byte1 && !Byte1->isConstantZero()) {
16220 return std::nullopt;
16226 unsigned FirstCs =
First & 0x0c0c0c0c;
16227 unsigned SecondCs = Second & 0x0c0c0c0c;
16228 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
16229 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16231 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16232 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16233 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16234 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16236 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16260 for (
int BPI = 0; BPI < 2; BPI++) {
16263 BPP = {Src1, Src0};
16265 unsigned ZeroMask = 0x0c0c0c0c;
16266 unsigned FMask = 0xFF << (8 * (3 - Step));
16268 unsigned FirstMask =
16269 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16270 unsigned SecondMask =
16271 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16275 int FirstGroup = -1;
16276 for (
int I = 0;
I < 2;
I++) {
16278 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
16279 return IterElt.SrcOp == *BPP.first.Src &&
16280 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16284 if (Match != Srcs.
end()) {
16285 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
16290 if (FirstGroup != -1) {
16292 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
16293 return IterElt.SrcOp == *BPP.second.Src &&
16294 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16297 if (Match != Srcs.
end()) {
16298 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
16300 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16308 unsigned ZeroMask = 0x0c0c0c0c;
16309 unsigned FMask = 0xFF << (8 * (3 - Step));
16313 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16317 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16326 if (Srcs.
size() == 1) {
16327 auto *Elt = Srcs.
begin();
16331 if (Elt->PermMask == 0x3020100)
16334 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16338 auto *FirstElt = Srcs.
begin();
16339 auto *SecondElt = std::next(FirstElt);
16346 auto FirstMask = FirstElt->PermMask;
16347 auto SecondMask = SecondElt->PermMask;
16349 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16350 unsigned FirstPlusFour = FirstMask | 0x04040404;
16353 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16365 FirstElt = std::next(SecondElt);
16366 if (FirstElt == Srcs.
end())
16369 SecondElt = std::next(FirstElt);
16372 if (SecondElt == Srcs.
end()) {
16377 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16378 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16384 return Perms.
size() == 2
16390 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16391 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16392 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16393 EntryMask += ZeroMask;
16398 auto Opcode =
Op.getOpcode();
16400 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16401 Opcode == AMDGPUISD::MUL_I24);
16404static std::optional<bool>
16415 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16418 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16420 assert(!(S0IsUnsigned && S0IsSigned));
16421 assert(!(S1IsUnsigned && S1IsSigned));
16429 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16435 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16436 return std::nullopt;
16448 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16449 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16454 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16460 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16461 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16462 return std::nullopt;
16468 DAGCombinerInfo &DCI)
const {
16469 SelectionDAG &DAG = DCI.DAG;
16470 EVT VT =
N->getValueType(0);
16476 if (Subtarget->hasMad64_32()) {
16477 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16482 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16486 if (VT == MVT::i64) {
16487 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16492 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16494 std::optional<bool> IsSigned;
16500 int ChainLength = 0;
16501 for (
int I = 0;
I < 4;
I++) {
16505 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16508 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16513 TempNode->getOperand(MulIdx), *Src0, *Src1,
16514 TempNode->getOperand(MulIdx)->getOperand(0),
16515 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16519 IsSigned = *IterIsSigned;
16520 if (*IterIsSigned != *IsSigned)
16523 auto AddIdx = 1 - MulIdx;
16526 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16527 Src2s.
push_back(TempNode->getOperand(AddIdx));
16537 TempNode->getOperand(AddIdx), *Src0, *Src1,
16538 TempNode->getOperand(AddIdx)->getOperand(0),
16539 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16543 if (*IterIsSigned != *IsSigned)
16547 ChainLength =
I + 2;
16551 TempNode = TempNode->getOperand(AddIdx);
16553 ChainLength =
I + 1;
16554 if (TempNode->getNumOperands() < 2)
16556 LHS = TempNode->getOperand(0);
16557 RHS = TempNode->getOperand(1);
16560 if (ChainLength < 2)
16566 if (ChainLength < 4) {
16576 bool UseOriginalSrc =
false;
16577 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16578 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16579 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16580 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16581 SmallVector<unsigned, 4> SrcBytes;
16582 auto Src0Mask = Src0s.
begin()->PermMask;
16583 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16584 bool UniqueEntries =
true;
16585 for (
auto I = 1;
I < 4;
I++) {
16586 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16589 UniqueEntries =
false;
16595 if (UniqueEntries) {
16596 UseOriginalSrc =
true;
16598 auto *FirstElt = Src0s.
begin();
16602 auto *SecondElt = Src1s.
begin();
16604 SecondElt->DWordOffset);
16613 if (!UseOriginalSrc) {
16620 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16623 : Intrinsic::amdgcn_udot4,
16633 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16638 unsigned Opc =
LHS.getOpcode();
16650 auto Cond =
RHS.getOperand(0);
16655 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16672 DAGCombinerInfo &DCI)
const {
16673 SelectionDAG &DAG = DCI.DAG;
16675 EVT VT =
N->getValueType(0);
16688 SDNodeFlags ShlFlags = N1->
getFlags();
16692 SDNodeFlags NewShlFlags =
16697 DCI.AddToWorklist(Inner.
getNode());
16704 if (Subtarget->hasMad64_32()) {
16705 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16714 if (VT == MVT::i64) {
16715 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16728 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
16729 Y->isDivergent() !=
Z->isDivergent()) {
16738 if (
Y->isDivergent())
16741 SDNodeFlags ReassocFlags =
16744 DCI.AddToWorklist(UniformInner.
getNode());
16752 DAGCombinerInfo &DCI)
const {
16753 SelectionDAG &DAG = DCI.DAG;
16754 EVT VT =
N->getValueType(0);
16756 if (VT == MVT::i64) {
16757 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16761 if (VT != MVT::i32)
16770 unsigned Opc =
RHS.getOpcode();
16777 auto Cond =
RHS.getOperand(0);
16782 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
16800SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
16801 DAGCombinerInfo &DCI)
const {
16803 if (
N->getValueType(0) != MVT::i32)
16809 SelectionDAG &DAG = DCI.DAG;
16814 unsigned LHSOpc =
LHS.getOpcode();
16815 unsigned Opc =
N->getOpcode();
16819 return DAG.
getNode(
Opc, SDLoc(
N),
N->getVTList(), Args);
16825 DAGCombinerInfo &DCI)
const {
16829 SelectionDAG &DAG = DCI.DAG;
16830 EVT VT =
N->getValueType(0);
16842 if (
A ==
LHS.getOperand(1)) {
16843 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16844 if (FusedOp != 0) {
16846 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
16854 if (
A ==
RHS.getOperand(1)) {
16855 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16856 if (FusedOp != 0) {
16858 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
16867 DAGCombinerInfo &DCI)
const {
16871 SelectionDAG &DAG = DCI.DAG;
16873 EVT VT =
N->getValueType(0);
16886 if (
A ==
LHS.getOperand(1)) {
16887 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
16888 if (FusedOp != 0) {
16892 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
16901 if (
A ==
RHS.getOperand(1)) {
16902 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
16903 if (FusedOp != 0) {
16905 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
16914 DAGCombinerInfo &DCI)
const {
16915 SelectionDAG &DAG = DCI.DAG;
16917 EVT VT =
N->getValueType(0);
16926 SDNodeFlags
Flags =
N->getFlags();
16927 SDNodeFlags RHSFlags =
RHS->getFlags();
16933 bool IsNegative =
false;
16934 if (CLHS->isExactlyValue(1.0) ||
16935 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16941 DAG.
getNode(AMDGPUISD::RSQ, SL, VT,
RHS.getOperand(0), Flags);
16951 DAGCombinerInfo &DCI)
const {
16952 SelectionDAG &DAG = DCI.DAG;
16953 EVT VT =
N->getValueType(0);
16957 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
16958 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16973 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16978 const ConstantFPSDNode *FalseNode =
16988 if (ScalarVT == MVT::f32 &&
16994 if (TrueNodeExpVal == INT_MIN)
16997 if (FalseNodeExpVal == INT_MIN)
17017 DAGCombinerInfo &DCI)
const {
17018 SelectionDAG &DAG = DCI.DAG;
17019 EVT VT =
N->getValueType(0);
17022 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17040 (
N->getFlags().hasAllowContract() &&
17041 FMA->getFlags().hasAllowContract())) {
17075 if (Vec1 == Vec2 || Vec3 == Vec4)
17081 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17082 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17090 DAGCombinerInfo &DCI)
const {
17091 SelectionDAG &DAG = DCI.DAG;
17096 EVT VT =
LHS.getValueType();
17125 return LHS.getOperand(0);
17139 const APInt &CT =
LHS.getConstantOperandAPInt(1);
17140 const APInt &CF =
LHS.getConstantOperandAPInt(2);
17145 return DAG.
getNOT(SL,
LHS.getOperand(0), MVT::i1);
17148 return LHS.getOperand(0);
17169 if (VT == MVT::i64) {
17181 const std::optional<bool> KnownEq =
17189 const std::optional<bool> KnownEq =
17200 const std::optional<bool> KnownUge =
17220 const std::optional<bool> KnownUle =
17271 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17276 {Op0Hi, Op1Hi, CarryInHi});
17286 DCI.CombineTo(
LHS.getNode(), Result);
17290 if (VT != MVT::f32 && VT != MVT::f64 &&
17291 (!Subtarget->has16BitInsts() || VT != MVT::f16))
17306 const unsigned IsInfMask =
17308 const unsigned IsFiniteMask =
17313 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
17322SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
17323 DAGCombinerInfo &DCI)
const {
17324 SelectionDAG &DAG = DCI.DAG;
17326 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17345 unsigned ShiftOffset = 8 *
Offset;
17347 ShiftOffset -=
C->getZExtValue();
17349 ShiftOffset +=
C->getZExtValue();
17351 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17352 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17353 MVT::f32, Shifted);
17364 DCI.AddToWorklist(
N);
17371 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
17377 DAGCombinerInfo &DCI)
const {
17382 const MachineFunction &MF = DCI.DAG.getMachineFunction();
17386 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17387 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
17390 APFloat One(
F.getSemantics(),
"1.0");
17392 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
17398 DAGCombinerInfo &DCI)
const {
17419 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
17420 bool isInteger =
LHS.getValueType().isInteger();
17423 if (!isFloatingPoint && !isInteger)
17428 if (!isEquality && !isNonEquality)
17445 if (isFloatingPoint) {
17447 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17458 if (!(isEquality && TrueVal == ConstVal) &&
17459 !(isNonEquality && FalseVal == ConstVal))
17466 SelectLHS, SelectRHS);
17471 switch (
N->getOpcode()) {
17487 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
17497 switch (
N->getOpcode()) {
17499 return performAddCombine(
N, DCI);
17501 return performPtrAddCombine(
N, DCI);
17503 return performSubCombine(
N, DCI);
17506 return performAddCarrySubCarryCombine(
N, DCI);
17508 return performFAddCombine(
N, DCI);
17510 return performFSubCombine(
N, DCI);
17512 return performFDivCombine(
N, DCI);
17514 return performFMulCombine(
N, DCI);
17516 return performSetCCCombine(
N, DCI);
17518 if (
auto Res = performSelectCombine(
N, DCI))
17533 case AMDGPUISD::FMIN_LEGACY:
17534 case AMDGPUISD::FMAX_LEGACY:
17535 return performMinMaxCombine(
N, DCI);
17537 return performFMACombine(
N, DCI);
17539 return performAndCombine(
N, DCI);
17541 return performOrCombine(
N, DCI);
17544 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
17545 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17551 return performXorCombine(
N, DCI);
17554 return performZeroOrAnyExtendCombine(
N, DCI);
17556 return performSignExtendInRegCombine(
N, DCI);
17557 case AMDGPUISD::FP_CLASS:
17558 return performClassCombine(
N, DCI);
17560 return performFCanonicalizeCombine(
N, DCI);
17561 case AMDGPUISD::RCP:
17562 return performRcpCombine(
N, DCI);
17564 case AMDGPUISD::FRACT:
17565 case AMDGPUISD::RSQ:
17566 case AMDGPUISD::RCP_LEGACY:
17567 case AMDGPUISD::RCP_IFLAG:
17568 case AMDGPUISD::RSQ_CLAMP: {
17577 return performUCharToFloatCombine(
N, DCI);
17579 return performFCopySignCombine(
N, DCI);
17580 case AMDGPUISD::CVT_F32_UBYTE0:
17581 case AMDGPUISD::CVT_F32_UBYTE1:
17582 case AMDGPUISD::CVT_F32_UBYTE2:
17583 case AMDGPUISD::CVT_F32_UBYTE3:
17584 return performCvtF32UByteNCombine(
N, DCI);
17585 case AMDGPUISD::FMED3:
17586 return performFMed3Combine(
N, DCI);
17587 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17588 return performCvtPkRTZCombine(
N, DCI);
17589 case AMDGPUISD::CLAMP:
17590 return performClampCombine(
N, DCI);
17593 EVT VT =
N->getValueType(0);
17596 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17599 EVT EltVT = Src.getValueType();
17600 if (EltVT != MVT::i16)
17610 return performExtractVectorEltCombine(
N, DCI);
17612 return performInsertVectorEltCombine(
N, DCI);
17614 return performFPRoundCombine(
N, DCI);
17623 return performMemSDNodeCombine(MemNode, DCI);
17654 unsigned Opcode =
Node->getMachineOpcode();
17657 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17658 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
17661 SDNode *
Users[5] = {
nullptr};
17663 unsigned DmaskIdx =
17664 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17665 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
17666 unsigned NewDmask = 0;
17667 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17668 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17669 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
17670 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
17671 unsigned TFCLane = 0;
17672 bool HasChain =
Node->getNumValues() > 1;
17674 if (OldDmask == 0) {
17682 TFCLane = OldBitsSet;
17686 for (SDUse &Use :
Node->uses()) {
17689 if (
Use.getResNo() != 0)
17692 SDNode *
User =
Use.getUser();
17695 if (!
User->isMachineOpcode() ||
17696 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17708 if (UsesTFC && Lane == TFCLane) {
17713 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17715 Dmask &= ~(1 << Comp);
17723 NewDmask |= 1 << Comp;
17728 bool NoChannels = !NewDmask;
17735 if (OldBitsSet == 1)
17741 if (NewDmask == OldDmask)
17750 unsigned NewChannels = BitsSet + UsesTFC;
17754 assert(NewOpcode != -1 &&
17755 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
17756 "failed to find equivalent MIMG op");
17764 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
17766 MVT ResultVT = NewChannels == 1
17769 : NewChannels == 5 ? 8
17771 SDVTList NewVTList =
17774 MachineSDNode *NewNode =
17783 if (NewChannels == 1) {
17793 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17798 if (i || !NoChannels)
17803 if (NewUser != User) {
17813 Idx = AMDGPU::sub1;
17816 Idx = AMDGPU::sub2;
17819 Idx = AMDGPU::sub3;
17822 Idx = AMDGPU::sub4;
17833 Op =
Op.getOperand(0);
17854 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17858 Node->getOperand(0), SL, VReg, SrcVal,
17864 return ToResultReg.
getNode();
17869 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
17871 Ops.push_back(
Node->getOperand(i));
17877 Node->getOperand(i).getValueType(),
17878 Node->getOperand(i)),
17890 unsigned Opcode =
Node->getMachineOpcode();
17892 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
17893 !
TII->isGather4(Opcode) &&
17895 return adjustWritemask(
Node, DAG);
17898 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17904 case AMDGPU::V_DIV_SCALE_F32_e64:
17905 case AMDGPU::V_DIV_SCALE_F64_e64: {
17915 (Src0 == Src1 || Src0 == Src2))
17971 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
17972 unsigned InitIdx = 0;
17974 if (
TII->isImage(
MI)) {
17982 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
17983 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
17984 unsigned D16Val = D16 ? D16->getImm() : 0;
17986 if (!TFEVal && !LWEVal)
17997 assert(MO_Dmask &&
"Expected dmask operand in instruction");
17999 unsigned dmask = MO_Dmask->
getImm();
18004 bool Packed = !Subtarget->hasUnpackedD16VMem();
18006 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18013 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
18014 if (DstSize < InitIdx)
18018 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
18026 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
18027 unsigned NewDst = 0;
18032 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18033 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18036 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18037 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
18057 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
18069 if (
TII->isVOP3(
MI.getOpcode())) {
18071 TII->legalizeOperandsVOP3(
MRI,
MI);
18073 if (
TII->isMAI(
MI)) {
18078 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18079 AMDGPU::OpName::scale_src0);
18080 if (Src0Idx != -1) {
18081 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
18082 AMDGPU::OpName::scale_src1);
18083 if (
TII->usesConstantBus(
MRI,
MI, Src0Idx) &&
18084 TII->usesConstantBus(
MRI,
MI, Src1Idx))
18085 TII->legalizeOpWithMove(
MI, Src1Idx);
18092 if (
TII->isImage(
MI))
18093 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
18167std::pair<unsigned, const TargetRegisterClass *>
18174 if (Constraint.
size() == 1) {
18178 if (VT == MVT::Other)
18181 switch (Constraint[0]) {
18188 RC = &AMDGPU::SReg_32RegClass;
18191 RC = &AMDGPU::SGPR_64RegClass;
18196 return std::pair(0U,
nullptr);
18203 return std::pair(0U,
nullptr);
18205 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18206 : &AMDGPU::VGPR_32_Lo256RegClass;
18209 RC = Subtarget->has1024AddressableVGPRs()
18210 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
18213 return std::pair(0U,
nullptr);
18218 if (!Subtarget->hasMAIInsts())
18222 return std::pair(0U,
nullptr);
18224 RC = &AMDGPU::AGPR_32RegClass;
18229 return std::pair(0U,
nullptr);
18234 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
18238 RC = &AMDGPU::AV_32RegClass;
18241 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
18243 return std::pair(0U,
nullptr);
18252 return std::pair(0U, RC);
18255 if (Kind !=
'\0') {
18257 RC = &AMDGPU::VGPR_32_Lo256RegClass;
18258 }
else if (Kind ==
's') {
18259 RC = &AMDGPU::SGPR_32RegClass;
18260 }
else if (Kind ==
'a') {
18261 RC = &AMDGPU::AGPR_32RegClass;
18267 return std::pair(0U,
nullptr);
18273 return std::pair(0U,
nullptr);
18277 RC =
TRI->getVGPRClassForBitWidth(Width);
18279 RC =
TRI->getSGPRClassForBitWidth(Width);
18281 RC =
TRI->getAGPRClassForBitWidth(Width);
18283 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18288 return std::pair(0U,
nullptr);
18290 return std::pair(Reg, RC);
18296 return std::pair(0U,
nullptr);
18297 if (Idx < RC->getNumRegs())
18299 return std::pair(0U,
nullptr);
18305 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
18311 if (Constraint.
size() == 1) {
18312 switch (Constraint[0]) {
18322 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
18330 if (Constraint.
size() == 1) {
18331 switch (Constraint[0]) {
18339 }
else if (Constraint.
size() == 2) {
18340 if (Constraint ==
"VA")
18358 std::vector<SDValue> &
Ops,
18373 unsigned Size =
Op.getScalarValueSizeInBits();
18377 if (
Size == 16 && !Subtarget->has16BitInsts())
18381 Val =
C->getSExtValue();
18385 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18389 if (
Size != 16 ||
Op.getNumOperands() != 2)
18391 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
18394 Val =
C->getSExtValue();
18398 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
18408 if (Constraint.
size() == 1) {
18409 switch (Constraint[0]) {
18424 }
else if (Constraint.
size() == 2) {
18425 if (Constraint ==
"DA") {
18426 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
18427 int64_t LoBits =
static_cast<int32_t
>(Val);
18431 if (Constraint ==
"DB") {
18439 unsigned MaxSize)
const {
18440 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
18441 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18443 MVT VT =
Op.getSimpleValueType();
18468 switch (UnalignedClassID) {
18469 case AMDGPU::VReg_64RegClassID:
18470 return AMDGPU::VReg_64_Align2RegClassID;
18471 case AMDGPU::VReg_96RegClassID:
18472 return AMDGPU::VReg_96_Align2RegClassID;
18473 case AMDGPU::VReg_128RegClassID:
18474 return AMDGPU::VReg_128_Align2RegClassID;
18475 case AMDGPU::VReg_160RegClassID:
18476 return AMDGPU::VReg_160_Align2RegClassID;
18477 case AMDGPU::VReg_192RegClassID:
18478 return AMDGPU::VReg_192_Align2RegClassID;
18479 case AMDGPU::VReg_224RegClassID:
18480 return AMDGPU::VReg_224_Align2RegClassID;
18481 case AMDGPU::VReg_256RegClassID:
18482 return AMDGPU::VReg_256_Align2RegClassID;
18483 case AMDGPU::VReg_288RegClassID:
18484 return AMDGPU::VReg_288_Align2RegClassID;
18485 case AMDGPU::VReg_320RegClassID:
18486 return AMDGPU::VReg_320_Align2RegClassID;
18487 case AMDGPU::VReg_352RegClassID:
18488 return AMDGPU::VReg_352_Align2RegClassID;
18489 case AMDGPU::VReg_384RegClassID:
18490 return AMDGPU::VReg_384_Align2RegClassID;
18491 case AMDGPU::VReg_512RegClassID:
18492 return AMDGPU::VReg_512_Align2RegClassID;
18493 case AMDGPU::VReg_1024RegClassID:
18494 return AMDGPU::VReg_1024_Align2RegClassID;
18495 case AMDGPU::AReg_64RegClassID:
18496 return AMDGPU::AReg_64_Align2RegClassID;
18497 case AMDGPU::AReg_96RegClassID:
18498 return AMDGPU::AReg_96_Align2RegClassID;
18499 case AMDGPU::AReg_128RegClassID:
18500 return AMDGPU::AReg_128_Align2RegClassID;
18501 case AMDGPU::AReg_160RegClassID:
18502 return AMDGPU::AReg_160_Align2RegClassID;
18503 case AMDGPU::AReg_192RegClassID:
18504 return AMDGPU::AReg_192_Align2RegClassID;
18505 case AMDGPU::AReg_256RegClassID:
18506 return AMDGPU::AReg_256_Align2RegClassID;
18507 case AMDGPU::AReg_512RegClassID:
18508 return AMDGPU::AReg_512_Align2RegClassID;
18509 case AMDGPU::AReg_1024RegClassID:
18510 return AMDGPU::AReg_1024_Align2RegClassID;
18526 if (Info->isEntryFunction()) {
18533 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18535 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18536 :
TRI->getAlignedHighSGPRForRC(MF, 2,
18537 &AMDGPU::SGPR_64RegClass);
18538 Info->setSGPRForEXECCopy(SReg);
18540 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
18541 Info->getStackPtrOffsetReg()));
18542 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18543 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18547 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18548 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18550 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18551 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18553 Info->limitOccupancy(MF);
18555 if (ST.isWave32() && !MF.
empty()) {
18556 for (
auto &
MBB : MF) {
18557 for (
auto &
MI :
MBB) {
18558 TII->fixImplicitOperands(
MI);
18568 if (ST.needsAlignedVGPRs()) {
18569 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
18575 if (NewClassID != -1)
18576 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
18585 const APInt &DemandedElts,
18587 unsigned Depth)
const {
18589 unsigned Opc =
Op.getOpcode();
18592 unsigned IID =
Op.getConstantOperandVal(0);
18594 case Intrinsic::amdgcn_mbcnt_lo:
18595 case Intrinsic::amdgcn_mbcnt_hi: {
18601 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18611 Op, Known, DemandedElts, DAG,
Depth);
18627 unsigned MaxValue =
18634 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
18638 unsigned Src1Cst = 0;
18639 if (Src1.
isImm()) {
18640 Src1Cst = Src1.
getImm();
18641 }
else if (Src1.
isReg()) {
18645 Src1Cst = Cst->Value.getZExtValue();
18656 if (Width >= BFEWidth)
18665 Known = Known.
sext(BFEWidth);
18667 Known = Known.
zext(BFEWidth);
18673 unsigned Depth)
const {
18676 switch (
MI->getOpcode()) {
18677 case AMDGPU::S_BFE_I32:
18680 case AMDGPU::S_BFE_U32:
18683 case AMDGPU::S_BFE_I64:
18686 case AMDGPU::S_BFE_U64:
18689 case AMDGPU::G_INTRINSIC:
18690 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18693 case Intrinsic::amdgcn_workitem_id_x:
18696 case Intrinsic::amdgcn_workitem_id_y:
18699 case Intrinsic::amdgcn_workitem_id_z:
18702 case Intrinsic::amdgcn_mbcnt_lo:
18703 case Intrinsic::amdgcn_mbcnt_hi: {
18715 case Intrinsic::amdgcn_groupstaticsize: {
18726 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18729 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18732 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
18737 case AMDGPU::G_AMDGPU_SMED3:
18738 case AMDGPU::G_AMDGPU_UMED3: {
18739 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
18766 unsigned Depth)
const {
18773 AttributeList Attrs =
18775 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
18793 if (Header->getAlignment() != PrefAlign)
18794 return Header->getAlignment();
18795 if (needsFetchWindowAlignment(*Header))
18816 if (Header->getAlignment() != PrefAlign)
18817 return Header->getAlignment();
18819 unsigned LoopSize = 0;
18824 LoopSize +=
MBB->getAlignment().value() / 2;
18827 LoopSize +=
TII->getInstSizeInBytes(
MI);
18828 if (LoopSize > 192)
18833 if (LoopSize <= 64)
18836 if (LoopSize <= 128)
18837 return CacheLineAlign;
18843 auto I = Exit->getFirstNonDebugInstr();
18844 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18845 return CacheLineAlign;
18854 if (PreTerm == Pre->
begin() ||
18855 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18859 auto ExitHead = Exit->getFirstNonDebugInstr();
18860 if (ExitHead == Exit->end() ||
18861 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18866 return CacheLineAlign;
18874 if (needsFetchWindowAlignment(*
MBB))
18879bool SITargetLowering::needsFetchWindowAlignment(
18881 if (!
getSubtarget()->hasLoopHeadInstSplitSensitivity())
18885 if (
MI.isMetaInstruction())
18888 return TII->getInstSizeInBytes(
MI) > 4;
18898 N =
N->getOperand(0).getNode();
18908 switch (
N->getOpcode()) {
18916 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
18917 return !
TRI->isSGPRReg(
MRI, Reg);
18923 return !
TRI->isSGPRReg(
MRI, Reg);
18927 unsigned AS = L->getAddressSpace();
18937 case AMDGPUISD::ATOMIC_CMP_SWAP:
18938 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18939 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18940 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18941 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18942 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18943 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18944 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18945 case AMDGPUISD::BUFFER_ATOMIC_AND:
18946 case AMDGPUISD::BUFFER_ATOMIC_OR:
18947 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18948 case AMDGPUISD::BUFFER_ATOMIC_INC:
18949 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18950 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18951 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18952 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18953 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18959 return A->readMem() &&
A->writeMem();
18980 switch (Ty.getScalarSizeInBits()) {
18992 const APInt &DemandedElts,
18995 unsigned Depth)
const {
18996 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
19000 if (Info->getMode().DX10Clamp)
19012 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
19032 <<
"Hardware instruction generated for atomic "
19034 <<
" operation at memory scope " << MemScope;
19039 Type *EltTy = VT->getElementType();
19040 return VT->getNumElements() == 2 &&
19060 unsigned BW =
IT->getBitWidth();
19061 return BW == 32 || BW == 64;
19075 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
19076 return BW == 32 || BW == 64;
19079 if (Ty->isFloatTy() || Ty->isDoubleTy())
19083 return VT->getNumElements() == 2 &&
19084 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19094 bool HasSystemScope) {
19101 if (HasSystemScope) {
19102 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
19105 if (Subtarget.hasEmulatedSystemScopeAtomics())
19107 }
else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
19110 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
19123 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
19131 return STI.hasGloballyAddressableScratch()
19149 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
19162 bool HasSystemScope =
19194 if (!
IT ||
IT->getBitWidth() != 32)
19200 if (Subtarget->hasEmulatedSystemScopeAtomics())
19216 if (!HasSystemScope &&
19217 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
19229 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
19237 ConstVal && ConstVal->isNullValue())
19275 if (Ty->isFloatTy()) {
19280 if (Ty->isDoubleTy()) {
19301 if (Ty->isFloatTy() &&
19302 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19315 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19319 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
19323 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
19328 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
19333 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19337 if (Ty->isFloatTy()) {
19340 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19343 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19348 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19356 if (Subtarget->hasFlatAtomicFaddF32Inst())
19365 if (Subtarget->hasLDSFPAtomicAddF32()) {
19366 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19368 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19396 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19398 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19402 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19404 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19458 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19459 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19460 : &AMDGPU::SReg_32RegClass;
19461 if (!
TRI->isSGPRClass(RC) && !isDivergent)
19462 return TRI->getEquivalentSGPRClass(RC);
19463 if (
TRI->isSGPRClass(RC) && isDivergent) {
19464 if (Subtarget->hasGFX90AInsts())
19465 return TRI->getEquivalentAVClass(RC);
19466 return TRI->getEquivalentVGPRClass(RC);
19479 unsigned WaveSize) {
19484 if (!
IT ||
IT->getBitWidth() != WaveSize)
19489 if (!Visited.
insert(V).second)
19491 bool Result =
false;
19492 for (
const auto *U : V->users()) {
19494 if (V == U->getOperand(1)) {
19499 case Intrinsic::amdgcn_if_break:
19500 case Intrinsic::amdgcn_if:
19501 case Intrinsic::amdgcn_else:
19506 if (V == U->getOperand(0)) {
19511 case Intrinsic::amdgcn_end_cf:
19512 case Intrinsic::amdgcn_loop:
19518 Result =
hasCFUser(U, Visited, WaveSize);
19527 const Value *V)
const {
19529 if (CI->isInlineAsm()) {
19538 for (
auto &TC : TargetConstraints) {
19552 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19580 return MRI.hasOneNonDBGUse(N0);
19587 if (
I.getMetadata(
"amdgpu.noclobber"))
19589 if (
I.getMetadata(
"amdgpu.last.use"))
19653 Alignment = RMW->getAlign();
19666 bool FullFlatEmulation =
19668 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19669 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19670 RMW->getType()->isDoubleTy()));
19673 bool ReturnValueIsUsed = !AI->
use_empty();
19682 if (FullFlatEmulation) {
19693 std::prev(BB->
end())->eraseFromParent();
19694 Builder.SetInsertPoint(BB);
19696 Value *LoadedShared =
nullptr;
19697 if (FullFlatEmulation) {
19698 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19699 {Addr},
nullptr,
"is.shared");
19700 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19701 Builder.SetInsertPoint(SharedBB);
19702 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19708 LoadedShared = Clone;
19710 Builder.CreateBr(PhiBB);
19711 Builder.SetInsertPoint(CheckPrivateBB);
19714 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19715 {Addr},
nullptr,
"is.private");
19716 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19718 Builder.SetInsertPoint(PrivateBB);
19720 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19723 Value *LoadedPrivate;
19725 LoadedPrivate = Builder.CreateAlignedLoad(
19726 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
19729 LoadedPrivate, RMW->getValOperand());
19731 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19733 auto [ResultLoad, Equal] =
19739 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19742 Builder.CreateBr(PhiBB);
19744 Builder.SetInsertPoint(GlobalBB);
19748 if (FullFlatEmulation) {
19749 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19758 if (!FullFlatEmulation) {
19763 MDNode *RangeNotPrivate =
19766 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
19770 Builder.CreateBr(PhiBB);
19772 Builder.SetInsertPoint(PhiBB);
19774 if (ReturnValueIsUsed) {
19777 if (FullFlatEmulation)
19778 Loaded->addIncoming(LoadedShared, SharedBB);
19779 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19780 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19781 Loaded->takeName(AI);
19784 Builder.CreateBr(ExitBB);
19788 unsigned PtrOpIdx) {
19789 Value *PtrOp =
I->getOperand(PtrOpIdx);
19796 I->setOperand(PtrOpIdx, ASCast);
19808 ConstVal && ConstVal->isNullValue()) {
19838 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19846 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19861 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node N can be combined with an FADD/FSUB to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known never to be any NaN; if SNaN is true, returns true if Op is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
FunctionAddr VTableAddr Value
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
unsigned getBitWidth() const
Get the bit width of this value.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const