44 #include "llvm/IR/IntrinsicsAMDGPU.h"
45 #include "llvm/IR/IntrinsicsR600.h"
56 #define DEBUG_TYPE "si-lower"
62 cl::desc("Do not align and prefetch loops"),
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
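// Presumably: scan SGPR0..SGPR(NumSGPRs-1) and return the first register that CCInfo has not allocated yet.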
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
84 return AMDGPU::SGPR0 + Reg;
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
209 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
210 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
211 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
212 MVT::i1, MVT::v32i32},
216 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
217 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
218 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
219 MVT::i1, MVT::v32i32},
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
329 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
330 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
331 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
332 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
333 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
334 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
335 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
336 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
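// For each of the wide vector types listed above, the loop body presumably registers the per-operation legalization actions (Expand/Custom as appropriate).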
368 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
382 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
396 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
410 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
424 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
439 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
440 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 if (Subtarget->hasPkMovB32()) {
464 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
465 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
470 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
474 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
475 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
476 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
477 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
501 if (Subtarget->hasSMemRealTime() ||
506 if (Subtarget->has16BitInsts()) {
513 if (Subtarget->hasMadMacF32Insts())
530 if (Subtarget->hasIntClamp())
533 if (Subtarget->hasAddNoCarry())
539 {MVT::f32, MVT::f64}, Custom);
545 {MVT::f32, MVT::f64}, Legal);
547 if (Subtarget->haveRoundOpsF64())
570 if (Subtarget->has16BitInsts()) {
623 if (Subtarget->hasBF16TransInsts())
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
806 if (Subtarget->hasVOP3PInsts()) {
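// On subtargets with packed (VOP3P) instructions, 2 x 16-bit vector operations appear to be handled natively or via the custom lowering set up below.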
817 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
825 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
833 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
840 {MVT::v2f16, MVT::v4f16}, Custom);
846 if (Subtarget->hasBF16PackedInsts()) {
847 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
853 if (Subtarget->hasPackedFP32Ops()) {
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
864 if (Subtarget->has16BitInsts()) {
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
890 if (Subtarget->hasMad64_32())
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
898 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
901 if (Subtarget->hasMinimum3Maximum3F32())
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
908 if (!Subtarget->hasMinimum3Maximum3F16())
913 if (Subtarget->hasVOP3PInsts()) {
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
920 if (Subtarget->hasIntMinMax64())
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
956 if (Subtarget->hasBF16ConversionInsts()) {
961 if (Subtarget->hasBF16PackedInsts()) {
967 if (Subtarget->hasBF16TransInsts()) {
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1064 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1077 EVT DestVT, EVT SrcVT) const {
1079 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1080 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1082 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1089 LLT DestTy, LLT SrcTy) const {
1090 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1091 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1093 SrcTy.getScalarSizeInBits() == 16 &&
1114 if (Subtarget->has16BitInsts())
1116 return VT.isInteger() ? MVT::i32 : MVT::f32;
1120 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1142 if (Size == 16 && Subtarget->has16BitInsts())
1143 return (NumElts + 1) / 2;
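// With 16-bit instructions available, two 16-bit elements presumably share one 32-bit register, hence the (NumElts + 1) / 2 register count above.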
1149 return NumElts * ((Size + 31) / 32);
1158 unsigned &NumIntermediates, MVT &RegisterVT) const {
1166 if (Size == 16 && Subtarget->has16BitInsts()) {
1168 IntermediateVT = RegisterVT;
1169 NumIntermediates = (NumElts + 1) / 2;
1170 return NumIntermediates;
1175 IntermediateVT = RegisterVT;
1176 NumIntermediates = NumElts;
1177 return NumIntermediates;
1182 RegisterVT = MVT::i16;
1183 IntermediateVT = ScalarVT;
1184 NumIntermediates = NumElts;
1185 return NumIntermediates;
1189 RegisterVT = MVT::i32;
1190 IntermediateVT = ScalarVT;
1191 NumIntermediates = NumElts;
1192 return NumIntermediates;
1196 RegisterVT = MVT::i32;
1197 IntermediateVT = RegisterVT;
1198 NumIntermediates = NumElts * ((Size + 31) / 32);
1199 return NumIntermediates;
1204 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1209 unsigned MaxNumLanes) {
1210 assert(MaxNumLanes != 0);
1214 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1225 unsigned MaxNumLanes) {
1231 assert(ST->getNumContainedTypes() == 2 &&
1232 ST->getContainedType(1)->isIntegerTy(32));
1246 return MVT::amdgpuBufferFatPointer;
1248 DL.getPointerSizeInBits(AS) == 192)
1249 return MVT::amdgpuBufferStridedPointer;
1258 DL.getPointerSizeInBits(AS) == 160) ||
1260 DL.getPointerSizeInBits(AS) == 192))
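// The switch below appears to map the async LDS load/store and cooperative-atomic intrinsics to their per-lane transfer width (b8/b32/b64/b128).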
1267 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1268 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1269 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1271 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1272 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1273 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1274 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1275 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1326 unsigned IntrID) const {
1328 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1346 if (RsrcIntr->IsImage) {
1361 Info.ptrVal = RsrcArg;
1364 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1373 if (RsrcIntr->IsImage) {
1374 unsigned MaxNumLanes = 4;
1389 std::numeric_limits<unsigned>::max());
1399 if (RsrcIntr->IsImage) {
1420 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1422 Info.memVT = MVT::i32;
1429 case Intrinsic::amdgcn_raw_buffer_load_lds:
1430 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1431 case Intrinsic::amdgcn_struct_buffer_load_lds:
1432 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1438 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1439 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1440 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1441 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1444 std::numeric_limits<unsigned>::max());
1454 case Intrinsic::amdgcn_ds_ordered_add:
1455 case Intrinsic::amdgcn_ds_ordered_swap: {
1468 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1469 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1472 Info.ptrVal = nullptr;
1477 case Intrinsic::amdgcn_ds_append:
1478 case Intrinsic::amdgcn_ds_consume: {
1491 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1492 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1493 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1498 Info.memVT = MVT::i64;
1504 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1505 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1506 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1509 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1512 ->getElementType(0));
1520 case Intrinsic::amdgcn_global_atomic_fmin_num:
1521 case Intrinsic::amdgcn_global_atomic_fmax_num:
1522 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1523 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1524 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1534 case Intrinsic::amdgcn_flat_load_monitor_b32:
1535 case Intrinsic::amdgcn_flat_load_monitor_b64:
1536 case Intrinsic::amdgcn_flat_load_monitor_b128:
1537 case Intrinsic::amdgcn_global_load_monitor_b32:
1538 case Intrinsic::amdgcn_global_load_monitor_b64:
1539 case Intrinsic::amdgcn_global_load_monitor_b128:
1540 case Intrinsic::amdgcn_cluster_load_b32:
1541 case Intrinsic::amdgcn_cluster_load_b64:
1542 case Intrinsic::amdgcn_cluster_load_b128:
1543 case Intrinsic::amdgcn_ds_load_tr6_b96:
1544 case Intrinsic::amdgcn_ds_load_tr4_b64:
1545 case Intrinsic::amdgcn_ds_load_tr8_b64:
1546 case Intrinsic::amdgcn_ds_load_tr16_b128:
1547 case Intrinsic::amdgcn_global_load_tr6_b96:
1548 case Intrinsic::amdgcn_global_load_tr4_b64:
1549 case Intrinsic::amdgcn_global_load_tr_b64:
1550 case Intrinsic::amdgcn_global_load_tr_b128:
1551 case Intrinsic::amdgcn_ds_read_tr4_b64:
1552 case Intrinsic::amdgcn_ds_read_tr6_b96:
1553 case Intrinsic::amdgcn_ds_read_tr8_b64:
1554 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1562 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1563 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1564 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1572 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1573 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1574 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1582 case Intrinsic::amdgcn_ds_gws_init:
1583 case Intrinsic::amdgcn_ds_gws_barrier:
1584 case Intrinsic::amdgcn_ds_gws_sema_v:
1585 case Intrinsic::amdgcn_ds_gws_sema_br:
1586 case Intrinsic::amdgcn_ds_gws_sema_p:
1587 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1597 Info.memVT = MVT::i32;
1599 Info.align = Align(4);
1601 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1607 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1608 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1609 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1610 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1611 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1612 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1613 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1614 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1621 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1622 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1623 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1624 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1631 case Intrinsic::amdgcn_load_to_lds:
1632 case Intrinsic::amdgcn_global_load_lds: {
1643 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1644 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1645 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1646 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1656 Info.memVT = MVT::i32;
1658 Info.align = Align(4);
1663 case Intrinsic::amdgcn_s_prefetch_data:
1664 case Intrinsic::amdgcn_flat_prefetch:
1665 case Intrinsic::amdgcn_global_prefetch: {
1680 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1683 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1684 unsigned DstAS = I.getType()->getPointerAddressSpace();
1696 Type *&AccessTy) const {
1697 Value *Ptr = nullptr;
1698 switch (II->getIntrinsicID()) {
1699 case Intrinsic::amdgcn_cluster_load_b128:
1700 case Intrinsic::amdgcn_cluster_load_b64:
1701 case Intrinsic::amdgcn_cluster_load_b32:
1702 case Intrinsic::amdgcn_ds_append:
1703 case Intrinsic::amdgcn_ds_consume:
1704 case Intrinsic::amdgcn_ds_load_tr8_b64:
1705 case Intrinsic::amdgcn_ds_load_tr16_b128:
1706 case Intrinsic::amdgcn_ds_load_tr4_b64:
1707 case Intrinsic::amdgcn_ds_load_tr6_b96:
1708 case Intrinsic::amdgcn_ds_read_tr4_b64:
1709 case Intrinsic::amdgcn_ds_read_tr6_b96:
1710 case Intrinsic::amdgcn_ds_read_tr8_b64:
1711 case Intrinsic::amdgcn_ds_read_tr16_b64:
1712 case Intrinsic::amdgcn_ds_ordered_add:
1713 case Intrinsic::amdgcn_ds_ordered_swap:
1714 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1715 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1716 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1717 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1718 case Intrinsic::amdgcn_flat_load_monitor_b128:
1719 case Intrinsic::amdgcn_flat_load_monitor_b32:
1720 case Intrinsic::amdgcn_flat_load_monitor_b64:
1721 case Intrinsic::amdgcn_global_atomic_fmax_num:
1722 case Intrinsic::amdgcn_global_atomic_fmin_num:
1723 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1724 case Intrinsic::amdgcn_global_load_monitor_b128:
1725 case Intrinsic::amdgcn_global_load_monitor_b32:
1726 case Intrinsic::amdgcn_global_load_monitor_b64:
1727 case Intrinsic::amdgcn_global_load_tr_b64:
1728 case Intrinsic::amdgcn_global_load_tr_b128:
1729 case Intrinsic::amdgcn_global_load_tr4_b64:
1730 case Intrinsic::amdgcn_global_load_tr6_b96:
1731 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1732 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1733 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1734 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1735 Ptr = II->getArgOperand(0);
1737 case Intrinsic::amdgcn_load_to_lds:
1738 case Intrinsic::amdgcn_global_load_lds:
1739 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1740 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1741 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1742 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1743 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1744 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1745 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1746 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1747 Ptr = II->getArgOperand(1);
1752 AccessTy = II->getType();
1758 unsigned AddrSpace) const {
1759 if (!Subtarget->hasFlatInstOffsets()) {
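// Without flat-instruction offsets, presumably only a plain base register (no scale, no immediate offset) is an acceptable flat addressing mode.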
1770 return AM.Scale == 0 &&
1771 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1772 AM.BaseOffs, AddrSpace, FlatVariant));
1776 if (Subtarget->hasFlatGlobalInsts())
1779 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1792 return isLegalMUBUFAddressingMode(AM);
1795 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1806 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1818 if (AM.HasBaseReg) {
1850 return isLegalMUBUFAddressingMode(AM);
1852 if (!Subtarget->hasScalarSubwordLoads()) {
1857 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1905 return Subtarget->enableFlatScratch()
1907 : isLegalMUBUFAddressingMode(AM);
1954 unsigned Size, unsigned AddrSpace, Align Alignment,
1963 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1966 Align RequiredAlignment(
1968 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1969 Alignment < RequiredAlignment)
1984 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1990 RequiredAlignment = Align(4);
1992 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2008 *IsFast = (Alignment >= RequiredAlignment) ? 64
2009 : (Alignment < Align(4)) ? 32
2016 if (!Subtarget->hasDS96AndDS128())
2022 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2031 *IsFast = (Alignment >= RequiredAlignment) ? 96
2032 : (Alignment < Align(4)) ? 32
2039 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2045 RequiredAlignment = Align(8);
2047 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2056 *IsFast = (Alignment >= RequiredAlignment) ? 128
2057 : (Alignment < Align(4)) ? 32
2074 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2076 return Alignment >= RequiredAlignment ||
2077 Subtarget->hasUnalignedDSAccessEnabled();
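// Summary of the LDS/GDS path above: the access appears to be reported fast only when it meets the required alignment, and legal either when aligned or when unaligned DS access is enabled.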
2085 bool AlignedBy4 = Alignment >= Align(4);
2086 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2088 *IsFast = AlignedBy4 ? Size : 1;
2093 *IsFast = AlignedBy4;
2104 return Alignment >= Align(4) ||
2105 Subtarget->hasUnalignedBufferAccessEnabled();
2117 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2132 return Size >= 32 && Alignment >= Align(4);
2137 unsigned *IsFast) const {
2139 Alignment, Flags, IsFast);
2144 const AttributeList &FuncAttributes) const {
2150 if (Op.size() >= 16 &&
2154 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2172 unsigned DestAS) const {
2175 Subtarget->hasGloballyAddressableScratch()) {
2205 unsigned Index) const {
2221 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2256 auto [InputPtrReg, RC, ArgTy] =
2266 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2272 const SDLoc &SL) const {
2279 const SDLoc &SL) const {
2282 std::optional<uint32_t> KnownSize =
2284 if (KnownSize.has_value())
2310 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2319 SDValue SITargetLowering::lowerKernargMemParameter(
2324 MachinePointerInfo PtrInfo =
2333 int64_t OffsetDiff = Offset - AlignDownOffset;
2339 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2350 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2355 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2360 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2369 const SDLoc &SL) const {
2438 ExtType, SL, VA.getLocVT(), Chain, FIN,
2441 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2442 if (ConvertedVal == ArgValue)
2443 return ConvertedVal;
2448 SDValue SITargetLowering::lowerWorkGroupId(
2453 if (!Subtarget->hasClusters())
2454 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2462 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2463 SDLoc SL(ClusterIdXYZ);
2464 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2467 SDValue ClusterWorkGroupIdXYZ =
2468 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2478 return ClusterIdXYZ;
2480 using namespace AMDGPU::Hwreg;
2484 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2495 SDValue SITargetLowering::getPreloadedValue(
2498 const ArgDescriptor *Reg = nullptr;
2499 const TargetRegisterClass *RC;
2503 const ArgDescriptor WorkGroupIDX =
2511 const ArgDescriptor WorkGroupIDZ =
2513 const ArgDescriptor ClusterWorkGroupIDX =
2515 const ArgDescriptor ClusterWorkGroupIDY =
2517 const ArgDescriptor ClusterWorkGroupIDZ =
2519 const ArgDescriptor ClusterWorkGroupMaxIDX =
2521 const ArgDescriptor ClusterWorkGroupMaxIDY =
2523 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2525 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2528 auto LoadConstant = [&](unsigned N) {
2532 if (Subtarget->hasArchitectedSGPRs() &&
2539 Reg = &WorkGroupIDX;
2540 RC = &AMDGPU::SReg_32RegClass;
2544 Reg = &WorkGroupIDY;
2545 RC = &AMDGPU::SReg_32RegClass;
2549 Reg = &WorkGroupIDZ;
2550 RC = &AMDGPU::SReg_32RegClass;
2554 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2555 return LoadConstant(0);
2556 Reg = &ClusterWorkGroupIDX;
2557 RC = &AMDGPU::SReg_32RegClass;
2561 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2562 return LoadConstant(0);
2563 Reg = &ClusterWorkGroupIDY;
2564 RC = &AMDGPU::SReg_32RegClass;
2568 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2569 return LoadConstant(0);
2570 Reg = &ClusterWorkGroupIDZ;
2571 RC = &AMDGPU::SReg_32RegClass;
2576 return LoadConstant(ClusterDims.getDims()[0] - 1);
2577 Reg = &ClusterWorkGroupMaxIDX;
2578 RC = &AMDGPU::SReg_32RegClass;
2583 return LoadConstant(ClusterDims.getDims()[1] - 1);
2584 Reg = &ClusterWorkGroupMaxIDY;
2585 RC = &AMDGPU::SReg_32RegClass;
2590 return LoadConstant(ClusterDims.getDims()[2] - 1);
2591 Reg = &ClusterWorkGroupMaxIDZ;
2592 RC = &AMDGPU::SReg_32RegClass;
2596 Reg = &ClusterWorkGroupMaxFlatID;
2597 RC = &AMDGPU::SReg_32RegClass;
2628 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2632 "vector type argument should have been split");
2637 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2645 "unexpected vector split in ps argument type");
2659 Info->markPSInputAllocated(PSInputNum);
2661 Info->markPSInputEnabled(PSInputNum);
2677 if (Info.hasWorkItemIDX()) {
2683 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2687 if (Info.hasWorkItemIDY()) {
2688 assert(Info.hasWorkItemIDX());
2689 if (Subtarget->hasPackedTID()) {
2690 Info.setWorkItemIDY(
2693 unsigned Reg = AMDGPU::VGPR1;
2701 if (Info.hasWorkItemIDZ()) {
2702 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2703 if (Subtarget->hasPackedTID()) {
2704 Info.setWorkItemIDZ(
2707 unsigned Reg = AMDGPU::VGPR2;
2727 if (RegIdx == ArgVGPRs.size()) {
2734 unsigned Reg = ArgVGPRs[RegIdx];
2746 unsigned NumArgRegs) {
2749 if (RegIdx == ArgSGPRs.size())
2752 unsigned Reg = ArgSGPRs[RegIdx];
2794 const unsigned Mask = 0x3ff;
2797 if (Info.hasWorkItemIDX()) {
2799 Info.setWorkItemIDX(Arg);
2802 if (Info.hasWorkItemIDY()) {
2804 Info.setWorkItemIDY(Arg);
2807 if (Info.hasWorkItemIDZ())
2819 const unsigned Mask = 0x3ff;
2828 auto &ArgInfo = Info.getArgInfo();
2840 if (Info.hasImplicitArgPtr())
2848 if (Info.hasWorkGroupIDX())
2851 if (Info.hasWorkGroupIDY())
2854 if (Info.hasWorkGroupIDZ())
2857 if (Info.hasLDSKernelId())
2868 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2869 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2875 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2876 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2881 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2882 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2888 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2894 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2903 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2908 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2909 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2914 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2915 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2930 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
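// The loop below appears to walk the kernel's formal arguments in order and mark 'inreg' ones for SGPR preloading, stopping at the first argument that breaks the preload sequence.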
2932 bool InPreloadSequence = true;
2934 bool AlignedForImplictArgs = false;
2935 unsigned ImplicitArgOffset = 0;
2936 for (auto &Arg : F.args()) {
2937 if (!InPreloadSequence || !Arg.hasInRegAttr())
2940 unsigned ArgIdx = Arg.getArgNo();
2943 if (InIdx < Ins.size() &&
2944 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx)
2947 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2948 Ins[InIdx].getOrigArgIndex() == ArgIdx;
2950 assert(ArgLocs[ArgIdx].isMemLoc());
2951 auto &ArgLoc = ArgLocs[InIdx];
2953 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2955 unsigned NumAllocSGPRs =
2956 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2959 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
2960 if (!AlignedForImplictArgs) {
2962 alignTo(LastExplicitArgOffset,
2963 Subtarget->getAlignmentForImplicitArgPtr()) -
2964 LastExplicitArgOffset;
2965 AlignedForImplictArgs = true;
2967 ArgOffset += ImplicitArgOffset;
2971 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2972 assert(InIdx >= 1 && "No previous SGPR");
2973 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2974 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2978 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2979 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2982 InPreloadSequence = false;
2988 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2990 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2992 if (PreloadRegs->size() > 1)
2993 RC = &AMDGPU::SGPR_32RegClass;
2994 for (auto &Reg : *PreloadRegs) {
3000 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3009 if (Info.hasLDSKernelId()) {
3010 Register Reg = Info.addLDSKernelId();
3011 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3020 bool IsShader) const {
3021 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3022 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
3028 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3030 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3034 unsigned NumRequiredSystemSGPRs =
3035 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3036 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3037 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3038 Register Reg = Info.addReservedUserSGPR();
3039 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3044 if (!HasArchitectedSGPRs) {
3045 if (Info.hasWorkGroupIDX()) {
3046 Register Reg = Info.addWorkGroupIDX();
3047 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3051 if (Info.hasWorkGroupIDY()) {
3052 Register Reg = Info.addWorkGroupIDY();
3053 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3057 if (Info.hasWorkGroupIDZ()) {
3058 Register Reg = Info.addWorkGroupIDZ();
3059 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3064 if (Info.hasWorkGroupInfo()) {
3065 Register Reg = Info.addWorkGroupInfo();
3066 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3070 if (Info.hasPrivateSegmentWaveByteOffset()) {
3072 unsigned PrivateSegmentWaveByteOffsetReg;
3075 PrivateSegmentWaveByteOffsetReg =
3076 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3080 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3082 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3085 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3087 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3088 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3091 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
3092 Info.getNumPreloadedSGPRs() >= 16);
3107 if (HasStackObjects)
3108 Info.setHasNonSpillStackObjects(true);
3113 HasStackObjects = true;
3117 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3119 if (!ST.enableFlatScratch()) {
3120 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3127 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3129 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3139 Info.setScratchRSrcReg(ReservedBufferReg);
3158 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3159 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3166 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3167 if (!MRI.isLiveIn(Reg)) {
3168 Info.setStackPtrOffsetReg(Reg);
3173 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3180 if (ST.getFrameLowering()->hasFP(MF)) {
3181 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3197 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3206 if (AMDGPU::SReg_64RegClass.contains(*I))
3207 RC = &AMDGPU::SGPR_64RegClass;
3208 else if (AMDGPU::SReg_32RegClass.contains(*I))
3209 RC = &AMDGPU::SGPR_32RegClass;
3215 Entry->addLiveIn(*I);
3220 for (auto *Exit : Exits)
3222 TII->get(TargetOpcode::COPY), *I)
3237 bool IsError = false;
3241 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3259 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3260 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3262 if (!Subtarget->enableFlatScratch())
3267 !Subtarget->hasArchitectedSGPRs())
3268 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3269 !Info->hasWorkGroupIDZ());
3272 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3290 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3291 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3294 Info->markPSInputAllocated(0);
3295 Info->markPSInputEnabled(0);
3297 if (Subtarget->isAmdPalOS()) {
3306 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3307 if ((PsInputBits & 0x7F) == 0 ||
3308 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3311 } else if (IsKernel) {
3312 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3324 if (IsKernel && Subtarget->hasKernargPreload())
3328 } else if (!IsGraphics) {
3333 if (!Subtarget->enableFlatScratch())
3345 Info->setNumWaveDispatchSGPRs(
3347 Info->setNumWaveDispatchVGPRs(
3349 } else if (Info->getNumKernargPreloadedSGPRs()) {
3350 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3355 if (IsWholeWaveFunc) {
3357 {MVT::i1, MVT::Other}, Chain);
3369 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3380 if (IsEntryFunc && VA.isMemLoc()) {
3403 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3407 int64_t OffsetDiff = Offset - AlignDownOffset;
3414 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3425 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3426 Ins[i].Flags.isSExt(), &Ins[i]);
3434 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3437 if (PreloadRegs.size() == 1) {
3438 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3443 TRI->getRegSizeInBits(*RC)));
3451 for (auto Reg : PreloadRegs) {
3458 PreloadRegs.size()),
3475 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3476 Ins[i].Flags.isSExt(), &Ins[i]);
3488 "hidden argument in kernel signature was not preloaded",
3494 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3495 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3515 if (!IsEntryFunc && VA.isMemLoc()) {
3516 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3527 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3528 RC = &AMDGPU::VGPR_32RegClass;
3529 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3530 RC = &AMDGPU::SGPR_32RegClass;
3550 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3559 auto &ArgUsageInfo =
3562 } else if (auto *MFAM = DAG.getMFAM()) {
3564 auto *ArgUsageInfo =
3566 .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
3568 ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());
3572 Info->setBytesInStackArgArea(StackArgSize);
3574 return Chains.empty() ? Chain
3583 const Type *RetTy) const {
3591 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3596 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3597 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3598 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3599 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3622 Info->setIfReturnsVoid(Outs.empty());
3623 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3642 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3643 ++I, ++RealRVLocIdx) {
3647 SDValue Arg = OutVals[RealRVLocIdx];
3670 ReadFirstLane, Arg);
3677 if (!Info->isEntryFunction()) {
3683 if (AMDGPU::SReg_64RegClass.contains(*I))
3685 else if (AMDGPU::SReg_32RegClass.contains(*I))
3698 unsigned Opc = AMDGPUISD::ENDPGM;
3700 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3701 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3702 : AMDGPUISD::RET_GLUE;
3784 auto &ArgUsageInfo =
3787 &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
3788 } else if (auto *MFAM = DAG.getMFAM()) {
3790 auto *ArgUsageInfo =
3795 CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);
3823 const auto [OutgoingArg, ArgRC, ArgTy] =
3828 const auto [IncomingArg, IncomingArgRC, Ty] =
3830 assert(IncomingArgRC == ArgRC);
3833 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3841 InputReg = getImplicitArgPtr(DAG, DL);
3843 std::optional<uint32_t> Id =
3845 if (Id.has_value()) {
3856 if (OutgoingArg->isRegister()) {
3857 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3858 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3861 unsigned SpecialArgOffset =
3872 auto [OutgoingArg, ArgRC, Ty] =
3875 std::tie(OutgoingArg, ArgRC, Ty) =
3878 std::tie(OutgoingArg, ArgRC, Ty) =
3893 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3894 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3895 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3900 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3908 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3918 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3927 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3928 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3939 : IncomingArgY ? *IncomingArgY
3946 if (OutgoingArg->isRegister()) {
3948 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
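// The checks that follow appear to gate tail-call lowering; for example, a divergent callee address presumably rules out a tail call.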
3974 if (Callee->isDivergent())
3981 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3985 if (!CallerPreserved)
3988 bool CCMatch = CallerCC == CalleeCC;
4001 if (Arg.hasByValAttr())
4015 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4016 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4025 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4038 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4040 if (!CCVA.isRegLoc())
4045 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4047 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4071 enum ChainCallArgIdx {
4093 bool UsesDynamicVGPRs = false;
4094 if (IsChainCallConv) {
4099 auto RequestedExecIt =
4101 return Arg.OrigArgIndex == 2;
4103 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4105 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4108 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4111 "Haven't popped all the special args");
4114 CLI.Args[ChainCallArgIdx::Exec];
4115 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4123 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4125 ChainCallSpecialArgs.push_back(Arg.Node);
4128 PushNodeOrTargetConstant(RequestedExecArg);
4134 if (FlagsValue.isZero()) {
4135 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4137 "no additional args allowed if flags == 0");
4139 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4143 if (!Subtarget->isWave32()) {
4145 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4148 UsesDynamicVGPRs = true;
4149 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4150 CLI.Args.end(), PushNodeOrTargetConstant);
4159 bool IsSibCall = false;
4173 "unsupported call to variadic function ");
4181 "unsupported required tail call to function ");
4186 Outs, OutVals, Ins, DAG);
4190 "site marked musttail or on llvm.amdgcn.cs.chain");
4197 if (!TailCallOpt && IsTailCall)
4237 auto *TRI = Subtarget->getRegisterInfo();
4244 if (!IsSibCall || IsChainCallConv) {
4245 if (!Subtarget->enableFlatScratch()) {
4251 RegsToPass.emplace_back(IsChainCallConv
4252 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4253 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4260 const unsigned NumSpecialInputs = RegsToPass.size();
4262 MVT PtrVT = MVT::i32;
4265 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4293 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4301 int32_t Offset = LocMemOffset;
4308 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4314 ? Flags.getNonZeroByValAlign()
4341 if (Outs[i].Flags.isByVal()) {
4343 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4346 Outs[i].Flags.getNonZeroByValAlign(),
4348 nullptr, std::nullopt, DstInfo,
4354 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4360 if (!MemOpChains.empty())
4376 unsigned ArgIdx = 0;
4377 for (auto [Reg, Val] : RegsToPass) {
4378 if (ArgIdx++ >= NumSpecialInputs &&
4379 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4405 if (IsTailCall && !IsSibCall) {
4410 std::vector<SDValue> Ops({Chain});
4416 Ops.push_back(Callee);
4433 Ops.push_back(Callee);
4444 if (IsChainCallConv)
4449 for (auto &[Reg, Val] : RegsToPass)
4453 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4454 assert(Mask && "Missing call preserved mask for calling convention");
4464 MVT::Glue, GlueOps),
4469 Ops.push_back(InGlue);
4475 unsigned OPC = AMDGPUISD::TC_RETURN;
4478 OPC = AMDGPUISD::TC_RETURN_GFX;
4482 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4483 : AMDGPUISD::TC_RETURN_CHAIN;
4489 if (Info->isWholeWaveFunction())
4490 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4497 Chain = Call.getValue(0);
4498 InGlue = Call.getValue(1);
4500 uint64_t CalleePopBytes = NumBytes;
4521 EVT VT = Op.getValueType();
4535 "Stack grows upwards for AMDGPU");
4537 Chain = BaseAddr.getValue(1);
4539 if (Alignment > StackAlign) {
4541 << Subtarget->getWavefrontSizeLog2();
4542 uint64_t StackAlignMask = ScaledAlignment - 1;
4549 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4555 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4566 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4582 if (Op.getValueType() != MVT::i32)
4601 assert(Op.getValueType() == MVT::i32);
4610 Op.getOperand(0), IntrinID, GetRoundBothImm);
4644 SDValue RoundModeTimesNumBits =
4664 TableEntry, EnumOffset);
4680 static_cast<uint32_t>(ConstMode->getZExtValue()),
4692 if (UseReducedTable) {
4698 SDValue RoundModeTimesNumBits =
4718 SDValue RoundModeTimesNumBits =
4727 NewMode = TruncTable;
4736 ReadFirstLaneID, NewMode);
4749 IntrinID, RoundBothImm, NewMode);
4755 if (Op->isDivergent() &&
4756 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4766 if (Subtarget->hasSafeSmemPrefetch())
4774 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4783 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4784 EVT SrcVT = Src.getValueType();
4793 EVT DstVT = Op.getValueType();
4802 if (Op.getValueType() != MVT::i64)
4816 Op.getOperand(0), IntrinID, ModeHwRegImm);
4818 Op.getOperand(0), IntrinID, TrapHwRegImm);
4832 if (Op.getOperand(1).getValueType() != MVT::i64)
4844 ReadFirstLaneID, NewModeReg);
4846 ReadFirstLaneID, NewTrapReg);
4848 unsigned ModeHwReg =
4851 unsigned TrapHwReg =
4859 IntrinID, ModeHwRegImm, NewModeReg);
4862 IntrinID, TrapHwRegImm, NewTrapReg);
4871 .Case("m0", AMDGPU::M0)
4872 .Case("exec", AMDGPU::EXEC)
4873 .Case("exec_lo", AMDGPU::EXEC_LO)
4874 .Case("exec_hi", AMDGPU::EXEC_HI)
4875 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4876 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4877 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4882 if (!Subtarget->hasFlatScrRegister() &&
4883 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4885 "\" for subtarget."));
4890 case AMDGPU::EXEC_LO:
4891 case AMDGPU::EXEC_HI:
4892 case AMDGPU::FLAT_SCR_LO:
4893 case AMDGPU::FLAT_SCR_HI:
4898 case AMDGPU::FLAT_SCR:
4917 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4926 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4948 auto Next = std::next(I);
4959 MBB.addSuccessor(LoopBB);
4961 return std::pair(LoopBB, RemainderBB);
4968 auto I = MI.getIterator();
4969 auto E = std::next(I);
4991 Src->setIsKill(false);
5001 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5007 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5010 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5034 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5035 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5045 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5046 Register NewExec = MRI.createVirtualRegister(BoolRC);
5048 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5049 Register CondReg = MRI.createVirtualRegister(BoolRC);
5057 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5064 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5068 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5076 MRI.setSimpleHint(NewExec, CondReg);
5078 if (UseGPRIdxMode) {
5080 SGPRIdxReg = CurrentIdxReg;
5082 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5083 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5093 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5124 unsigned InitResultReg, unsigned PhiReg, int Offset,
5125 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5133 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5135 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5136 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5152 InitResultReg, DstReg, PhiReg, TmpExec,
5153 Offset, UseGPRIdxMode, SGPRIdxReg);
5159 LoopBB->removeSuccessor(RemainderBB);
5161 LoopBB->addSuccessor(LandingPad);
5172 static std::pair<unsigned, int>
5176 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5181 return std::pair(AMDGPU::sub0, Offset);
5221 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5238 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5239 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5248 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
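// Two strategies for a dynamic vector index appear to be supported below: GPR index mode on subtargets that have it, and V_MOVRELS/V_MOVRELD addressing through M0 otherwise.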
5251 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5255 if (UseGPRIdxMode) {
5262 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5275 MI.eraseFromParent();
5284 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5285 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5291 UseGPRIdxMode, SGPRIdxReg);
5295 if (UseGPRIdxMode) {
5297 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5299 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5304 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5309 MI.eraseFromParent();
5326 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5336 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5338 if (Idx->getReg() == AMDGPU::NoRegister) {
5349 MI.eraseFromParent();
5354 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5358 if (UseGPRIdxMode) {
5362 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5371 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5372 TRI.getRegSizeInBits(*VecRC), 32, false);
5378 MI.eraseFromParent();
5388 Register PhiReg = MRI.createVirtualRegister(VecRC);
5392 UseGPRIdxMode, SGPRIdxReg);
5395 if (UseGPRIdxMode) {
5397 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5399 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5405 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5406 TRI.getRegSizeInBits(*VecRC), 32, false);
5407 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5413 MI.eraseFromParent();
5429 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5430 if (ST.hasScalarAddSub64()) {
5431 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
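// Without a native 64-bit scalar add/sub, the operands are presumably split into 32-bit halves below and combined with S_ADD_U32/S_ADDC_U32 (or the SUB/SUBB pair).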
5441 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5442 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5445 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5447 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5450 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5452 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5454 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5455 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5464 MI.eraseFromParent();
5470 case AMDGPU::S_MIN_U32:
5471 return std::numeric_limits<uint32_t>::max();
5472 case AMDGPU::S_MIN_I32:
5473 return std::numeric_limits<int32_t>::max();
5474 case AMDGPU::S_MAX_U32:
5475 return std::numeric_limits<uint32_t>::min();
5476 case AMDGPU::S_MAX_I32:
5477 return std::numeric_limits<int32_t>::min();
5478 case AMDGPU::V_ADD_F32_e64:
5480 case AMDGPU::V_SUB_F32_e64:
5482 case AMDGPU::S_ADD_I32:
5483 case AMDGPU::S_SUB_I32:
5484 case AMDGPU::S_OR_B32:
5485 case AMDGPU::S_XOR_B32:
5486 return std::numeric_limits<uint32_t>::min();
5487 case AMDGPU::S_AND_B32:
5488 return std::numeric_limits<uint32_t>::max();
5489 case AMDGPU::V_MIN_F32_e64:
5490 case AMDGPU::V_MAX_F32_e64:
5494 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5500 case AMDGPU::V_CMP_LT_U64_e64:
5501 return std::numeric_limits<uint64_t>::max();
5502 case AMDGPU::V_CMP_LT_I64_e64:
5503 return std::numeric_limits<int64_t>::max();
5504 case AMDGPU::V_CMP_GT_U64_e64:
5505 return std::numeric_limits<uint64_t>::min();
5506 case AMDGPU::V_CMP_GT_I64_e64:
5507 return std::numeric_limits<int64_t>::min();
5508 case AMDGPU::S_ADD_U64_PSEUDO:
5509 case AMDGPU::S_SUB_U64_PSEUDO:
5510 case AMDGPU::S_OR_B64:
5511 case AMDGPU::S_XOR_B64:
5512 return std::numeric_limits<uint64_t>::min();
5513 case AMDGPU::S_AND_B64:
5514 return std::numeric_limits<uint64_t>::max();
5517 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
5522 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5523 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5524 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5525 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5526 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5527 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5528 Opc == AMDGPU::V_SUB_F32_e64;
5532 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5533 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;
5547 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5552 case AMDGPU::S_MIN_U32:
5553 case AMDGPU::S_MIN_I32:
5554 case AMDGPU::V_MIN_F32_e64:
5555 case AMDGPU::S_MAX_U32:
5556 case AMDGPU::S_MAX_I32:
5557 case AMDGPU::V_MAX_F32_e64:
5558 case AMDGPU::S_AND_B32:
5559 case AMDGPU::S_OR_B32: {
5565 case AMDGPU::V_CMP_LT_U64_e64:
5566 case AMDGPU::V_CMP_LT_I64_e64:
5567 case AMDGPU::V_CMP_GT_U64_e64:
5568 case AMDGPU::V_CMP_GT_I64_e64:
5569 case AMDGPU::S_AND_B64:
5570 case AMDGPU::S_OR_B64: {
5576 case AMDGPU::S_XOR_B32:
5577 case AMDGPU::S_XOR_B64:
5578 case AMDGPU::S_ADD_I32:
5579 case AMDGPU::S_ADD_U64_PSEUDO:
5580 case AMDGPU::V_ADD_F32_e64:
5581 case AMDGPU::S_SUB_I32:
5582 case AMDGPU::S_SUB_U64_PSEUDO:
5583 case AMDGPU::V_SUB_F32_e64: {
5586 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5588 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5590 bool IsWave32 = ST.isWave32();
5591 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5592 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5593 unsigned BitCountOpc =
5594 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5598 auto NewAccumulator =
5603 case AMDGPU::S_XOR_B32:
5604 case AMDGPU::S_XOR_B64: {
5610 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5613 .addReg(NewAccumulator->getOperand(0).getReg())
5616 if (Opc == AMDGPU::S_XOR_B32) {
5622 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5624 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5628 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5631 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5633 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5643 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5651 case AMDGPU::S_SUB_I32: {
5652 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5660 .addReg(NewAccumulator->getOperand(0).getReg());
5663 case AMDGPU::S_ADD_I32: {
5666 .addReg(NewAccumulator->getOperand(0).getReg());
5669 case AMDGPU::S_ADD_U64_PSEUDO:
5670 case AMDGPU::S_SUB_U64_PSEUDO: {
5671 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5672 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5674 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5676 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5677 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5678 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5680 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5682 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5686 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5689 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5691 MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5693 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5696 .addReg(NewAccumulator->getOperand(0).getReg())
5706 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5708 : NewAccumulator->getOperand(0).getReg();
5719 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5725 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5731 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5738 case AMDGPU::V_ADD_F32_e64:
5739 case AMDGPU::V_SUB_F32_e64: {
5741 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5742 Register DstVreg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5746 .addReg(NewAccumulator->getOperand(0).getReg())
5751 unsigned srcMod = Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5759 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5788 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
5789 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
5790 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
5791 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5792 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
5793 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5794 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
5796 bool IsWave32 = ST.isWave32();
5797 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5798 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
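// The reduction loop built below appears to work lane by lane: find the lowest set EXEC bit (S_FF1), read that lane's value (V_READLANE_B32), fold it into the accumulator, clear the bit (S_BITSET0), and branch back while any active bits remain.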
5805 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
5809 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
5818 I = ComputeLoop->begin();
5820 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
5824 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
5828 I = ComputeLoop->end();
5831 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
5835 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5841 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5842 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
5844 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
5854 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5855 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5864 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5866 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5867 Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5870 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5872 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
5874 MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
5876 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5880 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
5884 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
5885 TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
5891 case AMDGPU::S_OR_B64:
5892 case AMDGPU::S_AND_B64:
5893 case AMDGPU::S_XOR_B64: {
5896 .addReg(LaneValue->getOperand(0).getReg())
5900 case AMDGPU::V_CMP_GT_I64_e64:
5901 case AMDGPU::V_CMP_GT_U64_e64:
5902 case AMDGPU::V_CMP_LT_I64_e64:
5903 case AMDGPU::V_CMP_LT_U64_e64: {
5904 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
5906 MRI.createVirtualRegister(WaveMaskRegClass);
5909 TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
5910 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
5913 VregClass, AMDGPU::sub0, VSubRegClass);
5916 VregClass, AMDGPU::sub1, VSubRegClass);
5917 BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
5924 .addReg(LaneValue->getOperand(0).getReg())
5925 .addReg(AccumulatorVReg);
5927 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5928 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
5932 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
5933 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
5934 .addReg(LaneValue->getOperand(0).getReg())
5938 case AMDGPU::S_ADD_U64_PSEUDO:
5939 case AMDGPU::S_SUB_U64_PSEUDO: {
5942 .addReg(LaneValue->getOperand(0).getReg());
5949 unsigned BITSETOpc =
5950 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
5951 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
5957 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
5960 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
5962 .addReg(NewActiveBitsReg)
5964 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5969 MI.eraseFromParent();
5984 switch (MI.getOpcode()) {
5985 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5987 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5989 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5991 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5993 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
5995 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5997 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5999 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6001 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6003 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6005 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6007 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6009 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6011 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6013 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6015 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6017 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6019 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6021 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6023 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6025 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6027 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6029 case AMDGPU::S_UADDO_PSEUDO:
6030 case AMDGPU::S_USUBO_PSEUDO: {
6036 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6038 : AMDGPU::S_SUB_U32;
6046 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6049 MI.eraseFromParent();
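  // S_UADDO/S_USUBO produce a 32-bit result plus an overflow flag taken from
  // SCC via S_CSELECT; the 64-bit add/sub pseudos below either use the native
  // 64-bit VALU instruction when the subtarget has it, or are split into a
  // carry-producing low half and a carry-consuming high half.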
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (ST.hasAddSubU64Insts()) {
      TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
        : &AMDGPU::VReg_64RegClass;
        : &AMDGPU::VReg_64RegClass;
    TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
    MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
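  // S_ADD_CO/S_SUB_CO take a carry-in lane mask; operands that ended up in
  // VGPRs are copied back to SGPRs with V_READFIRSTLANE, and the carry-in is
  // compared against zero to set SCC before S_ADDC_U32/S_SUBB_U32.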
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
        TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
        MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
        MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
    TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
    TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
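  // All SI_INDIRECT_SRC/DST widths share the same dynamic-index expansion
  // (emitIndirectSrc/emitIndirectDst); the KILL pseudos are handled by
  // splitKillBlock.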
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V3:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V5:
  case AMDGPU::SI_INDIRECT_SRC_V6:
  case AMDGPU::SI_INDIRECT_SRC_V7:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V3:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V5:
  case AMDGPU::SI_INDIRECT_DST_V6:
  case AMDGPU::SI_INDIRECT_DST_V7:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
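  // A 64-bit conditional select has no single instruction, so it is split
  // into two V_CNDMASK_B32 selects over the sub0/sub1 halves that share one
  // copy of the condition mask.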
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
        : &AMDGPU::VReg_64RegClass;
        : &AMDGPU::VReg_64RegClass;
    TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
    MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
        MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
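// The hooks below answer type and cost queries from the generic combiner:
// preferred types for narrow operations and whether forming fma/fmad is
// profitable for a given type on this subtarget.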
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
  DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
      : std::pair(Op0, Op0);
  DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
  DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
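// LowerOperation dispatches every custom-marked opcode to its dedicated
// lowering routine; the split*VectorOp helpers above are used by several of
// them to halve wide vector operations into legal pieces.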
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerExternalSymbol(Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
        Op.getValueType() == MVT::i16 &&
        Op.getOperand(0).getValueType() == MVT::f32) {
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
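// For d16 memory intrinsics the requested result type is not always legal
// for the hardware load; adjustLoadValueType emits the load with an
// equivalent legal type and converts the result back afterwards.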
  EVT FittingLoadVT = LoadVT;
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
      M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
                 : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
                          : AMDGPUISD::BUFFER_LOAD;
    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
      M->getMemOperand(), DAG);
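// amdgcn.icmp/amdgcn.fcmp are lowered to AMDGPUISD::SETCC, which yields a
// wave-wide lane mask; 16-bit operands are promoted first when i16/f16 is
// not a legal type on the subtarget.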
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
  EVT VT = N->getValueType(0);
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
    std::reverse(Operands.begin(), Operands.end());
    if (SDNode *GL = N->getGluedNode()) {
      GL = GL->getOperand(0).getNode();
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
    EVT VT = N->getValueType(0);
    unsigned NumOperands = N->getNumOperands();
    SDNode *GL = N->getGluedNode();
    for (unsigned i = 0; i != NE; ++i) {
      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
        SDValue Operand = N->getOperand(j);
        Operands[j] = Operand;
      Operands[NumOperands - 1] =
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
    if (IID == Intrinsic::amdgcn_writelane)
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
  EVT VT = N->getValueType(0);
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
    Operands.append(IntrinArgs);
  SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                   {ShiftedIndex, ValueI32});
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
      DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
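// The ds_bpermute-based shuffle above only exchanges data within 32-lane
// halves; on wave64 a permlane64 swap plus a second bpermute covers the other
// half, and the mbcnt/setcc result picks which value each lane keeps.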
  switch (N->getOpcode()) {
    unsigned IID = N->getConstantOperandVal(0);
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
    case Intrinsic::amdgcn_cvt_pkrtz: {
          DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        Opcode = AMDGPUISD::CVT_PK_I16_I32;
        Opcode = AMDGPUISD::CVT_PK_U16_U32;
      EVT VT = N->getValueType(0);
    case Intrinsic::amdgcn_s_buffer_load: {
      if (!Subtarget->hasScalarSubwordLoads())
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      if (!Offset->isDivergent()) {
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
    case Intrinsic::amdgcn_dead: {
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
    Results.push_back(Res.getValue(1));
    EVT VT = N->getValueType(0);
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;
    if (NewVT != SelectVT)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::v2f16)
    if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  case Intrinsic::amdgcn_if:
    return AMDGPUISD::IF;
  case Intrinsic::amdgcn_else:
    return AMDGPUISD::ELSE;
  case Intrinsic::amdgcn_loop:
    return AMDGPUISD::LOOP;
  case Intrinsic::amdgcn_end_cf:
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
  SDNode *Intr = BRCOND.getOperand(1).getNode();
    Intr = LHS.getNode();
  assert(BR && "brcond missing unconditional branch user");
  unsigned CFNode = isCFIntrinsic(Intr);
    Ops.push_back(Target);
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction())
  return Op.getValueType().bitsLE(VT)
  EVT DstVT = Op.getValueType();
  unsigned Opc = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
    assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  if (DstVT == MVT::f16) {
    if (!Subtarget->has16BitInsts()) {
      if (Op->getFlags().hasApproximateFuncs()) {
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
  EVT VT = Op.getValueType();
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
                     {Op.getOperand(0), Op.getOperand(1), TruncExp});
  switch (Op->getOpcode()) {
                                             DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
      : Op->getOperand(0).getValueType();
  auto &DAG = DCI.DAG;
  if (DCI.isBeforeLegalizeOps() ||
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  if (MagVT == SignVT)
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
    const APInt &C = RHSC->getAPIntValue();
    if (C.isPowerOf2()) {
      bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {
  if (Subtarget->hasSMulHi()) {
  if (!Subtarget->isTrapHandlerEnabled() ||
    return lowerTrapEndpgm(Op, DAG);
  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
  return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
  MachinePointerInfo PtrInfo =
  loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
  if (Subtarget->hasPrivEnabledTrap2NopBug())
    return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
  if (!Subtarget->isTrapHandlerEnabled() ||
      "debugtrap handler not supported",
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
8310SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8312 if (Subtarget->hasApertureRegs()) {
8314 ? AMDGPU::SRC_SHARED_BASE
8315 : AMDGPU::SRC_PRIVATE_BASE;
8316 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8317 !Subtarget->hasGloballyAddressableScratch()) &&
8318 "Cannot use src_private_base with globally addressable scratch!");
8339 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8343 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8345 if (UserSGPR == AMDGPU::NoRegister) {
8390 const AMDGPUTargetMachine &TM =
8393 unsigned DestAS, SrcAS;
8395 bool IsNonNull =
false;
8397 SrcAS = ASC->getSrcAddressSpace();
8398 Src = ASC->getOperand(0);
8399 DestAS = ASC->getDestAddressSpace();
8402 Op.getConstantOperandVal(0) ==
8403 Intrinsic::amdgcn_addrspacecast_nonnull);
8404 Src =
Op->getOperand(1);
8405 SrcAS =
Op->getConstantOperandVal(2);
8406 DestAS =
Op->getConstantOperandVal(3);
8419 Subtarget->hasGloballyAddressableScratch()) {
8424 AMDGPU::S_MOV_B32, SL, MVT::i32,
8425 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8433 unsigned NullVal = TM.getNullPointerValue(DestAS);
8448 Subtarget->hasGloballyAddressableScratch()) {
8457 if (Subtarget->isWave64())
8463 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8471 AMDGPU::S_MOV_B64, SL, MVT::i64,
8472 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8474 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8476 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8484 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8496 Op.getValueType() == MVT::i64) {
8497 const SIMachineFunctionInfo *
Info =
8499 if (
Info->get32BitAddressHighBits() == 0)
8508 Src.getValueType() == MVT::i64)
8536 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8541 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8543 MVT::i32, InsNumElts / 2);
8548 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8550 if (InsNumElts == 2) {
8563 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8586 if (NumElts == 4 && EltSize == 16 && KIdx) {
8597 unsigned Idx = KIdx->getZExtValue();
8598 bool InsertLo = Idx < 2;
8602 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8608 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8621 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8656 EVT ResultVT =
Op.getValueType();
8669 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8672 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8676 if (VecSize == 128) {
8684 }
else if (VecSize == 256) {
8687 for (
unsigned P = 0;
P < 4; ++
P) {
8693 Parts[0], Parts[1]));
8695 Parts[2], Parts[3]));
8701 for (
unsigned P = 0;
P < 8; ++
P) {
8708 Parts[0], Parts[1], Parts[2], Parts[3]));
8711 Parts[4], Parts[5], Parts[6], Parts[7]));
8731 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8746 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8756 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8761 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8762 !(Mask[Elt + 1] & 1);
8768 EVT ResultVT =
Op.getValueType();
8771 const int NewSrcNumElts = 2;
8773 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8789 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8811 if (ShouldUseConsecutiveExtract &&
8814 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8815 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8827 if (Idx0 >= SrcNumElts) {
8832 if (Idx1 >= SrcNumElts) {
8837 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
8838 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
8846 int NewMaskIdx0 = Idx0 - AlignedIdx0;
8847 int NewMaskIdx1 = Idx1 - AlignedIdx1;
8852 if (SubVec0 != SubVec1) {
8853 NewMaskIdx1 += NewSrcNumElts;
8860 {NewMaskIdx0, NewMaskIdx1});
8865 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
8866 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
8867 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
8868 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
8887 EVT ResultVT =
Op.getValueType();
8903 EVT VT =
Op.getValueType();
8905 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
8906 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
8940 for (
unsigned P = 0;
P < NumParts; ++
P) {
8942 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
8961 if (!Subtarget->isAmdHsaOS())
9004 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9013 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9021 EVT PtrVT =
Op.getValueType();
9023 const GlobalValue *GV = GSD->
getGlobal();
9037 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9052 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9055 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9056 if (Subtarget->has64BitLiterals()) {
9087 MachinePointerInfo PtrInfo =
9100 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9124 SDValue Param = lowerKernargMemParameter(
9135 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9143 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9151 unsigned NumElts = Elts.
size();
9153 if (NumElts <= 12) {
9162 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9168 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9178 EVT SrcVT = Src.getValueType();
9199 bool Unpacked,
bool IsD16,
int DMaskPop,
9200 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9204 EVT ReqRetVT = ResultTypes[0];
9206 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9207 ? (ReqRetNumElts + 1) / 2
9210 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9221 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9232 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9234 NumDataDwords - MaskPopDwords);
9239 EVT LegalReqRetVT = ReqRetVT;
9241 if (!
Data.getValueType().isInteger())
9243 Data.getValueType().changeTypeToInteger(),
Data);
9264 if (Result->getNumValues() == 1)
9271 SDValue *LWE,
bool &IsTexFail) {
9291 unsigned DimIdx,
unsigned EndIdx,
9292 unsigned NumGradients) {
9294 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9302 if (((
I + 1) >= EndIdx) ||
9303 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9304 I == DimIdx + NumGradients - 1))) {
9326 !
Op.getNode()->hasAnyUseOfValue(0))
9328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9338 ResultTypes.erase(&ResultTypes[0]);
9344 int NumVDataDwords = 0;
9345 bool AdjustRetType =
false;
9346 bool IsAtomicPacked16Bit =
false;
9349 const unsigned ArgOffset = WithChain ? 2 : 1;
9352 unsigned DMaskLanes = 0;
9354 if (BaseOpcode->
Atomic) {
9355 VData =
Op.getOperand(2);
9357 IsAtomicPacked16Bit =
9358 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9359 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9360 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9361 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9372 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9374 DMask = Is64Bit ? 0xf : 0x3;
9375 NumVDataDwords = Is64Bit ? 4 : 2;
9377 DMask = Is64Bit ? 0x3 : 0x1;
9378 NumVDataDwords = Is64Bit ? 2 : 1;
9381 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9384 if (BaseOpcode->
Store) {
9385 VData =
Op.getOperand(2);
9389 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9393 VData = handleD16VData(VData, DAG,
true);
9396 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9397 }
else if (!BaseOpcode->
NoReturn) {
9402 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9410 (!LoadVT.
isVector() && DMaskLanes > 1))
9416 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9417 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9418 NumVDataDwords = (DMaskLanes + 1) / 2;
9420 NumVDataDwords = DMaskLanes;
9422 AdjustRetType =
true;
9426 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9433 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9434 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9436 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9438 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9439 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9443 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9449 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9453 "Bias needs to be converted to 16 bit in A16 mode");
9458 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9462 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9463 "require 16 bit args for both gradients and addresses");
9468 if (!
ST->hasA16()) {
9469 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9470 "support 16 bit addresses\n");
9480 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9482 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9484 IntrOpcode = G16MappingInfo->
G16;
9507 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9525 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9526 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9527 const bool UseNSA =
ST->hasNSAEncoding() &&
9528 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9529 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9530 const bool UsePartialNSA =
9531 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9534 if (UsePartialNSA) {
9536 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9537 }
else if (!UseNSA) {
9547 uint64_t UnormConst =
9548 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9550 Unorm = UnormConst ? True : False;
9556 bool IsTexFail =
false;
9557 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9568 NumVDataDwords += 1;
9569 AdjustRetType =
true;
9574 if (AdjustRetType) {
9577 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9586 MVT::i32, NumVDataDwords)
9589 ResultTypes[0] = NewVT;
9590 if (ResultTypes.size() == 3) {
9594 ResultTypes.erase(&ResultTypes[1]);
9608 Ops.push_back(VData);
9609 if (UsePartialNSA) {
9611 Ops.push_back(VAddr);
9615 Ops.push_back(VAddr);
9618 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9620 Ops.push_back(Rsrc);
9625 Ops.push_back(Samp);
9630 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9631 Ops.push_back(Unorm);
9633 Ops.push_back(IsA16 &&
9634 ST->hasFeature(AMDGPU::FeatureR128A16)
9638 Ops.push_back(IsA16 ? True : False);
9640 if (!Subtarget->hasGFX90AInsts())
9645 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9648 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9651 Ops.push_back(DimInfo->
DA ? True : False);
9653 Ops.push_back(IsD16 ? True : False);
9655 Ops.push_back(
Op.getOperand(0));
9657 int NumVAddrDwords =
9663 NumVDataDwords, NumVAddrDwords);
9664 }
else if (IsGFX11Plus) {
9666 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9667 : AMDGPU::MIMGEncGfx11Default,
9668 NumVDataDwords, NumVAddrDwords);
9669 }
else if (IsGFX10Plus) {
9671 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9672 : AMDGPU::MIMGEncGfx10Default,
9673 NumVDataDwords, NumVAddrDwords);
9675 if (Subtarget->hasGFX90AInsts()) {
9677 NumVDataDwords, NumVAddrDwords);
9681 "requested image instruction is not supported on this GPU",
9686 for (EVT VT : OrigResultTypes) {
9687 if (VT == MVT::Other)
9688 RetValues[Idx++] =
Op.getOperand(0);
9699 NumVDataDwords, NumVAddrDwords);
9702 NumVDataDwords, NumVAddrDwords);
9709 MachineMemOperand *MemRef = MemOp->getMemOperand();
9728 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9729 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9742 MachinePointerInfo(),
9747 if (!
Offset->isDivergent()) {
9754 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9763 !Subtarget->hasScalarDwordx3Loads()) {
9767 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9790 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9792 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9796 unsigned NumLoads = 1;
9802 if (NumElts == 8 || NumElts == 16) {
9803 NumLoads = NumElts / 4;
9807 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9812 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9814 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9815 for (
unsigned i = 0; i < NumLoads; ++i) {
9817 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9821 if (NumElts == 8 || NumElts == 16)
9829 if (!Subtarget->hasArchitectedSGPRs())
9834 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9841 unsigned Width)
const {
9843 using namespace AMDGPU::Hwreg;
9845 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9884 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9886 EVT VT =
Op.getValueType();
9888 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9892 switch (IntrinsicID) {
9893 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9896 return getPreloadedValue(DAG, *MFI, VT,
9899 case Intrinsic::amdgcn_dispatch_ptr:
9900 case Intrinsic::amdgcn_queue_ptr: {
9901 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9903 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9908 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9911 return getPreloadedValue(DAG, *MFI, VT, RegID);
9913 case Intrinsic::amdgcn_implicitarg_ptr: {
9915 return getImplicitArgPtr(DAG,
DL);
9916 return getPreloadedValue(DAG, *MFI, VT,
9919 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9925 return getPreloadedValue(DAG, *MFI, VT,
9928 case Intrinsic::amdgcn_dispatch_id: {
9931 case Intrinsic::amdgcn_rcp:
9932 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
9933 case Intrinsic::amdgcn_rsq:
9934 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
9935 case Intrinsic::amdgcn_rsq_legacy:
9939 case Intrinsic::amdgcn_rcp_legacy:
9942 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
9943 case Intrinsic::amdgcn_rsq_clamp: {
9945 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
9957 case Intrinsic::r600_read_ngroups_x:
9958 if (Subtarget->isAmdHsaOS())
9961 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9964 case Intrinsic::r600_read_ngroups_y:
9965 if (Subtarget->isAmdHsaOS())
9968 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9971 case Intrinsic::r600_read_ngroups_z:
9972 if (Subtarget->isAmdHsaOS())
9975 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9978 case Intrinsic::r600_read_local_size_x:
9979 if (Subtarget->isAmdHsaOS())
9982 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9984 case Intrinsic::r600_read_local_size_y:
9985 if (Subtarget->isAmdHsaOS())
9988 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9990 case Intrinsic::r600_read_local_size_z:
9991 if (Subtarget->isAmdHsaOS())
9994 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9996 case Intrinsic::amdgcn_workgroup_id_x:
9997 return lowerWorkGroupId(DAG, *MFI, VT,
10001 case Intrinsic::amdgcn_workgroup_id_y:
10002 return lowerWorkGroupId(DAG, *MFI, VT,
10006 case Intrinsic::amdgcn_workgroup_id_z:
10007 return lowerWorkGroupId(DAG, *MFI, VT,
10011 case Intrinsic::amdgcn_cluster_id_x:
10012 return Subtarget->hasClusters()
10013 ? getPreloadedValue(DAG, *MFI, VT,
10015 : DAG.getPOISON(VT);
10016 case Intrinsic::amdgcn_cluster_id_y:
10017 return Subtarget->hasClusters()
10018 ? getPreloadedValue(DAG, *MFI, VT,
10021 case Intrinsic::amdgcn_cluster_id_z:
10022 return Subtarget->hasClusters()
10023 ? getPreloadedValue(DAG, *MFI, VT,
10026 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10027 return Subtarget->hasClusters()
10028 ? getPreloadedValue(
10032 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10033 return Subtarget->hasClusters()
10034 ? getPreloadedValue(
10038 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10039 return Subtarget->hasClusters()
10040 ? getPreloadedValue(
10044 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10045 return Subtarget->hasClusters()
10048 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10049 return Subtarget->hasClusters()
10050 ? getPreloadedValue(
10054 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10055 return Subtarget->hasClusters()
10056 ? getPreloadedValue(
10060 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10061 return Subtarget->hasClusters()
10062 ? getPreloadedValue(
10066 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10067 return Subtarget->hasClusters()
10068 ? getPreloadedValue(
10072 case Intrinsic::amdgcn_wave_id:
10073 return lowerWaveID(DAG,
Op);
10074 case Intrinsic::amdgcn_lds_kernel_id: {
10076 return getLDSKernelId(DAG,
DL);
10077 return getPreloadedValue(DAG, *MFI, VT,
10080 case Intrinsic::amdgcn_workitem_id_x:
10081 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10082 case Intrinsic::amdgcn_workitem_id_y:
10083 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10084 case Intrinsic::amdgcn_workitem_id_z:
10085 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10086 case Intrinsic::amdgcn_wavefrontsize:
10088 SDLoc(
Op), MVT::i32);
10089 case Intrinsic::amdgcn_s_buffer_load: {
10090 unsigned CPol =
Op.getConstantOperandVal(3);
10097 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10098 Op.getOperand(3), DAG);
10100 case Intrinsic::amdgcn_fdiv_fast:
10101 return lowerFDIV_FAST(
Op, DAG);
10102 case Intrinsic::amdgcn_sin:
10103 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10105 case Intrinsic::amdgcn_cos:
10106 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10108 case Intrinsic::amdgcn_mul_u24:
10109 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10111 case Intrinsic::amdgcn_mul_i24:
10112 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10115 case Intrinsic::amdgcn_log_clamp: {
10121 case Intrinsic::amdgcn_fract:
10122 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10124 case Intrinsic::amdgcn_class:
10125 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10127 case Intrinsic::amdgcn_div_fmas:
10128 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10129 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10131 case Intrinsic::amdgcn_div_fixup:
10132 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10133 Op.getOperand(2),
Op.getOperand(3));
10135 case Intrinsic::amdgcn_div_scale: {
10141 SDValue Denominator =
Op.getOperand(2);
10148 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10150 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10151 Denominator, Numerator);
10153 case Intrinsic::amdgcn_icmp: {
10155 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10156 Op.getConstantOperandVal(2) == 0 &&
10161 case Intrinsic::amdgcn_fcmp: {
10164 case Intrinsic::amdgcn_ballot:
10166 case Intrinsic::amdgcn_fmed3:
10167 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10168 Op.getOperand(2),
Op.getOperand(3));
10169 case Intrinsic::amdgcn_fdot2:
10170 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10171 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10172 case Intrinsic::amdgcn_fmul_legacy:
10173 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10175 case Intrinsic::amdgcn_sffbh:
10176 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10177 case Intrinsic::amdgcn_sbfe:
10178 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10179 Op.getOperand(2),
Op.getOperand(3));
10180 case Intrinsic::amdgcn_ubfe:
10181 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10182 Op.getOperand(2),
Op.getOperand(3));
10183 case Intrinsic::amdgcn_cvt_pkrtz:
10184 case Intrinsic::amdgcn_cvt_pknorm_i16:
10185 case Intrinsic::amdgcn_cvt_pknorm_u16:
10186 case Intrinsic::amdgcn_cvt_pk_i16:
10187 case Intrinsic::amdgcn_cvt_pk_u16: {
10189 EVT VT =
Op.getValueType();
10192 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10193 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10194 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10195 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10196 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10197 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10198 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10199 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10201 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10204 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10207 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10210 case Intrinsic::amdgcn_fmad_ftz:
10211 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10212 Op.getOperand(2),
Op.getOperand(3));
10214 case Intrinsic::amdgcn_if_break:
10216 Op->getOperand(1),
Op->getOperand(2)),
10219 case Intrinsic::amdgcn_groupstaticsize: {
10225 const GlobalValue *GV =
10231 case Intrinsic::amdgcn_is_shared:
10232 case Intrinsic::amdgcn_is_private: {
10239 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10243 Subtarget->hasGloballyAddressableScratch()) {
10246 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10247 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10256 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10259 case Intrinsic::amdgcn_perm:
10260 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10261 Op.getOperand(2),
Op.getOperand(3));
10262 case Intrinsic::amdgcn_reloc_constant: {
10272 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10273 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10274 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10275 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10276 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10277 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10278 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10279 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10280 if (
Op.getOperand(4).getValueType() == MVT::i32)
10286 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10287 Op.getOperand(3), IndexKeyi32);
10289 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10290 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10291 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10292 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10293 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10294 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10295 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10296 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10297 if (
Op.getOperand(4).getValueType() == MVT::i64)
10303 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10304 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10305 Op.getOperand(6)});
10307 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10308 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10309 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10310 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10311 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10312 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10313 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10316 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10322 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10323 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10324 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10325 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10326 Args.push_back(
Op.getOperand(9));
10329 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10330 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10331 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10332 if (
Op.getOperand(6).getValueType() == MVT::i32)
10338 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10339 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10340 IndexKeyi32, Op.getOperand(7)});
10342 case Intrinsic::amdgcn_addrspacecast_nonnull:
10343 return lowerADDRSPACECAST(
Op, DAG);
10344 case Intrinsic::amdgcn_readlane:
10345 case Intrinsic::amdgcn_readfirstlane:
10346 case Intrinsic::amdgcn_writelane:
10347 case Intrinsic::amdgcn_permlane16:
10348 case Intrinsic::amdgcn_permlanex16:
10349 case Intrinsic::amdgcn_permlane64:
10350 case Intrinsic::amdgcn_set_inactive:
10351 case Intrinsic::amdgcn_set_inactive_chain_arg:
10352 case Intrinsic::amdgcn_mov_dpp8:
10353 case Intrinsic::amdgcn_update_dpp:
10355 case Intrinsic::amdgcn_dead: {
10357 for (
const EVT ValTy :
Op.getNode()->values())
10361 case Intrinsic::amdgcn_wave_shuffle:
10364 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10366 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10377 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10383 unsigned NewOpcode)
const {
10387 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10388 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10406 M->getMemOperand());
10411 unsigned NewOpcode)
const {
10415 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10416 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10434 M->getMemOperand());
10439 unsigned IntrID =
Op.getConstantOperandVal(1);
10443 case Intrinsic::amdgcn_ds_ordered_add:
10444 case Intrinsic::amdgcn_ds_ordered_swap: {
10449 unsigned IndexOperand =
M->getConstantOperandVal(7);
10450 unsigned WaveRelease =
M->getConstantOperandVal(8);
10451 unsigned WaveDone =
M->getConstantOperandVal(9);
10453 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10454 IndexOperand &= ~0x3f;
10455 unsigned CountDw = 0;
10458 CountDw = (IndexOperand >> 24) & 0xf;
10459 IndexOperand &= ~(0xf << 24);
10461 if (CountDw < 1 || CountDw > 4) {
10464 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10465 DL.getDebugLoc()));
10470 if (IndexOperand) {
10473 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10476 if (WaveDone && !WaveRelease) {
10480 Fn,
"ds_ordered_count: wave_done requires wave_release",
10481 DL.getDebugLoc()));
10484 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10485 unsigned ShaderType =
10487 unsigned Offset0 = OrderedCountIndex << 2;
10488 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10491 Offset1 |= (CountDw - 1) << 6;
10494 Offset1 |= ShaderType << 2;
10496 unsigned Offset = Offset0 | (Offset1 << 8);
10503 M->getVTList(),
Ops,
M->getMemoryVT(),
10504 M->getMemOperand());
10506 case Intrinsic::amdgcn_raw_buffer_load:
10507 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10508 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10509 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10510 case Intrinsic::amdgcn_raw_buffer_load_format:
10511 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10512 const bool IsFormat =
10513 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10514 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10516 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10517 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10531 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10533 case Intrinsic::amdgcn_struct_buffer_load:
10534 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10535 case Intrinsic::amdgcn_struct_buffer_load_format:
10536 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10537 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10538 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10539 const bool IsFormat =
10540 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10541 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10543 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10544 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10559 case Intrinsic::amdgcn_raw_tbuffer_load:
10560 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10562 EVT LoadVT =
Op.getValueType();
10563 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10564 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10580 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10582 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10583 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10586 case Intrinsic::amdgcn_struct_tbuffer_load:
10587 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10589 EVT LoadVT =
Op.getValueType();
10590 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10591 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10607 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10609 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10610 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10613 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10614 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10615 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10616 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10617 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10618 return lowerStructBufferAtomicIntrin(
Op, DAG,
10619 AMDGPUISD::BUFFER_ATOMIC_FADD);
10620 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10621 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10622 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10623 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10625 return lowerStructBufferAtomicIntrin(
Op, DAG,
10626 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10627 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10628 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10629 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10630 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
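  // All of the cases above share one shape: each raw_* buffer atomic intrinsic
  // is forwarded to lowerRawBufferAtomicIntrin() and each struct_* variant to
  // lowerStructBufferAtomicIntrin(), paired with the matching
  // AMDGPUISD::BUFFER_ATOMIC_* node. Illustrative IR (not taken from this file;
  // operand order follows the intrinsic definitions):
  //   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(
  //              i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  // is selected through this switch into an AMDGPUISD::BUFFER_ATOMIC_ADD
  // memory node.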
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VT = Op.getValueType();
    // ...
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
    // ...
    EVT VT = Op.getValueType();
    // ...
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
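  // Unlike the single-operand atomics above, the cmpswap cases carry both a
  // source value and a compare value, so the buffer resource sits at operand 4
  // and the offset at operand 5 (raw) or 6 (struct, which has an extra vindex
  // operand). The elided lines assemble the operand list and emit the memory
  // intrinsic node with the original memory operand.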
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue InstanceMask = M->getOperand(4);
    SDValue RayOrigin = M->getOperand(5);
    SDValue RayDir = M->getOperand(6);
    // ...
    SDValue TDescr = M->getOperand(8);
    // ...
    if (!Subtarget->hasBVHDualAndBVH8Insts()) {
      // ...
    }
    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
    // ...
                          IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
                                 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
                          AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
    // ...
    Ops.push_back(NodePtr);
    // ...
                          {DAG.getBitcast(MVT::i32, RayExtent),
                           DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
    Ops.push_back(RayOrigin);
    Ops.push_back(RayDir);
    Ops.push_back(Offsets);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());
    // ...
    MachineMemOperand *MemRef = M->getMemOperand();
    // ...
  }
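  // For the GFX12 BVH8/dual intersect-ray instructions the dword counts are
  // fixed: 10 result (VDATA) dwords, and 11 address (VADDR) dwords for BVH8 vs.
  // 12 for the dual-node form, matching NumVDataDwords/NumVAddrDwords above.
  // The ray extent is bitcast to i32 and the instance mask any-extended to i32
  // so the pair can be packed into a single vector operand.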
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(7);
    // ...
    if (!Subtarget->hasGFX10_AEncoding()) {
      // ...
    }
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
                         /* ... */);
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    // ...
                      IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                      : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                  : AMDGPU::MIMGEncGfx10NSA,
                      NumVDataDwords, NumVAddrDwords);
    // ...
                      IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                              : AMDGPU::MIMGEncGfx10Default,
                      NumVDataDwords, NumVAddrDwords);
    // ...
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      // ...
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          // ...
        Ops.push_back(Lanes[2]);
        // ...
      }
      // ...
    };

    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      // ...
      Ops.push_back(RayOrigin);
      // ...
      for (unsigned I = 0; I < 3; ++I) {
        // ...
                      {DirLanes[I], InvDirLanes[I]})));
      }
      // ...
      Ops.push_back(RayDir);
      Ops.push_back(RayInvDir);
      // ...
      Ops.push_back(NodePtr);
      // ...
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      // ...
      if (NumVAddrDwords > 12) {
        // ...
        Ops.append(16 - Ops.size(), Undef);
        // ...
        Ops.push_back(MergedOps);
      }
    }
    // ...
    Ops.push_back(TDescr);
    // ...
    Ops.push_back(M->getChain());
    // ...
    MachineMemOperand *MemRef = M->getMemOperand();
    // ...
  }
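  // Two address layouts are produced here: with NSA on GFX11+ the ray
  // components are appended as whole vector operands, while the packed
  // (non-NSA) path runs the packLanes helper over origin/dir/inv-dir and, when
  // more than 12 address dwords are required, pads the operand list to 16
  // entries with undef before merging, consistent with the NumVAddrDwords
  // computation above.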
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    // ...
    unsigned Opcode = 0;
    // ...
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      // ...
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      // ...
    }
    // ...
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  }
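  // All four fmin/fmax flavours funnel into a single DAG.getAtomic() call; only
  // the atomic opcode chosen in the (elided) inner switch differs between the
  // min and max cases. Reusing M->getMemoryVT() and M->getMemOperand()
  // preserves the original memory type and alias information.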
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    // ...
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    // ...
    Ops.push_back(Chain);
    // ...
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
      // ...
    }
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    // ...
  }
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    // ...
    EVT VT = Op->getValueType(0);
    // ...
  }
  // ...
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            // ...
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              // ...
  EVT VT = VTList.VTs[0];
  // ...
  bool IsTFE = VTList.NumVTs == 3;
  // ...
    unsigned NumOpDWords = NumValueDWords + 1;
    // ...
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        // ...
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    // ...
           NumValueDWords == 1
               // ...
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    // ...
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    // ...
                                WidenedMemVT, WidenedMMO);
    // ...
  }
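// getMemIntrinsicNode() widens awkward result types before emitting the node:
// with TFE (VTList.NumVTs == 3) an extra status dword is added to the value
// width, and v3i32/v3f32 results are widened when the subtarget lacks dwordx3
// loads/stores, before the requested value is extracted back out.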
                                          bool ImageStore) const {
  // ...
  if (Subtarget->hasUnpackedD16VMem()) {
    // ...
  }
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // ...
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      // ...
    }
    if ((NumElements % 2) == 1) {
      // ...
      unsigned I = Elts.size() / 2;
      // ...
    }
    if (NumElements == 3) {
      // ...
    }
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      // ...
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
      // ...
    }
    // ...
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
    // ...
  }
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    // ...
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    // ...
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    // ...
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    // ...
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
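  // Both tbuffer store paths follow the same recipe: optionally repack the data
  // via handleD16VData() when it is f16, split the pointer-style resource and
  // the combined offset, then emit TBUFFER_STORE_FORMAT(_D16) with the original
  // memory operand. Illustrative IR (not from this file):
  //   call void @llvm.amdgcn.raw.tbuffer.store.v4f16(
  //       <4 x half> %v, <4 x i32> %rsrc, i32 %voffset, i32 %soffset,
  //       i32 %fmt, i32 0)
  // takes the IsD16 branch here.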
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
    // ...
      VData = handleD16VData(VData, DAG);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    // ...
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
    // ...
      VData = handleD16VData(VData, DAG);
    // ...
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    // ...
    EVT VDataType = VData.getValueType().getScalarType();
    // ...
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
    // ...
                                   M->getMemoryVT(), M->getMemOperand());
  }
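  // The plain buffer stores pick between BUFFER_STORE and BUFFER_STORE_FORMAT
  // (with a _D16 variant when the data is f16), and i8/i16 payloads are
  // diverted to handleByteShortBufferStores(), which emits byte/short store
  // nodes instead.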
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      // ...
    // ...
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    // ...
      if (!Subtarget->hasLDSLoadB96_B128())
        // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
    // ...
      if (!Subtarget->hasLDSLoadB96_B128())
        // ...
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    // ...
    if (HasVIndex && HasVOffset)
      // ...
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    // ...
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    // ...
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // ...
    MachinePointerInfo StorePtrI = LoadPtrI;
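    // The LDS-load opcode is chosen along two axes: the transfer size (1, 2, 4
    // and, when hasLDSLoadB96_B128() is set, 12 or 16 bytes) and the addressing
    // mode, where BOTHEN / IDXEN / OFFEN / OFFSET correspond to
    // (vindex,voffset), vindex-only, voffset-only and neither, exactly as the
    // HasVIndex/HasVOffset conditionals above encode.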
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      // ...
    // ...
    unsigned Size = Op->getConstantOperandVal(4);
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    // ...
      if (!Subtarget->hasLDSLoadB96_B128())
        // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    // ...
      if (!Subtarget->hasLDSLoadB96_B128())
        // ...
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    // ...
    if (LHS->isDivergent())
      // ...
          RHS.getOperand(0).getValueType() == MVT::i32) {
        // ...
        VOffset = RHS.getOperand(0);
        // ...
      }
    Ops.push_back(Addr);
    // ...
    Ops.push_back(VOffset);
    // ...
    Ops.push_back(Op.getOperand(5));
    // ...
    unsigned Aux = Op.getConstantOperandVal(6);
    // ...
    MachineMemOperand *LoadMMO = M->getMemOperand();
    // ...
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
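    // The global/flat load-to-LDS path mirrors the buffer variant above: the
    // opcode is picked from the byte count, and when part of the address is
    // divergent the code tries to peel a 32-bit VOffset out of the add (the
    // RHS.getOperand(0) check) before pushing the base address and offset as
    // separate operands.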
  case Intrinsic::amdgcn_end_cf:
    // ...
                               Op->getOperand(2), Chain),
    // ...
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    // ...
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    // ...
    constexpr unsigned ShAmt = 16;
    // ...
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    // ...
  }
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!Subtarget->hasSWakeupBarrier())
      // ...
  }
  case Intrinsic::amdgcn_s_barrier_join: {
    // ...
    switch (IntrinsicID) {
    // ...
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      // ...
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
      // ...
    }
    // ...
    unsigned BarID = (BarVal >> 4) & 0x3F;
    // ...
    Ops.push_back(Chain);
    // ...
    switch (IntrinsicID) {
    // ...
    case Intrinsic::amdgcn_s_barrier_join:
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
      // ...
    case Intrinsic::amdgcn_s_wakeup_barrier:
      Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
      // ...
    }
    // ...
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
    // ...
  }
  case Intrinsic::amdgcn_s_prefetch_data: {
    // ...
      return Op.getOperand(0);
    // ...
  }
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
    // ...
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
    // ...
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    // ...
  }
  // ...
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            // ...
      return lowerImage(Op, ImageDimIntr, DAG, true);
  return PtrVT == MVT::i64;
// ...

std::pair<SDValue, SDValue>
// ...
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    // ...
  }
  // ...
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
  // ...

void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        // ...
                                        Align Alignment) const {
  // ...
  SDLoc DL(CombinedOffset);
  // ...
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      // ...
    }
    // ...
    uint32_t SOffset, ImmOffset;
    // ...
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      // ...
    }
  // ...
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
      // ...
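// splitBufferOffsets() keeps as much of the constant offset as the MUBUF
// immediate field allows and moves the rest into a register operand. A worked
// example, assuming MaxImm = 4095 (the usual 12-bit immediate): for a combined
// constant offset of 5000, Overflow = 5000 & ~4095 = 4096, ImmOffset becomes
// 904, and the 4096 is materialized via DAG.getConstant() as the overflow
// value.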
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
                                                // ...
    return MaybePointer;
  // ...
  SDValue NumRecords = Op->getOperand(3);
  // ...
  if (Subtarget->has45BitNumRecordsBufferResource()) {
    // ...
    SDValue ExtShiftedStrideVec =
        // ...
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
    // ...
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
    // ...
  }
  auto [LowHalf, HighHalf] =
      DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
  // ...
                           NumRecords, Flags);
  // ...

                                                  bool IsTFE) const {
  // ...
        ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
        : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    // ...
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    // ...
        ? AMDGPUISD::BUFFER_LOAD_UBYTE
        : AMDGPUISD::BUFFER_LOAD_USHORT;
  // ...
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  // ...
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
    // ...
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
                                        : AMDGPUISD::BUFFER_STORE_SHORT;
  // ...
                                 M->getMemOperand());
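// handleByteShortBufferLoads/Stores lower sub-dword buffer accesses directly to
// the BUFFER_LOAD_UBYTE/USHORT and BUFFER_STORE_BYTE/SHORT nodes (with _TFE
// load variants carrying an extra status dword), extending f16/bf16 payloads
// to i32 first, as the Ops[1] = BufferStoreExt rewrite above shows.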
11968 DAGCombinerInfo &DCI)
const {
11969 SelectionDAG &DAG = DCI.DAG;
11984 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11991 "unexpected vector extload");
12004 "unexpected fp extload");
12022 DCI.AddToWorklist(Cvt.
getNode());
12027 DCI.AddToWorklist(Cvt.
getNode());
12038 if (
Info.isEntryFunction())
12039 return Info.getUserSGPRInfo().hasFlatScratchInit();
12047 EVT MemVT =
Load->getMemoryVT();
12048 MachineMemOperand *MMO =
Load->getMemOperand();
12060 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12088 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12089 "Custom lowering for non-i32 vectors hasn't been implemented.");
12092 unsigned AS =
Load->getAddressSpace();
12099 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12103 !Subtarget->hasMultiDwordFlatScratchAddressing())
12113 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12116 Alignment >=
Align(4) && NumElements < 32) {
12118 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12130 if (NumElements > 4)
12133 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12143 switch (Subtarget->getMaxPrivateElementSize()) {
12149 if (NumElements > 2)
12154 if (NumElements > 4)
12157 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12166 auto Flags =
Load->getMemOperand()->getFlags();
12168 Load->getAlign(), Flags, &
Fast) &&
12177 MemVT, *
Load->getMemOperand())) {
12186 EVT VT =
Op.getValueType();
12223 EVT VT =
Op.getValueType();
12224 const SDNodeFlags
Flags =
Op->getFlags();
12226 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12232 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12235 if (CLHS->isExactlyValue(1.0)) {
12248 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12252 if (CLHS->isExactlyValue(-1.0)) {
12255 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12261 if (!AllowInaccurateRcp &&
12262 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12276 EVT VT =
Op.getValueType();
12277 const SDNodeFlags
Flags =
Op->getFlags();
12279 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12280 if (!AllowInaccurateDiv)
12301 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12311 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12315 return DAG.
getNode(Opcode, SL, VTList,
12324 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12334 Opcode = AMDGPUISD::FMA_W_CHAIN;
12338 return DAG.
getNode(Opcode, SL, VTList,
12344 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12345 return FastLowered;
12348 EVT VT =
Op.getValueType();
12355 if (VT == MVT::bf16) {
12378 unsigned FMADOpCode =
12382 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12385 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12387 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12388 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12398 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12404 SDNodeFlags
Flags =
Op->getFlags();
12414 const APFloat K0Val(0x1p+96f);
12417 const APFloat K1Val(0x1p-32f);
12444 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12445 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12446 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12451 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12452 return FastLowered;
12458 SDNodeFlags
Flags =
Op->getFlags();
12459 Flags.setNoFPExcept(
true);
12467 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12476 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12480 using namespace AMDGPU::Hwreg;
12481 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12485 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12486 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12489 const bool HasDynamicDenormals =
12495 if (!PreservesDenormals) {
12500 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12503 if (HasDynamicDenormals) {
12507 SavedDenormMode =
SDValue(GetReg, 0);
12513 SDNode *EnableDenorm;
12514 if (Subtarget->hasDenormModeInst()) {
12515 const SDValue EnableDenormValue =
12518 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12522 const SDValue EnableDenormValue =
12524 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12525 {EnableDenormValue,
BitField, Glue});
12535 ApproxRcp, One, NegDivScale0, Flags);
12538 ApproxRcp, Fma0, Flags);
12544 NumeratorScaled,
Mul, Flags);
12550 NumeratorScaled, Fma3, Flags);
12552 if (!PreservesDenormals) {
12553 SDNode *DisableDenorm;
12554 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12558 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12560 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12564 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12565 const SDValue DisableDenormValue =
12566 HasDynamicDenormals
12571 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12582 {Fma4, Fma1, Fma3, Scale},
Flags);
12584 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
12588 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
12589 return FastLowered;
12597 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
12603 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12621 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12651 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
12653 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
  // ...
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
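  // FDIV is dispatched purely on type: f32 and f64 go through the full
  // DIV_SCALE / DIV_FMAS / DIV_FIXUP expansions above, while f16 and bf16 take
  // the smaller LowerFDIV16 path that refines an f32 reciprocal with FMAD
  // correction steps.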
12675 EVT ResultExpVT =
Op->getValueType(1);
12676 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12686 if (Subtarget->hasFractBug()) {
12704 EVT VT =
Store->getMemoryVT();
12706 if (VT == MVT::i1) {
12710 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12714 Store->getValue().getValueType().getScalarType() == MVT::i32);
12716 unsigned AS =
Store->getAddressSpace();
12724 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12728 !Subtarget->hasMultiDwordFlatScratchAddressing())
12735 if (NumElements > 4)
12738 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12742 VT, *
Store->getMemOperand()))
12748 switch (Subtarget->getMaxPrivateElementSize()) {
12752 if (NumElements > 2)
12756 if (NumElements > 4 ||
12757 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12765 auto Flags =
Store->getMemOperand()->getFlags();
12784 assert(!Subtarget->has16BitInsts());
12785 SDNodeFlags
Flags =
Op->getFlags();
12799 SDNodeFlags
Flags =
Op->getFlags();
12800 MVT VT =
Op.getValueType().getSimpleVT();
12908 SDNodeFlags
Flags =
Op->getFlags();
  EVT VT = Op.getValueType();
  // ...
  if (Subtarget->hasTrigReducedRange()) {
    // ...
    TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
    // ...
  }
  // ...
  switch (Op.getOpcode()) {
  // ...
    return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
  // ...
    return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
  // ...
  }
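  // When the subtarget has reduced trig range, the input is pre-scaled and
  // wrapped with AMDGPUISD::FRACT before being handed to the hardware ops,
  // i.e. roughly sin(x) ~= SIN_HW(fract(x * 1/(2*pi))); COS_HW is selected for
  // ISD::FCOS and SIN_HW for ISD::FSIN (the scaling constant lives in the
  // elided lines).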
13015 EVT VT =
Op.getValueType();
13023 Op->getVTList(),
Ops, VT,
13032SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
13033 DAGCombinerInfo &DCI)
const {
13034 EVT VT =
N->getValueType(0);
13036 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13039 SelectionDAG &DAG = DCI.DAG;
13043 EVT SrcVT = Src.getValueType();
13049 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13052 DCI.AddToWorklist(Cvt.
getNode());
13055 if (ScalarVT != MVT::f32) {
13067 DAGCombinerInfo &DCI)
const {
13078 SelectionDAG &DAG = DCI.DAG;
13097 for (
unsigned I = 0;
I != NumElts; ++
I) {
13121 if (NewElts.
size() == 1)
13143 for (
unsigned I = 0;
I != NumElts; ++
I) {
13178SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13180 DAGCombinerInfo &DCI)
const {
13197 SelectionDAG &DAG = DCI.DAG;
13210 AM.BaseOffs =
Offset.getSExtValue();
13215 EVT VT =
N->getValueType(0);
13221 Flags.setNoUnsignedWrap(
13222 N->getFlags().hasNoUnsignedWrap() &&
13234 switch (
N->getOpcode()) {
13245 DAGCombinerInfo &DCI)
const {
13246 SelectionDAG &DAG = DCI.DAG;
13253 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13254 N->getMemoryVT(), DCI);
13258 NewOps[PtrIdx] = NewPtr;
13267 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13268 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13277SDValue SITargetLowering::splitBinaryBitConstantOp(
13281 uint32_t ValLo =
Lo_32(Val);
13282 uint32_t ValHi =
Hi_32(Val);
13289 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13303 if (V.getValueType() != MVT::i1)
13305 switch (V.getOpcode()) {
13310 case AMDGPUISD::FP_CLASS:
13322 return V.getResNo() == 1;
13324 unsigned IntrinsicID = V.getConstantOperandVal(0);
13325 switch (IntrinsicID) {
13326 case Intrinsic::amdgcn_is_shared:
13327 case Intrinsic::amdgcn_is_private:
13344 if (!(
C & 0x000000ff))
13345 ZeroByteMask |= 0x000000ff;
13346 if (!(
C & 0x0000ff00))
13347 ZeroByteMask |= 0x0000ff00;
13348 if (!(
C & 0x00ff0000))
13349 ZeroByteMask |= 0x00ff0000;
13350 if (!(
C & 0xff000000))
13351 ZeroByteMask |= 0xff000000;
13352 uint32_t NonZeroByteMask = ~ZeroByteMask;
13353 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13366 assert(V.getValueSizeInBits() == 32);
13368 if (V.getNumOperands() != 2)
13377 switch (V.getOpcode()) {
13382 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13387 return (0x03020100 & ~ConstMask) | ConstMask;
13394 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13400 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13407 DAGCombinerInfo &DCI)
const {
13408 if (DCI.isBeforeLegalize())
13411 SelectionDAG &DAG = DCI.DAG;
13412 EVT VT =
N->getValueType(0);
13417 if (VT == MVT::i64 && CRHS) {
13419 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13423 if (CRHS && VT == MVT::i32) {
13433 unsigned Shift = CShift->getZExtValue();
13435 unsigned Offset = NB + Shift;
13436 if ((
Offset & (Bits - 1)) == 0) {
13439 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13460 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13462 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13475 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13480 if (
X !=
LHS.getOperand(1))
13484 const ConstantFPSDNode *C1 =
13501 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13507 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13510 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13518 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13519 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13521 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13522 :
Mask->getZExtValue() & OrdMask;
13525 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13543 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13546 if (LHSMask != ~0u && RHSMask != ~0u) {
13549 if (LHSMask > RHSMask) {
13556 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13557 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13560 if (!(LHSUsedLanes & RHSUsedLanes) &&
13563 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13569 uint32_t
Mask = LHSMask & RHSMask;
13570 for (
unsigned I = 0;
I < 32;
I += 8) {
13571 uint32_t ByteSel = 0xff <<
I;
13572 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13573 Mask &= (0x0c <<
I) & 0xffffffff;
13578 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
13581 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13631static const std::optional<ByteProvider<SDValue>>
13633 unsigned Depth = 0) {
13636 return std::nullopt;
13638 if (
Op.getValueSizeInBits() < 8)
13639 return std::nullopt;
13641 if (
Op.getValueType().isVector())
13644 switch (
Op->getOpcode()) {
13656 NarrowVT = VTSign->getVT();
13659 return std::nullopt;
13662 if (SrcIndex >= NarrowByteWidth)
13663 return std::nullopt;
13671 return std::nullopt;
13673 uint64_t BitShift = ShiftOp->getZExtValue();
13675 if (BitShift % 8 != 0)
13676 return std::nullopt;
13678 SrcIndex += BitShift / 8;
13696static const std::optional<ByteProvider<SDValue>>
13698 unsigned StartingIndex = 0) {
13702 return std::nullopt;
13704 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13706 return std::nullopt;
13708 return std::nullopt;
13710 bool IsVec =
Op.getValueType().isVector();
13711 switch (
Op.getOpcode()) {
13714 return std::nullopt;
13719 return std::nullopt;
13723 return std::nullopt;
13726 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13727 return std::nullopt;
13728 if (!
LHS ||
LHS->isConstantZero())
13730 if (!
RHS ||
RHS->isConstantZero())
13732 return std::nullopt;
13737 return std::nullopt;
13741 return std::nullopt;
13743 uint32_t BitMask = BitMaskOp->getZExtValue();
13745 uint32_t IndexMask = 0xFF << (Index * 8);
13747 if ((IndexMask & BitMask) != IndexMask) {
13750 if (IndexMask & BitMask)
13751 return std::nullopt;
13760 return std::nullopt;
13764 if (!ShiftOp ||
Op.getValueType().isVector())
13765 return std::nullopt;
13767 uint64_t BitsProvided =
Op.getValueSizeInBits();
13768 if (BitsProvided % 8 != 0)
13769 return std::nullopt;
13771 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13773 return std::nullopt;
13775 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13776 uint64_t ByteShift = BitShift / 8;
13778 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13779 uint64_t BytesProvided = BitsProvided / 8;
13780 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13781 NewIndex %= BytesProvided;
13788 return std::nullopt;
13792 return std::nullopt;
13794 uint64_t BitShift = ShiftOp->getZExtValue();
13796 return std::nullopt;
13798 auto BitsProvided =
Op.getScalarValueSizeInBits();
13799 if (BitsProvided % 8 != 0)
13800 return std::nullopt;
13802 uint64_t BytesProvided = BitsProvided / 8;
13803 uint64_t ByteShift = BitShift / 8;
13808 return BytesProvided - ByteShift > Index
13816 return std::nullopt;
13820 return std::nullopt;
13822 uint64_t BitShift = ShiftOp->getZExtValue();
13823 if (BitShift % 8 != 0)
13824 return std::nullopt;
13825 uint64_t ByteShift = BitShift / 8;
13831 return Index < ByteShift
13834 Depth + 1, StartingIndex);
13843 return std::nullopt;
13851 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13853 if (NarrowBitWidth % 8 != 0)
13854 return std::nullopt;
13855 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13857 if (Index >= NarrowByteWidth)
13859 ? std::optional<ByteProvider<SDValue>>(
13867 return std::nullopt;
13871 if (NarrowByteWidth >= Index) {
13876 return std::nullopt;
13883 return std::nullopt;
13889 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13890 if (NarrowBitWidth % 8 != 0)
13891 return std::nullopt;
13892 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13897 if (Index >= NarrowByteWidth) {
13899 ? std::optional<ByteProvider<SDValue>>(
13904 if (NarrowByteWidth > Index) {
13908 return std::nullopt;
13913 return std::nullopt;
13916 Depth + 1, StartingIndex);
13922 return std::nullopt;
13923 auto VecIdx = IdxOp->getZExtValue();
13924 auto ScalarSize =
Op.getScalarValueSizeInBits();
13925 if (ScalarSize < 32)
13926 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13928 StartingIndex, Index);
13931 case AMDGPUISD::PERM: {
13933 return std::nullopt;
13937 return std::nullopt;
13940 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13941 if (IdxMask > 0x07 && IdxMask != 0x0c)
13942 return std::nullopt;
13944 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13945 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13947 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13953 return std::nullopt;
13968 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13975 auto MemVT = L->getMemoryVT();
13978 return L->getMemoryVT().getSizeInBits() == 16;
13988 int Low8 = Mask & 0xff;
13989 int Hi8 = (Mask & 0xff00) >> 8;
13991 assert(Low8 < 8 && Hi8 < 8);
13993 bool IsConsecutive = (Hi8 - Low8 == 1);
13998 bool Is16Aligned = !(Low8 % 2);
14000 return IsConsecutive && Is16Aligned;
14008 int Low16 = PermMask & 0xffff;
14009 int Hi16 = (PermMask & 0xffff0000) >> 16;
14019 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14021 if (!OtherOpIs16Bit)
14029 unsigned DWordOffset) {
14034 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14039 if (Src.getValueType().isVector()) {
14040 auto ScalarTySize = Src.getScalarValueSizeInBits();
14041 auto ScalarTy = Src.getValueType().getScalarType();
14042 if (ScalarTySize == 32) {
14046 if (ScalarTySize > 32) {
14049 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14050 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14057 assert(ScalarTySize < 32);
14058 auto NumElements =
TypeSize / ScalarTySize;
14059 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14060 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14061 auto NumElementsIn32 = 32 / ScalarTySize;
14062 auto NumAvailElements = DWordOffset < Trunc32Elements
14064 : NumElements - NormalizedTrunc;
14077 auto ShiftVal = 32 * DWordOffset;
14085 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14090 for (
int i = 0; i < 4; i++) {
14092 std::optional<ByteProvider<SDValue>>
P =
14095 if (!
P ||
P->isConstantZero())
14100 if (PermNodes.
size() != 4)
14103 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14104 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14106 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14107 auto PermOp = PermNodes[i];
14110 int SrcByteAdjust = 4;
14114 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14115 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14117 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14118 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14122 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14123 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14126 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14128 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14131 SDValue Op = *PermNodes[FirstSrc.first].Src;
14133 assert(
Op.getValueSizeInBits() == 32);
14137 int Low16 = PermMask & 0xffff;
14138 int Hi16 = (PermMask & 0xffff0000) >> 16;
14140 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14141 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14144 if (WellFormedLow && WellFormedHi)
14148 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14157 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14158 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14163 assert(
Op.getValueType().isByteSized() &&
14174 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14181 DAGCombinerInfo &DCI)
const {
14182 SelectionDAG &DAG = DCI.DAG;
14186 EVT VT =
N->getValueType(0);
14187 if (VT == MVT::i1) {
14189 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14190 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14192 if (Src !=
RHS.getOperand(0))
14197 if (!CLHS || !CRHS)
14201 static const uint32_t MaxMask = 0x3ff;
14206 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14215 LHS.getOpcode() == AMDGPUISD::PERM &&
14221 Sel |=
LHS.getConstantOperandVal(2);
14223 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14230 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14234 auto usesCombinedOperand = [](SDNode *OrUse) {
14237 !OrUse->getValueType(0).isVector())
14241 for (
auto *VUser : OrUse->users()) {
14242 if (!VUser->getValueType(0).isVector())
14249 if (VUser->getOpcode() == VectorwiseOp)
14255 if (!
any_of(
N->users(), usesCombinedOperand))
14261 if (LHSMask != ~0u && RHSMask != ~0u) {
14264 if (LHSMask > RHSMask) {
14271 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14272 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14275 if (!(LHSUsedLanes & RHSUsedLanes) &&
14278 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14280 LHSMask &= ~RHSUsedLanes;
14281 RHSMask &= ~LHSUsedLanes;
14283 LHSMask |= LHSUsedLanes & 0x04040404;
14285 uint32_t Sel = LHSMask | RHSMask;
14288 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14293 if (LHSMask == ~0u || RHSMask == ~0u) {
14334 return IdentitySrc;
14340 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14355 if (SrcVT == MVT::i32) {
14360 DCI.AddToWorklist(LowOr.
getNode());
14361 DCI.AddToWorklist(HiBits.getNode());
14372 N->getOperand(0), CRHS))
14380 DAGCombinerInfo &DCI)
const {
14381 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14388 SelectionDAG &DAG = DCI.DAG;
14390 EVT VT =
N->getValueType(0);
14391 if (CRHS && VT == MVT::i64) {
14393 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14400 unsigned Opc =
LHS.getOpcode();
14430 LHS->getOperand(0), FNegLHS, FNegRHS);
14439 DAGCombinerInfo &DCI)
const {
14440 if (!Subtarget->has16BitInsts() ||
14444 EVT VT =
N->getValueType(0);
14445 if (VT != MVT::i32)
14449 if (Src.getValueType() != MVT::i16)
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  // ...
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    // ...
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    // ...
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    // ...
  }
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
      // ...
                     Src.getOperand(6), Src.getOperand(7)};
    // ...
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
  }
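// This combine folds a sign_extend_inreg of an unsigned sub-dword buffer load
// into the corresponding signed load node (SBUFFER_LOAD_BYTE/SHORT for the
// scalar case, BUFFER_LOAD_BYTE/SHORT for the VMEM case), re-emitting the
// memory intrinsic with the original memory operand so the chain and alias
// information are preserved.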
14515 DAGCombinerInfo &DCI)
const {
14516 SelectionDAG &DAG = DCI.DAG;
14523 if (
N->getOperand(0).isUndef())
14530 DAGCombinerInfo &DCI)
const {
14531 EVT VT =
N->getValueType(0);
14541 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14548 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14556 unsigned MaxDepth)
const {
14557 unsigned Opcode =
Op.getOpcode();
14562 const auto &
F = CFP->getValueAPF();
14563 if (
F.isNaN() &&
F.isSignaling())
14565 if (!
F.isDenormal())
14597 case AMDGPUISD::FMUL_LEGACY:
14598 case AMDGPUISD::FMAD_FTZ:
14599 case AMDGPUISD::RCP:
14600 case AMDGPUISD::RSQ:
14601 case AMDGPUISD::RSQ_CLAMP:
14602 case AMDGPUISD::RCP_LEGACY:
14603 case AMDGPUISD::RCP_IFLAG:
14604 case AMDGPUISD::LOG:
14605 case AMDGPUISD::EXP:
14606 case AMDGPUISD::DIV_SCALE:
14607 case AMDGPUISD::DIV_FMAS:
14608 case AMDGPUISD::DIV_FIXUP:
14609 case AMDGPUISD::FRACT:
14610 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14611 case AMDGPUISD::CVT_F32_UBYTE0:
14612 case AMDGPUISD::CVT_F32_UBYTE1:
14613 case AMDGPUISD::CVT_F32_UBYTE2:
14614 case AMDGPUISD::CVT_F32_UBYTE3:
14615 case AMDGPUISD::FP_TO_FP16:
14616 case AMDGPUISD::SIN_HW:
14617 case AMDGPUISD::COS_HW:
14628 if (
Op.getValueType() == MVT::i32) {
14634 if (RHS->getZExtValue() == 0xffff0000) {
14644 return Op.getValueType().getScalarType() != MVT::f16;
14654 case AMDGPUISD::CLAMP:
14655 case AMDGPUISD::FMED3:
14656 case AMDGPUISD::FMAX3:
14657 case AMDGPUISD::FMIN3:
14658 case AMDGPUISD::FMAXIMUM3:
14659 case AMDGPUISD::FMINIMUM3: {
14665 if (Subtarget->supportsMinMaxDenormModes() ||
14675 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14687 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14714 if (
Op.getValueType() == MVT::i16) {
14725 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14727 switch (IntrinsicID) {
14728 case Intrinsic::amdgcn_cvt_pkrtz:
14729 case Intrinsic::amdgcn_cubeid:
14730 case Intrinsic::amdgcn_frexp_mant:
14731 case Intrinsic::amdgcn_fdot2:
14732 case Intrinsic::amdgcn_rcp:
14733 case Intrinsic::amdgcn_rsq:
14734 case Intrinsic::amdgcn_rsq_clamp:
14735 case Intrinsic::amdgcn_rcp_legacy:
14736 case Intrinsic::amdgcn_rsq_legacy:
14737 case Intrinsic::amdgcn_trig_preop:
14738 case Intrinsic::amdgcn_tanh:
14739 case Intrinsic::amdgcn_log:
14740 case Intrinsic::amdgcn_exp2:
14741 case Intrinsic::amdgcn_sqrt:
14759 unsigned MaxDepth)
const {
14762 unsigned Opcode =
MI->getOpcode();
14764 if (Opcode == AMDGPU::G_FCANONICALIZE)
14767 std::optional<FPValueAndVReg> FCR;
14770 if (FCR->Value.isSignaling())
14772 if (!FCR->Value.isDenormal())
14783 case AMDGPU::G_FADD:
14784 case AMDGPU::G_FSUB:
14785 case AMDGPU::G_FMUL:
14786 case AMDGPU::G_FCEIL:
14787 case AMDGPU::G_FFLOOR:
14788 case AMDGPU::G_FRINT:
14789 case AMDGPU::G_FNEARBYINT:
14790 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14791 case AMDGPU::G_INTRINSIC_TRUNC:
14792 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14793 case AMDGPU::G_FMA:
14794 case AMDGPU::G_FMAD:
14795 case AMDGPU::G_FSQRT:
14796 case AMDGPU::G_FDIV:
14797 case AMDGPU::G_FREM:
14798 case AMDGPU::G_FPOW:
14799 case AMDGPU::G_FPEXT:
14800 case AMDGPU::G_FLOG:
14801 case AMDGPU::G_FLOG2:
14802 case AMDGPU::G_FLOG10:
14803 case AMDGPU::G_FPTRUNC:
14804 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14805 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14806 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14807 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14808 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14810 case AMDGPU::G_FNEG:
14811 case AMDGPU::G_FABS:
14812 case AMDGPU::G_FCOPYSIGN:
14814 case AMDGPU::G_FMINNUM:
14815 case AMDGPU::G_FMAXNUM:
14816 case AMDGPU::G_FMINNUM_IEEE:
14817 case AMDGPU::G_FMAXNUM_IEEE:
14818 case AMDGPU::G_FMINIMUM:
14819 case AMDGPU::G_FMAXIMUM:
14820 case AMDGPU::G_FMINIMUMNUM:
14821 case AMDGPU::G_FMAXIMUMNUM: {
14822 if (Subtarget->supportsMinMaxDenormModes() ||
14829 case AMDGPU::G_BUILD_VECTOR:
14834 case AMDGPU::G_INTRINSIC:
14835 case AMDGPU::G_INTRINSIC_CONVERGENT:
14837 case Intrinsic::amdgcn_fmul_legacy:
14838 case Intrinsic::amdgcn_fmad_ftz:
14839 case Intrinsic::amdgcn_sqrt:
14840 case Intrinsic::amdgcn_fmed3:
14841 case Intrinsic::amdgcn_sin:
14842 case Intrinsic::amdgcn_cos:
14843 case Intrinsic::amdgcn_log:
14844 case Intrinsic::amdgcn_exp2:
14845 case Intrinsic::amdgcn_log_clamp:
14846 case Intrinsic::amdgcn_rcp:
14847 case Intrinsic::amdgcn_rcp_legacy:
14848 case Intrinsic::amdgcn_rsq:
14849 case Intrinsic::amdgcn_rsq_clamp:
14850 case Intrinsic::amdgcn_rsq_legacy:
14851 case Intrinsic::amdgcn_div_scale:
14852 case Intrinsic::amdgcn_div_fmas:
14853 case Intrinsic::amdgcn_div_fixup:
14854 case Intrinsic::amdgcn_fract:
14855 case Intrinsic::amdgcn_cvt_pkrtz:
14856 case Intrinsic::amdgcn_cubeid:
14857 case Intrinsic::amdgcn_cubema:
14858 case Intrinsic::amdgcn_cubesc:
14859 case Intrinsic::amdgcn_cubetc:
14860 case Intrinsic::amdgcn_frexp_mant:
14861 case Intrinsic::amdgcn_fdot2:
14862 case Intrinsic::amdgcn_trig_preop:
14863 case Intrinsic::amdgcn_tanh:
14882 if (
C.isDenormal()) {
14896 if (
C.isSignaling()) {
14919SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14920 DAGCombinerInfo &DCI)
const {
14921 SelectionDAG &DAG = DCI.DAG;
14923 EVT VT =
N->getValueType(0);
14932 EVT VT =
N->getValueType(0);
14933 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14949 EVT EltVT =
Lo.getValueType();
14952 for (
unsigned I = 0;
I != 2; ++
I) {
14956 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14957 }
else if (
Op.isUndef()) {
14993 return AMDGPUISD::FMAX3;
14995 return AMDGPUISD::FMAXIMUM3;
14997 return AMDGPUISD::SMAX3;
14999 return AMDGPUISD::UMAX3;
15003 return AMDGPUISD::FMIN3;
15005 return AMDGPUISD::FMINIMUM3;
15007 return AMDGPUISD::SMIN3;
15009 return AMDGPUISD::UMIN3;
15030 if (!MinK || !MaxK)
15042 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15043 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15044 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15103 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15109 if (
Info->getMode().DX10Clamp) {
15118 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15150 case AMDGPUISD::FMIN_LEGACY:
15151 case AMDGPUISD::FMAX_LEGACY:
15152 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15163 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15172 DAGCombinerInfo &DCI)
const {
15173 SelectionDAG &DAG = DCI.DAG;
15205 if (
SDValue Med3 = performIntMed3ImmCombine(
15210 if (
SDValue Med3 = performIntMed3ImmCombine(
15216 if (
SDValue Med3 = performIntMed3ImmCombine(
15221 if (
SDValue Med3 = performIntMed3ImmCombine(
15234 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15235 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15236 (VT == MVT::f32 || VT == MVT::f64 ||
15237 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15238 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15239 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15240 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15242 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15249 const SDNodeFlags
Flags =
N->getFlags();
15251 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
15254 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15264 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15265 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15274 DAGCombinerInfo &DCI)
const {
15275 EVT VT =
N->getValueType(0);
15279 SelectionDAG &DAG = DCI.DAG;
15290 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15294 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15298 if (
Info->getMode().DX10Clamp) {
15311 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15318 DAGCombinerInfo &DCI)
const {
15322 return DCI.DAG.getUNDEF(
N->getValueType(0));
                                                  bool IsDivergentIdx,
                                                  // ...
  unsigned VecSize = EltSize * NumElem;
  // ...
  if (VecSize <= 64 && EltSize < 32)
    // ...
  if (IsDivergentIdx)
    // ...
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  // ...
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  // ...
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
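  // The dynamic vector-indexing heuristic above is a simple instruction-count
  // model: the cost is estimated as one instruction per element plus one
  // 32-bit move per element dword, and that estimate is compared against small
  // fixed budgets, 16 when VGPR index mode is available and 15 with movrel.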
15368 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15383SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15384 DAGCombinerInfo &DCI)
const {
15390 EVT ResVT =
N->getValueType(0);
15414 if (!
C ||
C->getZExtValue() != 0x1f)
15430 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15458 DCI.AddToWorklist(Elt0.
getNode());
15459 DCI.AddToWorklist(Elt1.
getNode());
15481 if (!DCI.isBeforeLegalize())
15489 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15492 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15493 unsigned EltIdx = BitIndex / 32;
15494 unsigned LeftoverBitIdx = BitIndex % 32;
15498 DCI.AddToWorklist(Cast.
getNode());
15502 DCI.AddToWorklist(Elt.
getNode());
15505 DCI.AddToWorklist(Srl.
getNode());
15509 DCI.AddToWorklist(Trunc.
getNode());
15511 if (VecEltVT == ResVT) {
15523SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15524 DAGCombinerInfo &DCI)
const {
15535 SelectionDAG &DAG = DCI.DAG;
15555 Src.getOperand(0).getValueType() == MVT::f16) {
15556 return Src.getOperand(0);
15560 APFloat Val = CFP->getValueAPF();
15561 bool LosesInfo =
true;
15571 DAGCombinerInfo &DCI)
const {
15572 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15573 "combine only useful on gfx8");
15575 SDValue TruncSrc =
N->getOperand(0);
15576 EVT VT =
N->getValueType(0);
15577 if (VT != MVT::f16)
15580 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15584 SelectionDAG &DAG = DCI.DAG;
15615unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15617 const SDNode *N1)
const {
15622 if (((VT == MVT::f32 &&
15624 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15644 EVT VT =
N->getValueType(0);
15645 if (VT != MVT::i32 && VT != MVT::i64)
15651 unsigned Opc =
N->getOpcode();
15706 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15725 DAGCombinerInfo &DCI)
const {
15728 SelectionDAG &DAG = DCI.DAG;
15729 EVT VT =
N->getValueType(0);
15739 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15743 if (NumBits <= 32 || NumBits > 64)
15754 if (!Subtarget->hasFullRate64Ops()) {
15755 unsigned NumUsers = 0;
15756 for (SDNode *User :
LHS->
users()) {
15759 if (!
User->isAnyAdd())
15783 bool MulSignedLo =
false;
15784 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15793 if (VT != MVT::i64) {
15816 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15818 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15819 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15821 if (!MulLHSUnsigned32) {
15828 if (!MulRHSUnsigned32) {
15839 if (VT != MVT::i64)
15845SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15846 DAGCombinerInfo &DCI)
const {
15856 SelectionDAG &DAG = DCI.DAG;
15871 unsigned Opcode =
N->getOpcode();
15875 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
static std::optional<ByteProvider<SDValue>>
// ...
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  }
  // ...
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  }
  // ...

  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;

  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
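// The perm-mask merge above (used later as addPermMasks) combines two V_PERM
// byte-select masks that cover disjoint bytes: a 0x0c selector means "this
// byte is zero", so the merged mask keeps the real selector from whichever
// side provides one and stays 0x0c only where both sides are zero.
// Illustrative values (not from the source): combining 0x0c0c0100 with
// 0x07060c0c under the formula above yields 0x07060100.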
15934 for (
int BPI = 0; BPI < 2; BPI++) {
15937 BPP = {Src1, Src0};
15939 unsigned ZeroMask = 0x0c0c0c0c;
15940 unsigned FMask = 0xFF << (8 * (3 - Step));
15942 unsigned FirstMask =
15943 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15944 unsigned SecondMask =
15945 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15949 int FirstGroup = -1;
15950 for (
int I = 0;
I < 2;
I++) {
15952 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15953 return IterElt.SrcOp == *BPP.first.Src &&
15954 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15958 if (Match != Srcs.
end()) {
15959 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15964 if (FirstGroup != -1) {
15966 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15967 return IterElt.SrcOp == *BPP.second.Src &&
15968 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15971 if (Match != Srcs.
end()) {
15972 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15974 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15982 unsigned ZeroMask = 0x0c0c0c0c;
15983 unsigned FMask = 0xFF << (8 * (3 - Step));
15987 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15991 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16000 if (Srcs.
size() == 1) {
16001 auto *Elt = Srcs.
begin();
16005 if (Elt->PermMask == 0x3020100)
16008 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16012 auto *FirstElt = Srcs.
begin();
16013 auto *SecondElt = std::next(FirstElt);
16020 auto FirstMask = FirstElt->PermMask;
16021 auto SecondMask = SecondElt->PermMask;
16023 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16024 unsigned FirstPlusFour = FirstMask | 0x04040404;
16027 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16039 FirstElt = std::next(SecondElt);
16040 if (FirstElt == Srcs.
end())
16043 SecondElt = std::next(FirstElt);
16046 if (SecondElt == Srcs.
end()) {
16051 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16052 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
16058 return Perms.
size() == 2
16064 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16065 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16066 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16067 EntryMask += ZeroMask;
16072 auto Opcode =
Op.getOpcode();
16074 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16075 Opcode == AMDGPUISD::MUL_I24);
16078static std::optional<bool>
16089 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16092 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16094 assert(!(S0IsUnsigned && S0IsSigned));
16095 assert(!(S1IsUnsigned && S1IsSigned));
16103 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16109 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16110 return std::nullopt;
16122 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16123 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16128 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16134 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16135 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16136 return std::nullopt;
16142 DAGCombinerInfo &DCI)
const {
16143 SelectionDAG &DAG = DCI.DAG;
16144 EVT VT =
N->getValueType(0);
16150 if (Subtarget->hasMad64_32()) {
16151 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16156 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16160 if (VT == MVT::i64) {
16161 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16166 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16168 std::optional<bool> IsSigned;
16174 int ChainLength = 0;
16175 for (
int I = 0;
I < 4;
I++) {
16179 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16182 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16187 TempNode->getOperand(MulIdx), *Src0, *Src1,
16188 TempNode->getOperand(MulIdx)->getOperand(0),
16189 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16193 IsSigned = *IterIsSigned;
16194 if (*IterIsSigned != *IsSigned)
16197 auto AddIdx = 1 - MulIdx;
16200 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16201 Src2s.
push_back(TempNode->getOperand(AddIdx));
16211 TempNode->getOperand(AddIdx), *Src0, *Src1,
16212 TempNode->getOperand(AddIdx)->getOperand(0),
16213 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16217 if (*IterIsSigned != *IsSigned)
16221 ChainLength = I + 2;
16225 TempNode = TempNode->getOperand(AddIdx);
16227 ChainLength = I + 1;
16228 if (TempNode->getNumOperands() < 2)
16230 LHS = TempNode->getOperand(0);
16231 RHS = TempNode->getOperand(1);
16234 if (ChainLength < 2)
16240 if (ChainLength < 4) {
16250 bool UseOriginalSrc = false;
16251 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16252 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16253 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16254 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16255 SmallVector<unsigned, 4> SrcBytes;
16256 auto Src0Mask = Src0s.begin()->PermMask;
16257 SrcBytes.push_back(Src0Mask & 0xFF000000);
16258 bool UniqueEntries = true;
16259 for (auto I = 1; I < 4; I++) {
16260 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16263 UniqueEntries = false;
16269 if (UniqueEntries) {
16270 UseOriginalSrc = true;
16272 auto *FirstElt = Src0s.begin();
16276 auto *SecondElt = Src1s.begin();
16278 SecondElt->DWordOffset);
16287 if (!UseOriginalSrc) {
16294 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16297 : Intrinsic::amdgcn_udot4,
16307 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16312 unsigned Opc = LHS.getOpcode();
16324 auto Cond = RHS.getOperand(0);
16329 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
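// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// The add combine above recognizes a chain of byte multiplies and rewrites it
// as amdgcn_sdot4/amdgcn_udot4.  As a point of reference, this is a scalar
// model of the signed variant under the documented packed-dot semantics: four
// i8 x i8 products accumulated into a 32-bit integer.
#include <cassert>
#include <cstdint>

static int32_t sdot4Reference(uint32_t A, uint32_t B, int32_t Acc) {
  int32_t Sum = Acc;
  for (unsigned Byte = 0; Byte < 4; ++Byte) {
    int8_t ABy = int8_t(uint8_t(A >> (8 * Byte)));
    int8_t BBy = int8_t(uint8_t(B >> (8 * Byte)));
    Sum += int32_t(ABy) * int32_t(BBy);
  }
  return Sum;
}

int main() {
  // Pack {1, -2, 3, -4} and {4, 5, 6, 7}: 4 - 10 + 18 - 28 + 10 = -6.
  uint32_t A = uint32_t(uint8_t(1)) | (uint32_t(uint8_t(-2)) << 8) |
               (uint32_t(uint8_t(3)) << 16) | (uint32_t(uint8_t(-4)) << 24);
  uint32_t B = 4u | (5u << 8) | (6u << 16) | (7u << 24);
  assert(sdot4Reference(A, B, 10) == -6);
  return 0;
}
// --- End editor's note. ---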
16346 DAGCombinerInfo &DCI) const {
16347 SelectionDAG &DAG = DCI.DAG;
16349 EVT VT = N->getValueType(0);
16362 SDNodeFlags ShlFlags = N1->getFlags();
16366 SDNodeFlags NewShlFlags =
16371 DCI.AddToWorklist(Inner.getNode());
16378 if (Subtarget->hasMad64_32()) {
16379 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16388 if (VT == MVT::i64) {
16389 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16402 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16403 Y->isDivergent() != Z->isDivergent()) {
16412 if (Y->isDivergent())
16415 SDNodeFlags ReassocFlags =
16418 DCI.AddToWorklist(UniformInner.getNode());
16426 DAGCombinerInfo &DCI) const {
16427 SelectionDAG &DAG = DCI.DAG;
16428 EVT VT = N->getValueType(0);
16430 if (VT == MVT::i64) {
16431 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16435 if (VT != MVT::i32)
16444 unsigned Opc = RHS.getOpcode();
16451 auto Cond = RHS.getOperand(0);
16456 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16474 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16475 DAGCombinerInfo &DCI) const {
16477 if (N->getValueType(0) != MVT::i32)
16483 SelectionDAG &DAG = DCI.DAG;
16488 unsigned LHSOpc = LHS.getOpcode();
16489 unsigned Opc = N->getOpcode();
16493 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16499 DAGCombinerInfo &DCI) const {
16503 SelectionDAG &DAG = DCI.DAG;
16504 EVT VT = N->getValueType(0);
16516 if (A == LHS.getOperand(1)) {
16517 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16518 if (FusedOp != 0) {
16520 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16528 if (A == RHS.getOperand(1)) {
16529 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16530 if (FusedOp != 0) {
16532 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16541 DAGCombinerInfo &DCI) const {
16545 SelectionDAG &DAG = DCI.DAG;
16547 EVT VT = N->getValueType(0);
16560 if (A == LHS.getOperand(1)) {
16561 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16562 if (FusedOp != 0) {
16566 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16575 if (A == RHS.getOperand(1)) {
16576 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16577 if (FusedOp != 0) {
16579 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16588 DAGCombinerInfo &DCI) const {
16589 SelectionDAG &DAG = DCI.DAG;
16591 EVT VT = N->getValueType(0);
16600 SDNodeFlags Flags = N->getFlags();
16601 SDNodeFlags RHSFlags = RHS->getFlags();
16607 bool IsNegative = false;
16608 if (CLHS->isExactlyValue(1.0) ||
16609 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16615 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16625 DAGCombinerInfo &DCI) const {
16626 SelectionDAG &DAG = DCI.DAG;
16627 EVT VT = N->getValueType(0);
16631 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16632 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16647 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16652 const ConstantFPSDNode *FalseNode =
16662 if (ScalarVT == MVT::f32 &&
16668 if (TrueNodeExpVal == INT_MIN)
16671 if (FalseNodeExpVal == INT_MIN)
16691 DAGCombinerInfo &DCI) const {
16692 SelectionDAG &DAG = DCI.DAG;
16693 EVT VT = N->getValueType(0);
16696 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16714 (N->getFlags().hasAllowContract() &&
16715 FMA->getFlags().hasAllowContract())) {
16749 if (Vec1 == Vec2 || Vec3 == Vec4)
16755 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16756 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16764 DAGCombinerInfo &DCI) const {
16765 SelectionDAG &DAG = DCI.DAG;
16770 EVT VT = LHS.getValueType();
16799 return LHS.getOperand(0);
16807 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16814 const APInt &CT = LHS.getConstantOperandAPInt(1);
16815 const APInt &CF = LHS.getConstantOperandAPInt(2);
16823 return LHS.getOperand(0);
16855 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16860 {Op0Hi, Op1Hi, CarryInHi});
16870 DCI.CombineTo(LHS.getNode(), Result);
16874 if (VT != MVT::f32 && VT != MVT::f64 &&
16875 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16890 const unsigned IsInfMask =
16892 const unsigned IsFiniteMask =
16897 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
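// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// The setcc combine above folds comparisons into an FP_CLASS test: the input
// is checked against a bitmask of floating-point classes (the IsInfMask /
// IsFiniteMask constants).  The bit layout below is invented for illustration
// only; the actual ISA encoding of the class mask is not reproduced here.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>

enum ClassBits : uint32_t {
  ClassNaN = 1u << 0,
  ClassNegInf = 1u << 1,
  ClassNegFinite = 1u << 2,
  ClassPosFinite = 1u << 3,
  ClassPosInf = 1u << 4,
};

static bool testClass(double X, uint32_t Mask) {
  uint32_t Bit;
  if (std::isnan(X))
    Bit = ClassNaN;
  else if (std::isinf(X))
    Bit = std::signbit(X) ? ClassNegInf : ClassPosInf;
  else
    Bit = std::signbit(X) ? ClassNegFinite : ClassPosFinite;
  return (Mask & Bit) != 0;
}

int main() {
  const uint32_t IsInfMask = ClassNegInf | ClassPosInf;
  const uint32_t IsFiniteMask = ClassNegFinite | ClassPosFinite;
  assert(testClass(std::numeric_limits<double>::infinity(), IsInfMask));
  assert(!testClass(42.0, IsInfMask));
  assert(testClass(-42.0, IsFiniteMask));
  return 0;
}
// --- End editor's note. ---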
16906 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16907 DAGCombinerInfo &DCI) const {
16908 SelectionDAG &DAG = DCI.DAG;
16910 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16929 unsigned ShiftOffset = 8 * Offset;
16931 ShiftOffset -= C->getZExtValue();
16933 ShiftOffset += C->getZExtValue();
16935 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16936 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16937 MVT::f32, Shifted);
16948 DCI.AddToWorklist(N);
16955 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
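// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// CVT_F32_UBYTE<N> converts byte N of a 32-bit source to float.  The combine
// above folds a shift of the source into the byte index (ShiftOffset is
// adjusted by the shift amount and divided by 8), which the model below
// demonstrates for the simple in-range case.
#include <cassert>
#include <cstdint>

static float cvtF32UByteN(uint32_t Src, unsigned N) {
  return float((Src >> (8 * N)) & 0xFF);
}

int main() {
  uint32_t X = 0x40302010u;
  // cvt_f32_ubyte0(x >> 8) selects the same byte as cvt_f32_ubyte1(x).
  assert(cvtF32UByteN(X >> 8, 0) == cvtF32UByteN(X, 1));
  assert(cvtF32UByteN(X, 2) == 48.0f); // byte 2 is 0x30
  return 0;
}
// --- End editor's note. ---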
16961 DAGCombinerInfo &DCI) const {
16966 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16970 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16971 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16974 APFloat One(F.getSemantics(), "1.0");
16976 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
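// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// The clamp combine above constant-folds the CLAMP node: values are clamped
// to [0.0, 1.0], and when the function runs in DX10Clamp mode a NaN input
// folds to +0.0 (the F.isNaN() && DX10Clamp branch).  A scalar model of that
// behavior, assuming those semantics:
#include <algorithm>
#include <cassert>
#include <cmath>

static float clampToUnitInterval(float X, bool DX10Clamp) {
  if (std::isnan(X))
    return DX10Clamp ? 0.0f : X; // NaN propagates unless DX10Clamp is set
  return std::min(std::max(X, 0.0f), 1.0f);
}

int main() {
  assert(clampToUnitInterval(1.5f, /*DX10Clamp=*/true) == 1.0f);
  assert(clampToUnitInterval(-0.25f, /*DX10Clamp=*/true) == 0.0f);
  assert(clampToUnitInterval(std::nanf(""), /*DX10Clamp=*/true) == 0.0f);
  assert(std::isnan(clampToUnitInterval(std::nanf(""), /*DX10Clamp=*/false)));
  return 0;
}
// --- End editor's note. ---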
16982 DAGCombinerInfo &DCI) const {
17003 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17004 bool isInteger = LHS.getValueType().isInteger();
17007 if (!isFloatingPoint && !isInteger)
17012 if (!isEquality && !isNonEquality)
17029 if (isFloatingPoint) {
17031 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17042 if (!(isEquality && TrueVal == ConstVal) &&
17043 !(isNonEquality && FalseVal == ConstVal))
17050 SelectLHS, SelectRHS);
17055 switch (N->getOpcode()) {
17071 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17081 switch (N->getOpcode()) {
17083 return performAddCombine(N, DCI);
17085 return performPtrAddCombine(N, DCI);
17087 return performSubCombine(N, DCI);
17090 return performAddCarrySubCarryCombine(N, DCI);
17092 return performFAddCombine(N, DCI);
17094 return performFSubCombine(N, DCI);
17096 return performFDivCombine(N, DCI);
17098 return performFMulCombine(N, DCI);
17100 return performSetCCCombine(N, DCI);
17102 if (auto Res = performSelectCombine(N, DCI))
17117 case AMDGPUISD::FMIN_LEGACY:
17118 case AMDGPUISD::FMAX_LEGACY:
17119 return performMinMaxCombine(N, DCI);
17121 return performFMACombine(N, DCI);
17123 return performAndCombine(N, DCI);
17125 return performOrCombine(N, DCI);
17128 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17129 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17135 return performXorCombine(N, DCI);
17137 return performZeroExtendCombine(N, DCI);
17139 return performSignExtendInRegCombine(N, DCI);
17140 case AMDGPUISD::FP_CLASS:
17141 return performClassCombine(N, DCI);
17143 return performFCanonicalizeCombine(N, DCI);
17144 case AMDGPUISD::RCP:
17145 return performRcpCombine(N, DCI);
17147 case AMDGPUISD::FRACT:
17148 case AMDGPUISD::RSQ:
17149 case AMDGPUISD::RCP_LEGACY:
17150 case AMDGPUISD::RCP_IFLAG:
17151 case AMDGPUISD::RSQ_CLAMP: {
17160 return performUCharToFloatCombine(N, DCI);
17162 return performFCopySignCombine(N, DCI);
17163 case AMDGPUISD::CVT_F32_UBYTE0:
17164 case AMDGPUISD::CVT_F32_UBYTE1:
17165 case AMDGPUISD::CVT_F32_UBYTE2:
17166 case AMDGPUISD::CVT_F32_UBYTE3:
17167 return performCvtF32UByteNCombine(N, DCI);
17168 case AMDGPUISD::FMED3:
17169 return performFMed3Combine(N, DCI);
17170 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17171 return performCvtPkRTZCombine(N, DCI);
17172 case AMDGPUISD::CLAMP:
17173 return performClampCombine(N, DCI);
17176 EVT VT = N->getValueType(0);
17179 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17182 EVT EltVT = Src.getValueType();
17183 if (EltVT != MVT::i16)
17193 return performExtractVectorEltCombine(N, DCI);
17195 return performInsertVectorEltCombine(N, DCI);
17197 return performFPRoundCombine(N, DCI);
17206 return performMemSDNodeCombine(MemNode, DCI);
17237 unsigned Opcode = Node->getMachineOpcode();
17240 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17241 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17244 SDNode *Users[5] = {nullptr};
17246 unsigned DmaskIdx =
17247 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17248 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17249 unsigned NewDmask = 0;
17250 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17251 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17252 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17253 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17254 unsigned TFCLane = 0;
17255 bool HasChain = Node->getNumValues() > 1;
17257 if (OldDmask == 0) {
17265 TFCLane = OldBitsSet;
17269 for (SDUse &Use : Node->uses()) {
17272 if (Use.getResNo() != 0)
17275 SDNode *User = Use.getUser();
17278 if (!User->isMachineOpcode() ||
17279 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17291 if (UsesTFC && Lane == TFCLane) {
17296 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17298 Dmask &= ~(1 << Comp);
17306 NewDmask |= 1 << Comp;
17311 bool NoChannels = !NewDmask;
17318 if (OldBitsSet == 1)
17324 if (NewDmask == OldDmask)
17333 unsigned NewChannels = BitsSet + UsesTFC;
17337 assert(NewOpcode != -1 &&
17338 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17339 "failed to find equivalent MIMG op");
17347 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17349 MVT ResultVT = NewChannels == 1
17352 : NewChannels == 5 ? 8
17354 SDVTList NewVTList =
17357 MachineSDNode *NewNode =
17366 if (NewChannels == 1) {
17376 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17381 if (i || !NoChannels)
17386 if (NewUser != User) {
17396 Idx = AMDGPU::sub1;
17399 Idx = AMDGPU::sub2;
17402 Idx = AMDGPU::sub3;
17405 Idx = AMDGPU::sub4;
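// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// adjustWritemask above shrinks an image instruction's dmask to the channels
// that are actually read: lane N of the result corresponds to the N-th set
// bit of the old dmask, so only those bits are carried into the new dmask.
// A standalone model of that mapping (TFE/LWE handling omitted):
#include <cassert>
#include <initializer_list>

static unsigned shrinkDmask(unsigned OldDmask,
                            std::initializer_list<unsigned> UsedLanes) {
  unsigned NewDmask = 0;
  for (unsigned Lane : UsedLanes) {
    unsigned Dmask = OldDmask;
    unsigned Comp = 0;
    // Find the component index of the Lane-th set bit of OldDmask.
    for (unsigned I = 0; I <= Lane && Dmask != 0; ++I) {
      Comp = 0;
      while (!(Dmask & (1u << Comp)))
        ++Comp;
      Dmask &= ~(1u << Comp);
    }
    NewDmask |= 1u << Comp;
  }
  return NewDmask;
}

int main() {
  // dmask 0b1011 returns components x, y and w in lanes 0, 1 and 2.  If only
  // lane 2 is read, the shrunk dmask keeps just component w.
  assert(shrinkDmask(0b1011, {2}) == 0b1000);
  // Reading lanes 0 and 2 keeps components x and w.
  assert(shrinkDmask(0b1011, {0, 2}) == 0b1001);
  return 0;
}
// --- End editor's note. ---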
17416 Op = Op.getOperand(0);
17437 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17441 Node->getOperand(0), SL, VReg, SrcVal,
17447 return ToResultReg.getNode();
17452 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17454 Ops.push_back(Node->getOperand(i));
17460 Node->getOperand(i).getValueType(),
17461 Node->getOperand(i)),
17473 unsigned Opcode = Node->getMachineOpcode();
17475 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17476 !TII->isGather4(Opcode) &&
17478 return adjustWritemask(Node, DAG);
17481 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17487 case AMDGPU::V_DIV_SCALE_F32_e64:
17488 case AMDGPU::V_DIV_SCALE_F64_e64: {
17498 (Src0 == Src1 || Src0 == Src2))
17554 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17555 unsigned InitIdx = 0;
17557 if (TII->isImage(MI)) {
17565 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17566 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17567 unsigned D16Val = D16 ? D16->getImm() : 0;
17569 if (!TFEVal && !LWEVal)
17580 assert(MO_Dmask && "Expected dmask operand in instruction");
17582 unsigned dmask = MO_Dmask->getImm();
17587 bool Packed = !Subtarget->hasUnpackedD16VMem();
17589 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17596 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17597 if (DstSize < InitIdx)
17601 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17609 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17610 unsigned NewDst = 0;
17615 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17616 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17619 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17620 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17640 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17652 if (TII->isVOP3(MI.getOpcode())) {
17654 TII->legalizeOperandsVOP3(MRI, MI);
17656 if (TII->isMAI(MI)) {
17661 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17662 AMDGPU::OpName::scale_src0);
17663 if (Src0Idx != -1) {
17664 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17665 AMDGPU::OpName::scale_src1);
17666 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17667 TII->usesConstantBus(MRI, MI, Src1Idx))
17668 TII->legalizeOpWithMove(MI, Src1Idx);
17675 if (TII->isImage(MI))
17676 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17750 std::pair<unsigned, const TargetRegisterClass *>
17757 if (Constraint.size() == 1) {
17761 if (VT == MVT::Other)
17764 switch (Constraint[0]) {
17771 RC = &AMDGPU::SReg_32RegClass;
17774 RC = &AMDGPU::SGPR_64RegClass;
17779 return std::pair(0U, nullptr);
17786 return std::pair(0U, nullptr);
17788 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17789 : &AMDGPU::VGPR_32_Lo256RegClass;
17792 RC = Subtarget->has1024AddressableVGPRs()
17793 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17796 return std::pair(0U, nullptr);
17801 if (!Subtarget->hasMAIInsts())
17805 return std::pair(0U, nullptr);
17807 RC = &AMDGPU::AGPR_32RegClass;
17812 return std::pair(0U, nullptr);
17817 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17821 RC = &AMDGPU::AV_32RegClass;
17824 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17826 return std::pair(0U, nullptr);
17835 return std::pair(0U, RC);
17838 if (Kind != '\0') {
17840 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17841 } else if (Kind == 's') {
17842 RC = &AMDGPU::SGPR_32RegClass;
17843 } else if (Kind == 'a') {
17844 RC = &AMDGPU::AGPR_32RegClass;
17850 return std::pair(0U, nullptr);
17856 return std::pair(0U, nullptr);
17860 RC = TRI->getVGPRClassForBitWidth(Width);
17862 RC = TRI->getSGPRClassForBitWidth(Width);
17864 RC = TRI->getAGPRClassForBitWidth(Width);
17866 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17871 return std::pair(0U, nullptr);
17873 return std::pair(Reg, RC);
17879 return std::pair(0U, nullptr);
17880 if (Idx < RC->getNumRegs())
17882 return std::pair(0U, nullptr);
17888 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17894 if (Constraint.size() == 1) {
17895 switch (Constraint[0]) {
17905 } else if (Constraint == "DA" || Constraint == "DB") {
17913 if (Constraint.size() == 1) {
17914 switch (Constraint[0]) {
17922 } else if (Constraint.size() == 2) {
17923 if (Constraint == "VA")
17941 std::vector<SDValue> &Ops,
17956 unsigned Size = Op.getScalarValueSizeInBits();
17960 if (Size == 16 && !Subtarget->has16BitInsts())
17964 Val = C->getSExtValue();
17968 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17972 if (Size != 16 || Op.getNumOperands() != 2)
17974 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17977 Val = C->getSExtValue();
17981 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17991 if (Constraint.size() == 1) {
17992 switch (Constraint[0]) {
18007 } else if (Constraint.size() == 2) {
18008 if (Constraint == "DA") {
18009 int64_t HiBits = static_cast<int32_t>(Val >> 32);
18010 int64_t LoBits = static_cast<int32_t>(Val);
18014 if (Constraint == "DB") {
18022 unsigned MaxSize) const {
18023 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18024 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18026 MVT VT = Op.getSimpleValueType();
18051 switch (UnalignedClassID) {
18052 case AMDGPU::VReg_64RegClassID:
18053 return AMDGPU::VReg_64_Align2RegClassID;
18054 case AMDGPU::VReg_96RegClassID:
18055 return AMDGPU::VReg_96_Align2RegClassID;
18056 case AMDGPU::VReg_128RegClassID:
18057 return AMDGPU::VReg_128_Align2RegClassID;
18058 case AMDGPU::VReg_160RegClassID:
18059 return AMDGPU::VReg_160_Align2RegClassID;
18060 case AMDGPU::VReg_192RegClassID:
18061 return AMDGPU::VReg_192_Align2RegClassID;
18062 case AMDGPU::VReg_224RegClassID:
18063 return AMDGPU::VReg_224_Align2RegClassID;
18064 case AMDGPU::VReg_256RegClassID:
18065 return AMDGPU::VReg_256_Align2RegClassID;
18066 case AMDGPU::VReg_288RegClassID:
18067 return AMDGPU::VReg_288_Align2RegClassID;
18068 case AMDGPU::VReg_320RegClassID:
18069 return AMDGPU::VReg_320_Align2RegClassID;
18070 case AMDGPU::VReg_352RegClassID:
18071 return AMDGPU::VReg_352_Align2RegClassID;
18072 case AMDGPU::VReg_384RegClassID:
18073 return AMDGPU::VReg_384_Align2RegClassID;
18074 case AMDGPU::VReg_512RegClassID:
18075 return AMDGPU::VReg_512_Align2RegClassID;
18076 case AMDGPU::VReg_1024RegClassID:
18077 return AMDGPU::VReg_1024_Align2RegClassID;
18078 case AMDGPU::AReg_64RegClassID:
18079 return AMDGPU::AReg_64_Align2RegClassID;
18080 case AMDGPU::AReg_96RegClassID:
18081 return AMDGPU::AReg_96_Align2RegClassID;
18082 case AMDGPU::AReg_128RegClassID:
18083 return AMDGPU::AReg_128_Align2RegClassID;
18084 case AMDGPU::AReg_160RegClassID:
18085 return AMDGPU::AReg_160_Align2RegClassID;
18086 case AMDGPU::AReg_192RegClassID:
18087 return AMDGPU::AReg_192_Align2RegClassID;
18088 case AMDGPU::AReg_256RegClassID:
18089 return AMDGPU::AReg_256_Align2RegClassID;
18090 case AMDGPU::AReg_512RegClassID:
18091 return AMDGPU::AReg_512_Align2RegClassID;
18092 case AMDGPU::AReg_1024RegClassID:
18093 return AMDGPU::AReg_1024_Align2RegClassID;
18109 if (Info->isEntryFunction()) {
18116 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18118 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18119 : TRI->getAlignedHighSGPRForRC(MF, 2,
18120 &AMDGPU::SGPR_64RegClass);
18121 Info->setSGPRForEXECCopy(SReg);
18123 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18124 Info->getStackPtrOffsetReg()));
18125 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18126 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18130 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18131 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18133 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18134 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18136 Info->limitOccupancy(MF);
18138 if (ST.isWave32() && !MF.empty()) {
18139 for (auto &MBB : MF) {
18140 for (auto &MI : MBB) {
18141 TII->fixImplicitOperands(MI);
18151 if (ST.needsAlignedVGPRs()) {
18152 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18158 if (NewClassID != -1)
18159 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18168 const APInt &DemandedElts,
18170 unsigned Depth) const {
18172 unsigned Opc = Op.getOpcode();
18175 unsigned IID = Op.getConstantOperandVal(0);
18177 case Intrinsic::amdgcn_mbcnt_lo:
18178 case Intrinsic::amdgcn_mbcnt_hi: {
18184 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18194 Op, Known, DemandedElts, DAG, Depth);
18210 unsigned MaxValue =
18217 unsigned BFEWidth, bool SExt, unsigned Depth) {
18221 unsigned Src1Cst = 0;
18222 if (Src1.isImm()) {
18223 Src1Cst = Src1.getImm();
18224 } else if (Src1.isReg()) {
18228 Src1Cst = Cst->Value.getZExtValue();
18239 if (Width >= BFEWidth)
18248 Known = Known.sext(BFEWidth);
18250 Known = Known.zext(BFEWidth);
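// --- Editor's note: illustrative sketch, not part of SIISelLowering.cpp. ---
// knownBitsForSBFE above reasons about the scalar bitfield-extract family:
// take Width bits of the source starting at Offset and zero- or sign-extend
// them to the full register width.  The reference below models only that
// extract (the operand encoding of S_BFE is not reproduced):
#include <cassert>
#include <cstdint>

static uint32_t bfe32(uint32_t Val, unsigned Offset, unsigned Width,
                      bool Signed) {
  if (Width == 0)
    return 0;
  uint32_t FieldMask = Width < 32 ? (1u << Width) - 1u : ~0u;
  uint32_t Field = (Val >> Offset) & FieldMask;
  if (Signed && (Field & (1u << (Width - 1))))
    Field |= ~FieldMask; // replicate the sign bit of the extracted field
  return Field;
}

int main() {
  assert(bfe32(0xABCD1234u, 8, 8, /*Signed=*/false) == 0x12u);
  // A signed extract of byte 0xCD sign-extends to 0xFFFFFFCD.
  assert(bfe32(0xABCD1234u, 16, 8, /*Signed=*/true) == 0xFFFFFFCDu);
  return 0;
}
// --- End editor's note. ---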
18256 unsigned Depth) const {
18259 switch (MI->getOpcode()) {
18260 case AMDGPU::S_BFE_I32:
18263 case AMDGPU::S_BFE_U32:
18266 case AMDGPU::S_BFE_I64:
18269 case AMDGPU::S_BFE_U64:
18272 case AMDGPU::G_INTRINSIC:
18273 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18276 case Intrinsic::amdgcn_workitem_id_x:
18279 case Intrinsic::amdgcn_workitem_id_y:
18282 case Intrinsic::amdgcn_workitem_id_z:
18285 case Intrinsic::amdgcn_mbcnt_lo:
18286 case Intrinsic::amdgcn_mbcnt_hi: {
18298 case Intrinsic::amdgcn_groupstaticsize: {
18309 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18312 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18315 case AMDGPU::G_AMDGPU_SMED3:
18316 case AMDGPU::G_AMDGPU_UMED3: {
18317 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18344 unsigned Depth) const {
18351 AttributeList Attrs =
18353 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18380 if (Header->getAlignment() != PrefAlign)
18381 return Header->getAlignment();
18383 unsigned LoopSize = 0;
18388 LoopSize += MBB->getAlignment().value() / 2;
18391 LoopSize += TII->getInstSizeInBytes(MI);
18392 if (LoopSize > 192)
18397 if (LoopSize <= 64)
18400 if (LoopSize <= 128)
18401 return CacheLineAlign;
18407 auto I = Exit->getFirstNonDebugInstr();
18408 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18409 return CacheLineAlign;
18418 if (PreTerm == Pre->begin() ||
18419 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18423 auto ExitHead = Exit->getFirstNonDebugInstr();
18424 if (ExitHead == Exit->end() ||
18425 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18430 return CacheLineAlign;
18438 N = N->getOperand(0).getNode();
18448 switch (N->getOpcode()) {
18456 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18457 return !TRI->isSGPRReg(MRI, Reg);
18463 return !TRI->isSGPRReg(MRI, Reg);
18467 unsigned AS = L->getAddressSpace();
18477 case AMDGPUISD::ATOMIC_CMP_SWAP:
18478 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18479 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18480 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18481 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18482 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18483 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18484 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18485 case AMDGPUISD::BUFFER_ATOMIC_AND:
18486 case AMDGPUISD::BUFFER_ATOMIC_OR:
18487 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18488 case AMDGPUISD::BUFFER_ATOMIC_INC:
18489 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18490 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18491 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18492 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18493 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18499 return A->readMem() && A->writeMem();
18520 switch (Ty.getScalarSizeInBits()) {
18532 const APInt &DemandedElts,
18535 unsigned Depth) const {
18536 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18540 if (Info->getMode().DX10Clamp)
18552 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18572 << "Hardware instruction generated for atomic "
18574 << " operation at memory scope " << MemScope;
18579 Type *EltTy = VT->getElementType();
18580 return VT->getNumElements() == 2 &&
18600 unsigned BW = IT->getBitWidth();
18601 return BW == 32 || BW == 64;
18615 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18616 return BW == 32 || BW == 64;
18619 if (Ty->isFloatTy() || Ty->isDoubleTy())
18623 return VT->getNumElements() == 2 &&
18624 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18634 bool HasSystemScope) {
18641 if (HasSystemScope) {
18650 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18663 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18689 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18702 bool HasSystemScope =
18734 if (!IT || IT->getBitWidth() != 32)
18740 if (Subtarget->hasEmulatedSystemScopeAtomics())
18756 if (!HasSystemScope &&
18757 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18769 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18777 ConstVal && ConstVal->isNullValue())
18815 if (Ty->isFloatTy()) {
18820 if (Ty->isDoubleTy()) {
18841 if (Ty->isFloatTy() &&
18842 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18855 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18859 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18863 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18868 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18873 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18877 if (Ty->isFloatTy()) {
18880 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18883 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18888 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18896 if (Subtarget->hasFlatAtomicFaddF32Inst())
18905 if (Subtarget->hasLDSFPAtomicAddF32()) {
18906 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18908 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18936 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18938 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18942 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18944 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18997 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18998 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18999 : &AMDGPU::SReg_32RegClass;
19000 if (!TRI->isSGPRClass(RC) && !isDivergent)
19001 return TRI->getEquivalentSGPRClass(RC);
19002 if (TRI->isSGPRClass(RC) && isDivergent) {
19003 if (Subtarget->hasGFX90AInsts())
19004 return TRI->getEquivalentAVClass(RC);
19005 return TRI->getEquivalentVGPRClass(RC);
19018 unsigned WaveSize) {
19023 if (!IT || IT->getBitWidth() != WaveSize)
19028 if (!Visited.insert(V).second)
19030 bool Result = false;
19031 for (const auto *U : V->users()) {
19033 if (V == U->getOperand(1)) {
19038 case Intrinsic::amdgcn_if_break:
19039 case Intrinsic::amdgcn_if:
19040 case Intrinsic::amdgcn_else:
19045 if (V == U->getOperand(0)) {
19050 case Intrinsic::amdgcn_end_cf:
19051 case Intrinsic::amdgcn_loop:
19057 Result = hasCFUser(U, Visited, WaveSize);
19066 const Value *V) const {
19068 if (CI->isInlineAsm()) {
19077 for (auto &TC : TargetConstraints) {
19091 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19119 return MRI.hasOneNonDBGUse(N0);
19126 if (I.getMetadata("amdgpu.noclobber"))
19128 if (I.getMetadata("amdgpu.last.use"))
19192 Alignment = RMW->getAlign();
19205 bool FullFlatEmulation =
19207 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19208 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19209 RMW->getType()->isDoubleTy()));
19212 bool ReturnValueIsUsed = !AI->use_empty();
19221 if (FullFlatEmulation) {
19232 std::prev(BB->end())->eraseFromParent();
19233 Builder.SetInsertPoint(BB);
19235 Value *LoadedShared = nullptr;
19236 if (FullFlatEmulation) {
19237 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19238 {Addr}, nullptr, "is.shared");
19239 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19240 Builder.SetInsertPoint(SharedBB);
19241 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19247 LoadedShared = Clone;
19249 Builder.CreateBr(PhiBB);
19250 Builder.SetInsertPoint(CheckPrivateBB);
19253 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19254 {Addr}, nullptr, "is.private");
19255 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19257 Builder.SetInsertPoint(PrivateBB);
19259 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19262 Value *LoadedPrivate;
19264 LoadedPrivate = Builder.CreateAlignedLoad(
19265 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19268 LoadedPrivate, RMW->getValOperand());
19270 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19272 auto [ResultLoad, Equal] =
19278 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19281 Builder.CreateBr(PhiBB);
19283 Builder.SetInsertPoint(GlobalBB);
19287 if (FullFlatEmulation) {
19288 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19297 if (!FullFlatEmulation) {
19302 MDNode *RangeNotPrivate =
19305 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19309 Builder.CreateBr(PhiBB);
19311 Builder.SetInsertPoint(PhiBB);
19313 if (ReturnValueIsUsed) {
19316 if (FullFlatEmulation)
19317 Loaded->addIncoming(LoadedShared, SharedBB);
19318 Loaded->addIncoming(LoadedPrivate, PrivateBB);
19319 Loaded->addIncoming(LoadedGlobal, GlobalBB);
19320 Loaded->takeName(AI);
19323 Builder.CreateBr(ExitBB);
19327 unsigned PtrOpIdx) {
19328 Value *PtrOp = I->getOperand(PtrOpIdx);
19335 I->setOperand(PtrOpIdx, ASCast);
19347 ConstVal && ConstVal->isNullValue()) {
19377 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19385 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19400 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
Analysis containing CSE Info
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
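Illustrative sketch of how the Instruction helpers above compose when an instruction must be re-emitted elsewhere; I, TargetBB and InsertPt are assumed placeholders.
Instruction *Clone = I->clone();        // copy; not yet linked into any block
Clone->insertInto(TargetBB, InsertPt);  // TargetBB/InsertPt: hypothetical destination
I->replaceAllUsesWith(Clone);
I->eraseFromParent();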
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
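Illustrative sketch of the LLT accessors above; the address space number is an arbitrary example.
LLT S32 = LLT::scalar(32);                 // plain 32-bit scalar
LLT P1  = LLT::pointer(1, 64);             // 64-bit pointer in address space 1
unsigned Bits = S32.getScalarSizeInBits(); // 32
TypeSize Sz   = P1.getSizeInBits();        // 64 bits
LLT S16 = S32.changeElementSize(16);       // scalar case: simply a 16-bit scalar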
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
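Illustrative sketch of the MVT queries above.
MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);     // v4i32
MVT I64   = MVT::getIntegerVT(64);             // i64
bool IsVec     = V4I32.isVector();             // true
unsigned NElts = V4I32.getVectorNumElements(); // 4
MVT Elt        = V4I32.getScalarType();        // i32
TypeSize Bytes = V4I32.getStoreSize();         // 16 bytes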
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
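Illustrative sketch: the MachineBasicBlock helpers above are the backbone of custom-inserter expansions (see EmitInstrWithCustomInserter further down). MI is an assumed MachineInstr being expanded; the tail-splitting shape below is generic, not copied from this file.
MachineBasicBlock *BB = MI.getParent();
MachineFunction *MF = BB->getParent();
// Move everything after MI into a fresh block and re-point successor PHIs at it.
MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock(BB->getBasicBlock());
MF->insert(std::next(BB->getIterator()), TailBB);
TailBB->splice(TailBB->begin(), BB,
               std::next(MachineBasicBlock::iterator(MI)), BB->end());
TailBB->transferSuccessorsAndUpdatePHIs(BB);
BB->addSuccessor(TailBB);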
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
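Illustrative sketch of the MachineFunction/MachineFrameInfo pieces above during argument lowering; ArgOffset, PhysReg and RC are placeholders chosen by the calling convention.
MachineFrameInfo &MFI = MF.getFrameInfo();
// Stack-passed argument: pin a fixed object at its incoming-SP offset.
int FI = MFI.CreateFixedObject(/*Size=*/4, /*SPOffset=*/ArgOffset, /*IsImmutable=*/true);
// Register-passed argument: record the live-in and get a fresh virtual register.
Register VReg = MF.addLiveIn(PhysReg, RC);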
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
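Illustrative sketch: MachineInstrBuilder is usually reached via BuildMI; the opcode, registers and OrigMI below are placeholders.
// Build "DstReg = SOME_OP SrcReg, 7" before iterator I in MBB.
BuildMI(MBB, I, DL, TII->get(MyTarget::SOME_OP), DstReg)
    .addReg(SrcReg)
    .addImm(7)
    .cloneMemRefs(OrigMI);  // reuse OrigMI's memory operands on the new instruction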
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
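Illustrative sketch: allocating a MachineMemOperand with the flags above for a 4-byte invariant load; PtrInfo is an assumed MachinePointerInfo and the flag combination is only an example.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(32), Align(4));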
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
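Illustrative sketch: a small SDNode/SDValue pattern check in the style used by DAG combines; the helper itself is hypothetical.
// Recognize "(add x, constant)" where the add has a single use.
static bool isAddWithConstant(SDValue Op, uint64_t &Imm) {
  if (Op.getOpcode() != ISD::ADD || !Op.getNode()->hasOneUse())
    return false;
  auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (!C)
    return false;
  Imm = C->getZExtValue();
  return true;
}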
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
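Illustrative sketch: many of the SITargetLowering overrides above follow the same dispatch shape. The target name, helpers and handled opcodes below are placeholders, not AMDGPU's actual lowering.
SDValue MyTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default:
    return SDValue();                         // leave the node to common code
  case ISD::FP_EXTEND:
    return lowerFP_EXTEND(Op, DAG);           // hypothetical helper
  case ISD::DYNAMIC_STACKALLOC:
    return LowerDYNAMIC_STACKALLOC(Op, DAG);  // hypothetical helper
  }
}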
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
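Illustrative sketch of how the SelectionDAG builders above combine inside a custom lowering routine; the clamp operation is arbitrary, and a real target would pick the setcc result type via getSetCCResultType.
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue X    = Op.getOperand(0);
SDValue Zero = DAG.getConstant(0, DL, VT);
// Clamp negative values to zero: (x < 0) ? 0 : x.
SDValue Cond = DAG.getSetCC(DL, MVT::i1, X, Zero, ISD::SETLT);
return DAG.getSelect(DL, VT, Cond, Zero, X);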
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
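Illustrative sketch: StringSwitch commonly backs name lookups such as getRegisterByName; the register names and values are placeholders.
static Register parseRegName(StringRef Name) {
  return StringSwitch<Register>(Name)
      .Case("sp", MyTarget::SP)   // placeholder physical registers
      .Case("fp", MyTarget::FP)
      .Default(Register());
}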
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
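Illustrative sketch: the TargetLoweringBase hooks above are normally called from a target's constructor. The register class, opcodes and choices below are hypothetical and deliberately not AMDGPU's configuration; Subtarget is an assumed member.
addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);    // placeholder class
setOperationAction(ISD::ROTR, MVT::i32, Expand);         // say, no native rotate
setTruncStoreAction(MVT::i64, MVT::i16, Expand);         // no truncating i64->i16 store
setBooleanContents(ZeroOrOneBooleanContent);
setTargetDAGCombine(ISD::FADD);                          // ask for a custom combine hook
computeRegisterProperties(Subtarget->getRegisterInfo()); // after all classes are added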
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
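A hedged sketch of the Type queries above (helper name is illustrative): classify a scalar or vector type by its element type.

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;
  static bool hasI32OrHalfElements(Type *Ty) {
    Type *Elt = Ty->getScalarType();      // peels one vector level, if any
    return Elt->isIntegerTy(32) || Elt->isHalfTy();
  }
  // e.g. hasI32OrHalfElements(Type::getInt32Ty(Ctx)) is true for an i32,
  // where Ctx is an existing LLVMContext.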
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
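A short sketch tying the Value interface above together (illustrative helper): swap an instruction for a simplified value while keeping the old name for readable IR.

  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  static void replaceInst(Instruction *Old, Value *New) {
    New->takeName(Old);            // move the IR name across
    Old->replaceAllUsesWith(New);  // redirect every Use edge to New
    if (Old->use_empty())
      Old->eraseFromParent();      // the dead instruction can now be removed
  }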
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
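A hedged sketch (header path and the exact predicate vary across LLVM versions; this is not the backend's own helper) of testing whether an address space from this enum is one the flat aperture can reach:

  #include "llvm/Support/AMDGPUAddrSpace.h" // location of AMDGPUAS varies by version
  using namespace llvm;
  // Flat instructions can address global, local (LDS) and private (scratch)
  // memory in addition to the flat address space itself.
  static bool reachableThroughFlat(unsigned AS) {
    return AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS ||
           AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  }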
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid character code (or 0) in the first tuple element if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
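As a rough example (in-tree AMDGPU headers; the function name is illustrative), checking whether a 64-bit immediate can use an inline constant encoding on a given subtarget:

  #include "GCNSubtarget.h"          // AMDGPU backend in-tree header
  #include "Utils/AMDGPUBaseInfo.h"  // AMDGPU backend in-tree header
  using namespace llvm;
  // Inline constants are free to encode; 1/(2*pi) is only inline on newer GPUs.
  static bool isFreeImmediate(int64_t Imm, const GCNSubtarget &ST) {
    return AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm());
  }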
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the llvm.readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 undefined, 0 round toward zero, 1 round to nearest (ties to even), 2 round toward +infinity, 3 round toward -infinity.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum; these match FMINNUM_IEEE and FMAXNUM_IEEE except in their handling of signaling NaNs.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
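To make the node list concrete, a small sketch (illustrative helper built from standard SelectionDAG calls) that splits a 64-bit value into 32-bit halves with EXTRACT_ELEMENT and reassembles it with BUILD_PAIR:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;
  static SDValue splitAndRejoin(SelectionDAG &DAG, const SDLoc &DL, SDValue V64) {
    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, V64,
                             DAG.getIntPtrConstant(0, DL));
    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, V64,
                             DAG.getIntPtrConstant(1, DL));
    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi); // round-trips V64
  }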
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
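A brief sketch of the IR-level matchers in use (hypothetical helper); the GlobalISel matchers above (mi_match, m_Neg, m_GFCstOrSplat) follow the same shape on machine IR:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;
  // Recognize (X << 1) + 1 and bind X when the pattern matches.
  static bool matchShlOnePlusOne(Value *V, Value *&X) {
    return match(V, m_Add(m_Shl(m_Value(X), m_One()), m_One()));
  }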
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to MachineFunction proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
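A compact sketch combining several of these math helpers (illustrative functions, standard llvm/Support APIs):

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>
  using namespace llvm;
  // Pad a byte count to 16 bytes and count the 32-bit words it occupies.
  static uint64_t paddedWordCount(uint64_t Bytes) {
    return divideCeil(alignTo(Bytes, Align(16)), 4u);
  }
  // A multiply by a power-of-two scale can be rewritten as a left shift.
  static unsigned shiftForScale(uint32_t Scale) {
    return isPowerOf2_32(Scale) ? Log2_32(Scale) : 0;
  }
  // Split a 64-bit immediate and test whether it fits in a 12-bit field.
  static bool fitsLow12(uint64_t Imm) {
    return Hi_32(Imm) == 0 && (Lo_32(Imm) & ~maskTrailingOnes<uint32_t>(12)) == 0;
  }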
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
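A hedged sketch using the EVT queries above (helper name is illustrative): pick an integer type with the same store size as VT, which is handy when bitcasting loads and stores.

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;
  static EVT integerOfSameSize(LLVMContext &Ctx, EVT VT) {
    if (VT.isVector())
      return VT.changeTypeToInteger();  // vector with integer elements of equal width
    return EVT::getIntegerVT(Ctx, VT.getSizeInBits().getFixedValue());
  }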
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
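A small sketch of the KnownBits helpers above (illustrative function; assumes both inputs track values of the same bit width):

  #include "llvm/Support/KnownBits.h"
  using namespace llvm;
  static unsigned minLeadingZerosOfSum(const KnownBits &L, const KnownBits &R) {
    KnownBits Sum = KnownBits::add(L, R);  // conservative known bits of L + R
    return Sum.countMinLeadingZeros();     // top bits guaranteed to be zero
  }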
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
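A short sketch of the MachinePointerInfo factories above (illustrative helper; FI is an existing frame index): describe a fixed stack slot, then a 4-byte offset into the same object.

  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineMemOperand.h"
  using namespace llvm;
  static MachinePointerInfo secondWordOfSlot(MachineFunction &MF, int FI) {
    MachinePointerInfo Slot = MachinePointerInfo::getFixedStack(MF, FI);
    return Slot.getWithOffset(4); // same frame object, +4 bytes
  }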
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const