#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;

      TRI->getDefaultVectorSuperClassForBitWidth(32);
      TRI->getDefaultVectorSuperClassForBitWidth(64);
      TRI->getDefaultVectorSuperClassForBitWidth(320));
      TRI->getDefaultVectorSuperClassForBitWidth(352));
      TRI->getDefaultVectorSuperClassForBitWidth(384));
      TRI->getDefaultVectorSuperClassForBitWidth(512));
      TRI->getDefaultVectorSuperClassForBitWidth(1024));
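  // Each addRegisterClass call above pairs a legal value type with an SGPR or
  // VGPR register class of the matching bit width, obtained from the register
  // info's vector super-class query.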
  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

      TRI->getDefaultVectorSuperClassForBitWidth(1024));

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},
      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},
      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

      {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
       MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
       MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
       MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
       MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {

      {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
       MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
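  // Each Vec64 loop above configures one family of 64-bit-element vector
  // types; in the (elided) loop bodies these are handled in terms of the
  // equivalent 32-bit integer vector operations.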
  if (Subtarget->hasPkMovB32()) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasMadMacF32Insts())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarryInsts())

      {MVT::f32, MVT::f64}, Custom);
      {MVT::f32, MVT::f64}, Legal);
  if (Subtarget->haveRoundOpsF64())
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasBF16TransInsts())

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
       MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
      {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
       MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
       MVT::v32f16, MVT::v32bf16},
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
      {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  if (Subtarget->hasVOP3PInsts()) {
      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
      {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
       MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
       MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
      {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
  if (Subtarget->hasPackedFP32Ops()) {
      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
  if (Subtarget->has16BitInsts()) {
      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())
  if (Subtarget->hasMad64_32())
  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())
  if (Subtarget->hasVOP3PInsts()) {
      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
  if (Subtarget->hasIntMinMax64())

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
      {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
       MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
       MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
       MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
      {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

  static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                          EVT DestVT, EVT SrcVT) const {
  return ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                          LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  return Subtarget->has16BitInsts()
  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {
      MVT SimpleIntermediateVT =
      IntermediateVT = SimpleIntermediateVT;
      RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
      NumIntermediates = (NumElts + 1) / 2;
      return (NumElts + 1) / 2;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
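  // Worked example for the breakdown above (assuming 16-bit instructions are
  // available): a v3f16 argument yields IntermediateVT = v2f16 with
  // NumIntermediates = (3 + 1) / 2 = 2, while a v5i32 argument takes the
  // 32-bit scalar path and is split into one i32 register per element.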
                             unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                             unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
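  // The case groups above bucket the *_b8/_b32/_b64/_b128 (and the
  // 32x4B/16x8B/8x16B cooperative-atomic) intrinsic variants by access width,
  // i.e. i8, i32, i64 and i128 memory types respectively.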
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;
    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
                     std::numeric_limits<unsigned>::max());
      if (RsrcIntr->IsImage) {
      if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
        Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                     std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
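  // getTgtMemIntrinsic: each case above fills in Info (opcode, memVT, ptrVal,
  // alignment and memory-operand flags) so instruction selection can attach an
  // accurate MachineMemOperand to the intrinsic's memory access.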
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();

                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
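  // getAddrModeArguments reports which operand of the intrinsic is the pointer
  // (operand 0 for most loads/atomics, operand 1 for the LDS-destination
  // forms) together with the accessed type, so addressing-mode folding can
  // treat these intrinsics like ordinary loads and stores.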
                                                    unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())
  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);
    if (!Subtarget->hasScalarSubwordLoads()) {
      if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
    return Subtarget->hasFlatScratchEnabled()
               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  if (!Subtarget->hasRelaxedBufferOOBMode() &&
  return Size >= 32 && Alignment >= Align(4);
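  // Summary of the alignment checks above: LDS accesses are keyed off a
  // per-size required alignment (4 bytes for 64-bit accesses that can use
  // ds_read2_b32 pairs, 8 bytes for 128-bit accesses), scratch wants 4-byte
  // alignment unless unaligned scratch access is enabled, and buffer/global
  // accesses of at least 32 bits fall back to a 4-byte requirement.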
                                                  unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

                                           const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {
  unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                         const SDLoc &SL) const {
                                         const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
  MachinePointerInfo PtrInfo =
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
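  // Kernel arguments are loaded from the kernarg segment pointer at the given
  // byte offset; sub-dword values are fetched with a 4-byte aligned-down load
  // and then converted (and, upstream, shifted) to the expected argument type.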
                                              const SDLoc &SL) const {
      ExtType, SL, VA.getLocVT(), Chain, FIN,
  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
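  // When the cluster dimensions are known at compile time, the cluster
  // workgroup id / max-id queries above fold to constants via LoadConstant
  // instead of reading the dedicated SGPRs.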
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");
      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;
  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
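  // User SGPRs are claimed in ABI order above: implicit buffer pointer,
  // private segment buffer, dispatch pointer, queue pointer, kernarg segment
  // pointer, dispatch id, flat scratch init, and private segment size.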
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
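  // Kernarg preloading: in-reg kernel arguments forming a contiguous prefix of
  // the signature are assigned user SGPRs here; a gap or an over-sized run of
  // padding ends the preload sequence (InPreloadSequence = false) and the
  // remaining arguments are loaded from memory instead.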
                                             bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
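  // System SGPRs (workgroup ids, workgroup info, private segment wave byte
  // offset) are allocated after the user SGPRs; with architected SGPRs the
  // workgroup ids are supplied by hardware and need no explicit allocation
  // here.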
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.hasFlatScratchEnabled()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
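  // Fixed ABI register choices made above: SGPR32 becomes the stack pointer
  // when it is not already a live-in, and SGPR33 is used as the frame pointer
  // whenever frame lowering decides one is required.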
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

         !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
         !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
  if (!Subtarget->hasFlatScratchEnabled())
      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->hasFlatScratchEnabled())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                    {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                      TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                             PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    Val = convertABITypeToValueType(DAG, Val, VA, DL);

    auto &ArgUsageInfo =
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
            .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
      ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain

                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  unsigned Opc = AMDGPUISD::ENDPGM;
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;
    auto &ArgUsageInfo =
        &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
      CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
          : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                             "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);
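  // llvm.amdgcn.cs.chain lowering: the EXEC argument must be an integer of
  // wavefront width; a zero flags value forbids any extra arguments, while a
  // non-zero value selects the dynamic-VGPR form (wave32 only), which also
  // forwards the NumVGPRs/fallback arguments to the chained callee.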
  bool IsSibCall = false;
                              "unsupported call to variadic function ");
                                "unsupported required tail call to function ");
                                           Outs, OutVals, Ins, DAG);
             "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->hasFlatScratchEnabled()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();
  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                                 ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                          Outs[i].Flags.getNonZeroByValAlign(),
                          nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                                       MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

  unsigned OPC = AMDGPUISD::TC_RETURN;
    OPC = AMDGPUISD::TC_RETURN_GFX;
    OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                           : AMDGPUISD::TC_RETURN_CHAIN;
  if (Info->isWholeWaveFunction())
    OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
  uint64_t CalleePopBytes = NumBytes;

  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
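  // Dynamic stack allocation scales sizes and alignments by the wavefront
  // size because the stack pointer is a per-wave byte offset into swizzled
  // scratch memory.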
  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
                                   Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                  TableEntry, EnumOffset);
            static_cast<uint32_t>(ConstMode->getZExtValue()),
  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
  if (Subtarget->hasSafeSmemPrefetch())
  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)
                                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                           ReadFirstLaneID, NewModeReg);
                           ReadFirstLaneID, NewTrapReg);
  unsigned ModeHwReg =
  unsigned TrapHwReg =
                              IntrinID, ModeHwRegImm, NewModeReg);
                              IntrinID, TrapHwRegImm, NewTrapReg);

          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                               "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  MBB.addSuccessor(LoopBB);
  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);
  Src->setIsKill(false);
  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
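  // Waterfall loop for divergent indexing: each iteration picks one active
  // lane's index with V_READFIRSTLANE_B32, compares it against every lane
  // (V_CMP_EQ_U32), restricts EXEC to the matching lanes, performs the access,
  // and repeats until every lane has been serviced.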
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
  MI.eraseFromParent();
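  // Without a scalar 64-bit add/sub instruction, the pseudo is expanded into
  // an S_ADD_U32 / S_ADDC_U32 (or S_SUB_U32 / S_SUBB_U32) pair over the
  // sub0/sub1 halves of the operands.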
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_SUB_F32_e64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:
      "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MIN_NUM_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
    return 0x7FF8000000000000;
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
    return 0x8000000000000000;
      "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;

  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
         Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
         Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
         Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;

  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
    case AMDGPU::S_MIN_U32:
    case AMDGPU::S_MIN_I32:
    case AMDGPU::V_MIN_F32_e64:
    case AMDGPU::S_MAX_U32:
    case AMDGPU::S_MAX_I32:
    case AMDGPU::V_MAX_F32_e64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32: {
    case AMDGPU::V_CMP_LT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_MIN_F64_e64:
    case AMDGPU::V_MIN_NUM_F64_e64:
    case AMDGPU::V_MAX_F64_e64:
    case AMDGPU::V_MAX_NUM_F64_e64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64: {
    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64:
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::V_ADD_F32_e64:
    case AMDGPU::V_ADD_F64_e64:
    case AMDGPU::V_ADD_F64_pseudo_e64:
    case AMDGPU::S_SUB_I32:
    case AMDGPU::S_SUB_U64_PSEUDO:
    case AMDGPU::V_SUB_F32_e64: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      bool IsWave32 = ST.isWave32();
      unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      unsigned BitCountOpc =
          IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

      auto NewAccumulator =
      case AMDGPU::S_XOR_B32:
      case AMDGPU::S_XOR_B64: {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            .addReg(NewAccumulator->getOperand(0).getReg())
        if (Opc == AMDGPU::S_XOR_B32) {
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
              TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
              MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
      case AMDGPU::S_SUB_I32: {
        Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
            .addReg(NewAccumulator->getOperand(0).getReg());
      case AMDGPU::S_ADD_I32: {
            .addReg(NewAccumulator->getOperand(0).getReg());
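      // Uniform-input fast path: when the source is an SGPR the reduction can
      // be computed directly from the value and the active-lane count
      // (S_BCNT1 of EXEC); e.g. upstream, add becomes value * popcount(exec)
      // and xor keeps only the parity of the active-lane count.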
      case AMDGPU::S_ADD_U64_PSEUDO:
      case AMDGPU::S_SUB_U64_PSEUDO: {
        Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
            MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);

        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
              .addReg(NewAccumulator->getOperand(0).getReg())
        Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                                 : NewAccumulator->getOperand(0).getReg();
        Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
        if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
      case AMDGPU::V_ADD_F32_e64:
      case AMDGPU::V_ADD_F64_e64:
      case AMDGPU::V_ADD_F64_pseudo_e64:
      case AMDGPU::V_SUB_F32_e64: {
        Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
        Register DstVreg = MRI.createVirtualRegister(VregRC);
                TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
                                    : AMDGPU::V_CVT_F64_I32_e64),
            .addReg(NewAccumulator->getOperand(0).getReg())
            (Opc == AMDGPU::V_SUB_F32_e64 ||
             MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
        unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
                                     ? AMDGPU::V_MUL_F64_pseudo_e64
                                     : AMDGPU::V_MUL_F64_e64;
          BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
              MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
              MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
              TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
              TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
                                           VregRC, AMDGPU::sub0, VregSubRC);
              TII->buildExtractSubRegOrImm(MI, MRI, DestVregInst->getOperand(0),
                                           VregRC, AMDGPU::sub1, VregSubRC);
          BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
        MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
          MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
      Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
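    // Reduction loop body: S_FF1 finds the lowest remaining active lane,
    // V_READLANE_B32 extracts that lane's value (two reads for 64-bit
    // sources, recombined with REG_SEQUENCE), and the value is folded into
    // the running accumulator below.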
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_XOR_B64: {
            .addReg(LaneValue->getOperand(0).getReg())
  case AMDGPU::V_CMP_GT_I64_e64:
  case AMDGPU::V_CMP_GT_U64_e64:
  case AMDGPU::V_CMP_LT_I64_e64:
  case AMDGPU::V_CMP_LT_U64_e64: {
    Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
        MRI.createVirtualRegister(WaveMaskRegClass);
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
        TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
        TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
    Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
        VregClass, AMDGPU::sub0, VSubRegClass);
        VregClass, AMDGPU::sub1, VSubRegClass);
    BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
        .addReg(LaneValue->getOperand(0).getReg())
        .addReg(AccumulatorVReg);
    unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
    BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
    NewAccumulator =
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CSELECT_B64), DstReg)
            .addReg(LaneValue->getOperand(0).getReg())
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MIN_NUM_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64: {
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
        TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
        TRI->getSubRegisterClass(VregRC, AMDGPU::sub0);
    Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
    Register DstVreg = MRI.createVirtualRegister(VregRC);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
    MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
        .addReg(LaneValue->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
    TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
                                 VregRC, AMDGPU::sub0, VregSubRC);
    TII->buildExtractSubRegOrImm(Iters, MRI, DstVregInst->getOperand(0),
                                 VregRC, AMDGPU::sub1, VregSubRC);
    ReadLaneLo.add(Op1L);
    ReadLaneHi.add(Op1H);
    NewAccumulator =
        BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
        .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc = IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
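  // Rough shape of the emitted reduction loop (sketch only; wave32 case shown,
  // register names are illustrative):
  //
  // ComputeLoop:
  //   %accum      = PHI [identity, entry], [%new_accum, ComputeLoop]
  //   %active     = PHI [exec,     entry], [%new_active, ComputeLoop]
  //   %lane       = S_FF1_I32_B32 %active          ; lowest live lane
  //   %lane_val   = V_READLANE_B32 %src, %lane
  //   %new_accum  = <op> %accum, %lane_val         ; per-opcode combine
  //   %new_active = S_BITSET0_B32 %lane, %active   ; retire that lane
  //   S_CMP_LG_U32 %new_active, 0
  //   S_CBRANCH_SCC1 ComputeLoop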
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
                               ? AMDGPU::V_MIN_NUM_F64_e64
                               : AMDGPU::V_MIN_F64_e64);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
                               ? AMDGPU::V_MAX_NUM_F64_e64
                               : AMDGPU::V_MAX_F64_e64);
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
                               ? AMDGPU::V_ADD_F64_pseudo_e64
                               : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
                               ? AMDGPU::V_ADD_F64_pseudo_e64
                               : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
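  // S_UADDO_PSEUDO / S_USUBO_PSEUDO: lower to the plain scalar add/sub and
  // materialize the overflow bit from SCC with an S_CSELECT of the
  // wave-appropriate width.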
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_U32
                       : AMDGPU::S_SUB_U32;
        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (ST.hasAddSubU64Insts()) {
              TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();

    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getWaveMaskRegClass();

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                 : &AMDGPU::VReg_64RegClass;
                                 : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
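  // S_ADD_CO_PSEUDO / S_SUB_CO_PSEUDO: scalar add/sub with an explicit
  // carry-in. VGPR operands are first made uniform with V_READFIRSTLANE_B32;
  // on wave64 the 64-bit carry mask may be folded to 32 bits (S_OR_B32 of its
  // halves) before the compare that feeds S_ADDC_U32 / S_SUBB_U32.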
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
      Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
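  // SI_INIT_M0 becomes either a COPY (register source) or an S_MOV_B32
  // (immediate source) into M0.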
  case AMDGPU::SI_INIT_M0: {
            TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
            TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V3:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V5:
  case AMDGPU::SI_INDIRECT_SRC_V6:
  case AMDGPU::SI_INDIRECT_SRC_V7:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V3:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V5:
  case AMDGPU::SI_INDIRECT_DST_V6:
  case AMDGPU::SI_INDIRECT_DST_V7:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
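  // V_CNDMASK_B64_PSEUDO below is split into two 32-bit selects over the sub0
  // and sub1 halves of the operands, using a copy of the condition mask.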
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                 : &AMDGPU::VReg_64RegClass;
                                 : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
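  // S_SETREG_B32 writes that only touch the FP round/denorm bits of the MODE
  // register and whose source is a known immediate can be emitted as the
  // dedicated S_ROUND_MODE / S_DENORM_MODE instructions; otherwise the pseudo
  // is rewritten to S_SETREG_B32_mode.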
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
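    // The return pseudo's first operand is rewritten to the exec mask that
    // SI_SETUP_WHOLE_WAVE_FUNC saved, so the original exec can be restored on
    // the way out of the whole-wave function.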
6765 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
6769 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
6796 if (!Subtarget->hasMadMacF32Insts())
6797 return Subtarget->hasFastFMAF32();
6803 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
6806 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
6822 switch (Ty.getScalarSizeInBits()) {
6840 if (Ty.getScalarSizeInBits() == 16)
6842 if (Ty.getScalarSizeInBits() == 32)
6843 return Subtarget->hasMadMacF32Insts() &&
6853 EVT VT =
N->getValueType(0);
6855 return Subtarget->hasMadMacF32Insts() &&
6857 if (VT == MVT::f16) {
6858 return Subtarget->hasMadF16() &&
6873 unsigned Opc =
Op.getOpcode();
6874 EVT VT =
Op.getValueType();
6875 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6876 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6877 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6878 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6879 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6880 VT == MVT::v32bf16);
6896 [[maybe_unused]]
EVT VT =
Op.getValueType();
6898 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
6899 VT == MVT::v16i32) &&
6900 "Unexpected ValueType.");
6909 unsigned Opc =
Op.getOpcode();
6910 EVT VT =
Op.getValueType();
6911 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
6912 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
6913 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
6914 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6915 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
6916 VT == MVT::v32bf16);
6924 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
6926 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
6933 unsigned Opc =
Op.getOpcode();
6934 EVT VT =
Op.getValueType();
6935 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
6936 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
6937 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
6938 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
6939 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
6940 VT == MVT::v32bf16);
6945 : std::pair(Op0, Op0);
6954 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
6956 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
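  // Top-level custom-lowering dispatch: each custom-lowered opcode below is
  // forwarded to its dedicated lower*/Lower* helper.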
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerExternalSymbol(Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    Op.getValueType() == MVT::i16 &&
        Op.getOperand(0).getValueType() == MVT::f32) {
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
7121 EVT FittingLoadVT = LoadVT;
7153SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7156 bool IsIntrinsic)
const {
7159 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7160 EVT LoadVT =
M->getValueType(0);
7162 EVT EquivLoadVT = LoadVT;
7176 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7180 M->getMemoryVT(),
M->getMemOperand());
7191 EVT LoadVT =
M->getValueType(0);
7197 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7198 bool IsTFE =
M->getNumValues() == 3;
7200 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7201 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7202 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7203 : AMDGPUISD::BUFFER_LOAD;
7206 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7211 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7215 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7216 M->getMemOperand(), DAG);
7220 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7222 M->getMemOperand(), DAG);
7230 EVT VT =
N->getValueType(0);
7231 unsigned CondCode =
N->getConstantOperandVal(3);
7242 EVT CmpVT =
LHS.getValueType();
7243 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7244 unsigned PromoteOp =
7264 EVT VT =
N->getValueType(0);
7266 unsigned CondCode =
N->getConstantOperandVal(3);
7275 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7284 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7293 EVT VT =
N->getValueType(0);
7317 Exec = AMDGPU::EXEC_LO;
7319 Exec = AMDGPU::EXEC;
7336 EVT VT =
N->getValueType(0);
7338 unsigned IID =
N->getConstantOperandVal(0);
7339 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7340 IID == Intrinsic::amdgcn_permlanex16;
7341 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7342 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7346 unsigned SplitSize = 32;
7347 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7348 ST->hasDPALU_DPP() &&
7356 case Intrinsic::amdgcn_permlane16:
7357 case Intrinsic::amdgcn_permlanex16:
7358 case Intrinsic::amdgcn_update_dpp:
7363 case Intrinsic::amdgcn_writelane:
7366 case Intrinsic::amdgcn_readlane:
7367 case Intrinsic::amdgcn_set_inactive:
7368 case Intrinsic::amdgcn_set_inactive_chain_arg:
7369 case Intrinsic::amdgcn_mov_dpp8:
7372 case Intrinsic::amdgcn_readfirstlane:
7373 case Intrinsic::amdgcn_permlane64:
7381 std::reverse(Operands.
begin(), Operands.
end());
7383 if (
SDNode *GL =
N->getGluedNode()) {
7385 GL = GL->getOperand(0).getNode();
7395 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7396 IID == Intrinsic::amdgcn_mov_dpp8 ||
7397 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7398 Src1 =
N->getOperand(2);
7399 if (IID == Intrinsic::amdgcn_writelane ||
7400 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
7401 Src2 =
N->getOperand(3);
7404 if (ValSize == SplitSize) {
7414 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7419 if (IID == Intrinsic::amdgcn_writelane) {
7424 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7426 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
7429 if (ValSize % SplitSize != 0)
7433 EVT VT =
N->getValueType(0);
7437 unsigned NumOperands =
N->getNumOperands();
7439 SDNode *GL =
N->getGluedNode();
7444 for (
unsigned i = 0; i != NE; ++i) {
7445 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7447 SDValue Operand =
N->getOperand(j);
7456 Operands[j] = Operand;
7461 Operands[NumOperands - 1] =
7477 if (SplitSize == 32) {
7479 return unrollLaneOp(LaneOp.
getNode());
7485 unsigned SubVecNumElt =
7489 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
7490 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
7494 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
7499 if (IID == Intrinsic::amdgcn_writelane)
7504 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
7505 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
7506 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
7507 EltIdx += SubVecNumElt;
7521 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
7524 if (IID == Intrinsic::amdgcn_writelane)
7527 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
7534 EVT VT =
N->getValueType(0);
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
    Operands.append(IntrinArgs);

  SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                   {ShiftedIndex, ValueI32});

  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});

  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});

      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
  DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
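  // ds_bpermute only shuffles within a 32-lane half of the wave, so this path
  // computes both a same-half candidate (direct bpermute) and an other-half
  // candidate (value swapped across halves with permlane64, under WWM), then
  // uses the mbcnt-derived lane position to select which candidate each lane
  // keeps.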
7613 switch (
N->getOpcode()) {
7625 unsigned IID =
N->getConstantOperandVal(0);
7627 case Intrinsic::amdgcn_make_buffer_rsrc:
7628 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
7630 case Intrinsic::amdgcn_cvt_pkrtz: {
7635 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
7639 case Intrinsic::amdgcn_cvt_pknorm_i16:
7640 case Intrinsic::amdgcn_cvt_pknorm_u16:
7641 case Intrinsic::amdgcn_cvt_pk_i16:
7642 case Intrinsic::amdgcn_cvt_pk_u16: {
7648 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
7649 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7650 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
7651 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7652 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
7653 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7655 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7657 EVT VT =
N->getValueType(0);
7666 case Intrinsic::amdgcn_s_buffer_load: {
7672 if (!Subtarget->hasScalarSubwordLoads())
7678 EVT VT =
Op.getValueType();
7679 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
7691 if (!
Offset->isDivergent()) {
7710 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
7715 case Intrinsic::amdgcn_dead: {
7716 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
7727 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
7728 Results.push_back(Res.getOperand(
I));
7732 Results.push_back(Res.getValue(1));
7741 EVT VT =
N->getValueType(0);
7746 EVT SelectVT = NewVT;
7747 if (NewVT.
bitsLT(MVT::i32)) {
7750 SelectVT = MVT::i32;
7756 if (NewVT != SelectVT)
7762 if (
N->getValueType(0) != MVT::v2f16)
7774 if (
N->getValueType(0) != MVT::v2f16)
7786 if (
N->getValueType(0) != MVT::f16)
7801 if (U.get() !=
Value)
7804 if (U.getUser()->getOpcode() == Opcode)
7810unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
7813 case Intrinsic::amdgcn_if:
7814 return AMDGPUISD::IF;
7815 case Intrinsic::amdgcn_else:
7816 return AMDGPUISD::ELSE;
7817 case Intrinsic::amdgcn_loop:
7818 return AMDGPUISD::LOOP;
7819 case Intrinsic::amdgcn_end_cf:
7839 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
7866 SDNode *Intr = BRCOND.getOperand(1).getNode();
7883 Intr =
LHS.getNode();
7891 assert(BR &&
"brcond missing unconditional branch user");
7896 unsigned CFNode = isCFIntrinsic(Intr);
7916 Ops.push_back(Target);
7939 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
7958 MVT VT =
Op.getSimpleValueType();
7961 if (
Op.getConstantOperandVal(0) != 0)
7965 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
7967 if (
Info->isEntryFunction())
7984 return Op.getValueType().bitsLE(VT)
7992 EVT DstVT =
Op.getValueType();
7999 unsigned Opc =
Op.getOpcode();
8011 EVT SrcVT = Src.getValueType();
8012 EVT DstVT =
Op.getValueType();
8015 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8018 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8025 if (DstVT == MVT::f16) {
8030 if (!Subtarget->has16BitInsts()) {
8035 if (
Op->getFlags().hasApproximateFuncs()) {
8046 "custom lower FP_ROUND for f16 or bf16");
8047 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8059 EVT VT =
Op.getValueType();
8061 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8062 bool IsIEEEMode =
Info->getMode().IEEE;
8071 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8078SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8080 EVT VT =
Op.getValueType();
8082 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8083 bool IsIEEEMode =
Info->getMode().IEEE;
8088 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8096 EVT VT =
Op.getValueType();
8100 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8101 !Subtarget->hasMinimum3Maximum3F16() &&
8102 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8103 "should not need to widen f16 minimum/maximum to v2f16");
8117 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8125 EVT VT =
Op.getValueType();
8129 EVT ExpVT =
Exp.getValueType();
8130 if (ExpVT == MVT::i16)
8151 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8158 switch (
Op->getOpcode()) {
8188 DAGCombinerInfo &DCI)
const {
8189 const unsigned Opc =
Op.getOpcode();
8197 :
Op->getOperand(0).getValueType();
8198 auto &DAG = DCI.DAG;
8201 if (DCI.isBeforeLegalizeOps() ||
8209 LHS =
Op->getOperand(1);
8210 RHS =
Op->getOperand(2);
8212 LHS =
Op->getOperand(0);
8213 RHS =
Op->getOperand(1);
8252 if (MagVT == SignVT)
8269 EVT VT =
Op.getValueType();
8275 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8302 if (
Op->isDivergent())
8315 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8317 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8320 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8322 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8328 EVT VT =
Op.getValueType();
8335 const APInt &
C = RHSC->getAPIntValue();
8337 if (
C.isPowerOf2()) {
8339 bool UseArithShift = isSigned && !
C.isMinSignedValue();
8366 if (
Op->isDivergent()) {
8370 if (Subtarget->hasSMulHi()) {
8381 if (!Subtarget->hasTrapHandler() ||
8383 return lowerTrapEndpgm(
Op, DAG);
8385 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
8386 : lowerTrapHsaQueuePtr(
Op, DAG);
8392 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8396SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
8398 ImplicitParameter Param)
const {
8402 MachinePointerInfo PtrInfo =
8419 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
8422 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8425 if (UserSGPR == AMDGPU::NoRegister) {
8442 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8451 if (Subtarget->hasPrivEnabledTrap2NopBug())
8452 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
8456 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8464 if (!Subtarget->hasTrapHandler() ||
8468 "debugtrap handler not supported",
8476 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
8479SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
8481 if (Subtarget->hasApertureRegs()) {
8483 ? AMDGPU::SRC_SHARED_BASE
8484 : AMDGPU::SRC_PRIVATE_BASE;
8485 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
8486 !Subtarget->hasGloballyAddressableScratch()) &&
8487 "Cannot use src_private_base with globally addressable scratch!");
8508 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
8512 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8514 if (UserSGPR == AMDGPU::NoRegister) {
8559 const AMDGPUTargetMachine &TM =
8562 unsigned DestAS, SrcAS;
8564 bool IsNonNull =
false;
8566 SrcAS = ASC->getSrcAddressSpace();
8567 Src = ASC->getOperand(0);
8568 DestAS = ASC->getDestAddressSpace();
8571 Op.getConstantOperandVal(0) ==
8572 Intrinsic::amdgcn_addrspacecast_nonnull);
8573 Src =
Op->getOperand(1);
8574 SrcAS =
Op->getConstantOperandVal(2);
8575 DestAS =
Op->getConstantOperandVal(3);
8588 Subtarget->hasGloballyAddressableScratch()) {
8593 AMDGPU::S_MOV_B32, SL, MVT::i32,
8594 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
8602 unsigned NullVal = TM.getNullPointerValue(DestAS);
8617 Subtarget->hasGloballyAddressableScratch()) {
8626 if (Subtarget->isWave64())
8632 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
8640 AMDGPU::S_MOV_B64, SL, MVT::i64,
8641 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
8643 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
8645 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
8653 unsigned NullVal = TM.getNullPointerValue(SrcAS);
8665 Op.getValueType() == MVT::i64) {
8666 const SIMachineFunctionInfo *
Info =
8668 if (
Info->get32BitAddressHighBits() == 0)
8677 Src.getValueType() == MVT::i64)
8705 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
8710 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
8712 MVT::i32, InsNumElts / 2);
8717 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
8719 if (InsNumElts == 2) {
8732 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
8755 if (NumElts == 4 && EltSize == 16 && KIdx) {
8766 unsigned Idx = KIdx->getZExtValue();
8767 bool InsertLo = Idx < 2;
8771 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
8777 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
8790 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
8825 EVT ResultVT =
Op.getValueType();
8838 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
8841 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
8845 if (VecSize == 128) {
8853 }
else if (VecSize == 256) {
8856 for (
unsigned P = 0;
P < 4; ++
P) {
8862 Parts[0], Parts[1]));
8864 Parts[2], Parts[3]));
8870 for (
unsigned P = 0;
P < 8; ++
P) {
8877 Parts[0], Parts[1], Parts[2], Parts[3]));
8880 Parts[4], Parts[5], Parts[6], Parts[7]));
8900 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
8915 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
8925 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
8930 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
8931 !(Mask[Elt + 1] & 1);
8937 EVT ResultVT =
Op.getValueType();
8940 const int NewSrcNumElts = 2;
8942 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
8958 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
8980 if (ShouldUseConsecutiveExtract &&
8983 int VecIdx = Idx < SrcNumElts ? 0 : 1;
8984 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
8996 if (Idx0 >= SrcNumElts) {
9001 if (Idx1 >= SrcNumElts) {
9006 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9007 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9015 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9016 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9021 if (SubVec0 != SubVec1) {
9022 NewMaskIdx1 += NewSrcNumElts;
9029 {NewMaskIdx0, NewMaskIdx1});
9034 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9035 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9036 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9037 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9056 EVT ResultVT =
Op.getValueType();
9072 EVT VT =
Op.getValueType();
9074 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9075 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9109 for (
unsigned P = 0;
P < NumParts; ++
P) {
9111 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9130 if (!Subtarget->isAmdHsaOS())
9173 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9182 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9190 EVT PtrVT =
Op.getValueType();
9192 const GlobalValue *GV = GSD->
getGlobal();
9206 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9221 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9224 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9225 if (Subtarget->has64BitLiterals()) {
9256 MachinePointerInfo PtrInfo =
9269 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
9293 SDValue Param = lowerKernargMemParameter(
9304 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9312 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9320 unsigned NumElts = Elts.
size();
9322 if (NumElts <= 12) {
9331 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9337 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9347 EVT SrcVT = Src.getValueType();
9368 bool Unpacked,
bool IsD16,
int DMaskPop,
9369 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9373 EVT ReqRetVT = ResultTypes[0];
9375 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9376 ? (ReqRetNumElts + 1) / 2
9379 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9390 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9401 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9403 NumDataDwords - MaskPopDwords);
9408 EVT LegalReqRetVT = ReqRetVT;
9410 if (!
Data.getValueType().isInteger())
9412 Data.getValueType().changeTypeToInteger(),
Data);
9433 if (Result->getNumValues() == 1)
9440 SDValue *LWE,
bool &IsTexFail) {
9460 unsigned DimIdx,
unsigned EndIdx,
9461 unsigned NumGradients) {
9463 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9471 if (((
I + 1) >= EndIdx) ||
9472 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9473 I == DimIdx + NumGradients - 1))) {
9495 !
Op.getNode()->hasAnyUseOfValue(0))
9497 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9507 ResultTypes.erase(&ResultTypes[0]);
9513 int NumVDataDwords = 0;
9514 bool AdjustRetType =
false;
9515 bool IsAtomicPacked16Bit =
false;
9518 const unsigned ArgOffset = WithChain ? 2 : 1;
9521 unsigned DMaskLanes = 0;
9523 if (BaseOpcode->
Atomic) {
9524 VData =
Op.getOperand(2);
9526 IsAtomicPacked16Bit =
9527 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9528 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9529 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9530 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9541 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9543 DMask = Is64Bit ? 0xf : 0x3;
9544 NumVDataDwords = Is64Bit ? 4 : 2;
9546 DMask = Is64Bit ? 0x3 : 0x1;
9547 NumVDataDwords = Is64Bit ? 2 : 1;
9550 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9553 if (BaseOpcode->
Store) {
9554 VData =
Op.getOperand(2);
9558 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9562 VData = handleD16VData(VData, DAG,
true);
9565 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9566 }
else if (!BaseOpcode->
NoReturn) {
9571 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9579 (!LoadVT.
isVector() && DMaskLanes > 1))
9585 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9586 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9587 NumVDataDwords = (DMaskLanes + 1) / 2;
9589 NumVDataDwords = DMaskLanes;
9591 AdjustRetType =
true;
9595 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9602 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9603 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9605 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9607 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9608 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9612 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9618 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9622 "Bias needs to be converted to 16 bit in A16 mode");
9627 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9631 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9632 "require 16 bit args for both gradients and addresses");
9637 if (!
ST->hasA16()) {
9638 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9639 "support 16 bit addresses\n");
9649 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9651 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9653 IntrOpcode = G16MappingInfo->
G16;
9676 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9694 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9695 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9696 const bool UseNSA =
ST->hasNSAEncoding() &&
9697 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9698 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9699 const bool UsePartialNSA =
9700 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9703 if (UsePartialNSA) {
9705 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9706 }
else if (!UseNSA) {
9716 uint64_t UnormConst =
9717 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9719 Unorm = UnormConst ? True : False;
9725 bool IsTexFail =
false;
9726 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9737 NumVDataDwords += 1;
9738 AdjustRetType =
true;
9743 if (AdjustRetType) {
9746 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9755 MVT::i32, NumVDataDwords)
9758 ResultTypes[0] = NewVT;
9759 if (ResultTypes.size() == 3) {
9763 ResultTypes.erase(&ResultTypes[1]);
9777 Ops.push_back(VData);
9778 if (UsePartialNSA) {
9780 Ops.push_back(VAddr);
9784 Ops.push_back(VAddr);
9787 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9789 Ops.push_back(Rsrc);
9794 Ops.push_back(Samp);
9799 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9800 Ops.push_back(Unorm);
9802 Ops.push_back(IsA16 &&
9803 ST->hasFeature(AMDGPU::FeatureR128A16)
9807 Ops.push_back(IsA16 ? True : False);
9809 if (!Subtarget->hasGFX90AInsts())
9814 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9817 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9820 Ops.push_back(DimInfo->
DA ? True : False);
9822 Ops.push_back(IsD16 ? True : False);
9824 Ops.push_back(
Op.getOperand(0));
9826 int NumVAddrDwords =
9832 NumVDataDwords, NumVAddrDwords);
9833 }
else if (IsGFX11Plus) {
9835 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9836 : AMDGPU::MIMGEncGfx11Default,
9837 NumVDataDwords, NumVAddrDwords);
9838 }
else if (IsGFX10Plus) {
9840 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9841 : AMDGPU::MIMGEncGfx10Default,
9842 NumVDataDwords, NumVAddrDwords);
9844 if (Subtarget->hasGFX90AInsts()) {
9846 NumVDataDwords, NumVAddrDwords);
9850 "requested image instruction is not supported on this GPU",
9855 for (EVT VT : OrigResultTypes) {
9856 if (VT == MVT::Other)
9857 RetValues[Idx++] =
Op.getOperand(0);
9868 NumVDataDwords, NumVAddrDwords);
9871 NumVDataDwords, NumVAddrDwords);
9878 MachineMemOperand *MemRef = MemOp->getMemOperand();
9897 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9898 NumVDataDwords, IsAtomicPacked16Bit,
DL);
9911 MachinePointerInfo(),
9916 if (!
Offset->isDivergent()) {
9923 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9932 !Subtarget->hasScalarDwordx3Loads()) {
9936 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9959 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9961 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9965 unsigned NumLoads = 1;
9971 if (NumElts == 8 || NumElts == 16) {
9972 NumLoads = NumElts / 4;
9976 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9981 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9983 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9984 for (
unsigned i = 0; i < NumLoads; ++i) {
9986 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9990 if (NumElts == 8 || NumElts == 16)
9998 if (!Subtarget->hasArchitectedSGPRs())
10003 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10010 unsigned Width)
const {
10012 using namespace AMDGPU::Hwreg;
10014 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10053 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10055 EVT VT =
Op.getValueType();
10057 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
10061 switch (IntrinsicID) {
10062 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10065 return getPreloadedValue(DAG, *MFI, VT,
10068 case Intrinsic::amdgcn_dispatch_ptr:
10069 case Intrinsic::amdgcn_queue_ptr: {
10070 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10072 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10073 DL.getDebugLoc()));
10077 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10080 return getPreloadedValue(DAG, *MFI, VT, RegID);
10082 case Intrinsic::amdgcn_implicitarg_ptr: {
10084 return getImplicitArgPtr(DAG,
DL);
10085 return getPreloadedValue(DAG, *MFI, VT,
10088 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10094 return getPreloadedValue(DAG, *MFI, VT,
10097 case Intrinsic::amdgcn_dispatch_id: {
10100 case Intrinsic::amdgcn_rcp:
10101 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
10102 case Intrinsic::amdgcn_rsq:
10103 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
10104 case Intrinsic::amdgcn_rsq_legacy:
10108 case Intrinsic::amdgcn_rcp_legacy:
10111 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
10112 case Intrinsic::amdgcn_rsq_clamp: {
10114 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
10126 case Intrinsic::r600_read_ngroups_x:
10127 if (Subtarget->isAmdHsaOS())
10130 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10133 case Intrinsic::r600_read_ngroups_y:
10134 if (Subtarget->isAmdHsaOS())
10137 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10140 case Intrinsic::r600_read_ngroups_z:
10141 if (Subtarget->isAmdHsaOS())
10144 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10147 case Intrinsic::r600_read_local_size_x:
10148 if (Subtarget->isAmdHsaOS())
10151 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10153 case Intrinsic::r600_read_local_size_y:
10154 if (Subtarget->isAmdHsaOS())
10157 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10159 case Intrinsic::r600_read_local_size_z:
10160 if (Subtarget->isAmdHsaOS())
10163 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10165 case Intrinsic::amdgcn_workgroup_id_x:
10166 return lowerWorkGroupId(DAG, *MFI, VT,
10170 case Intrinsic::amdgcn_workgroup_id_y:
10171 return lowerWorkGroupId(DAG, *MFI, VT,
10175 case Intrinsic::amdgcn_workgroup_id_z:
10176 return lowerWorkGroupId(DAG, *MFI, VT,
10180 case Intrinsic::amdgcn_cluster_id_x:
10181 return Subtarget->hasClusters()
10182 ? getPreloadedValue(DAG, *MFI, VT,
10184 : DAG.getPOISON(VT);
10185 case Intrinsic::amdgcn_cluster_id_y:
10186 return Subtarget->hasClusters()
10187 ? getPreloadedValue(DAG, *MFI, VT,
10190 case Intrinsic::amdgcn_cluster_id_z:
10191 return Subtarget->hasClusters()
10192 ? getPreloadedValue(DAG, *MFI, VT,
10195 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10196 return Subtarget->hasClusters()
10197 ? getPreloadedValue(
10201 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10202 return Subtarget->hasClusters()
10203 ? getPreloadedValue(
10207 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10208 return Subtarget->hasClusters()
10209 ? getPreloadedValue(
10213 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10214 return Subtarget->hasClusters()
10217 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10218 return Subtarget->hasClusters()
10219 ? getPreloadedValue(
10223 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10224 return Subtarget->hasClusters()
10225 ? getPreloadedValue(
10229 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10230 return Subtarget->hasClusters()
10231 ? getPreloadedValue(
10235 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10236 return Subtarget->hasClusters()
10237 ? getPreloadedValue(
10241 case Intrinsic::amdgcn_wave_id:
10242 return lowerWaveID(DAG,
Op);
10243 case Intrinsic::amdgcn_lds_kernel_id: {
10245 return getLDSKernelId(DAG,
DL);
10246 return getPreloadedValue(DAG, *MFI, VT,
10249 case Intrinsic::amdgcn_workitem_id_x:
10250 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10251 case Intrinsic::amdgcn_workitem_id_y:
10252 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10253 case Intrinsic::amdgcn_workitem_id_z:
10254 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10255 case Intrinsic::amdgcn_wavefrontsize:
10257 SDLoc(
Op), MVT::i32);
10258 case Intrinsic::amdgcn_s_buffer_load: {
10259 unsigned CPol =
Op.getConstantOperandVal(3);
10266 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10267 Op.getOperand(3), DAG);
10269 case Intrinsic::amdgcn_fdiv_fast:
10270 return lowerFDIV_FAST(
Op, DAG);
10271 case Intrinsic::amdgcn_sin:
10272 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10274 case Intrinsic::amdgcn_cos:
10275 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10277 case Intrinsic::amdgcn_mul_u24:
10278 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10280 case Intrinsic::amdgcn_mul_i24:
10281 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10284 case Intrinsic::amdgcn_log_clamp: {
10290 case Intrinsic::amdgcn_fract:
10291 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10293 case Intrinsic::amdgcn_class:
10294 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10296 case Intrinsic::amdgcn_div_fmas:
10297 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10298 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10300 case Intrinsic::amdgcn_div_fixup:
10301 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10302 Op.getOperand(2),
Op.getOperand(3));
10304 case Intrinsic::amdgcn_div_scale: {
10310 SDValue Denominator =
Op.getOperand(2);
10317 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10319 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10320 Denominator, Numerator);
10322 case Intrinsic::amdgcn_icmp: {
10324 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10325 Op.getConstantOperandVal(2) == 0 &&
10330 case Intrinsic::amdgcn_fcmp: {
10333 case Intrinsic::amdgcn_ballot:
10335 case Intrinsic::amdgcn_fmed3:
10336 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10337 Op.getOperand(2),
Op.getOperand(3));
10338 case Intrinsic::amdgcn_fdot2:
10339 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10340 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10341 case Intrinsic::amdgcn_fmul_legacy:
10342 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10344 case Intrinsic::amdgcn_sffbh:
10345 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10346 case Intrinsic::amdgcn_sbfe:
10347 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10348 Op.getOperand(2),
Op.getOperand(3));
10349 case Intrinsic::amdgcn_ubfe:
10350 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10351 Op.getOperand(2),
Op.getOperand(3));
10352 case Intrinsic::amdgcn_cvt_pkrtz:
10353 case Intrinsic::amdgcn_cvt_pknorm_i16:
10354 case Intrinsic::amdgcn_cvt_pknorm_u16:
10355 case Intrinsic::amdgcn_cvt_pk_i16:
10356 case Intrinsic::amdgcn_cvt_pk_u16: {
10358 EVT VT =
Op.getValueType();
10361 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10362 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10363 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10364 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10365 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10366 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10367 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10368 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10370 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10373 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10376 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10379 case Intrinsic::amdgcn_fmad_ftz:
10380 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10381 Op.getOperand(2),
Op.getOperand(3));
10383 case Intrinsic::amdgcn_if_break:
10385 Op->getOperand(1),
Op->getOperand(2)),
10388 case Intrinsic::amdgcn_groupstaticsize: {
10394 const GlobalValue *GV =
10400 case Intrinsic::amdgcn_is_shared:
10401 case Intrinsic::amdgcn_is_private: {
10408 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10412 Subtarget->hasGloballyAddressableScratch()) {
10415 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10416 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10425 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10428 case Intrinsic::amdgcn_perm:
10429 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10430 Op.getOperand(2),
Op.getOperand(3));
10431 case Intrinsic::amdgcn_reloc_constant: {
10441 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10442 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10443 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10444 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10445 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10446 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10447 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10448 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10449 if (
Op.getOperand(4).getValueType() == MVT::i32)
10455 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10456 Op.getOperand(3), IndexKeyi32);
10458 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10459 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10460 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10461 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10462 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10463 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10464 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10465 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10466 if (
Op.getOperand(4).getValueType() == MVT::i64)
10472 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10473 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10474 Op.getOperand(6)});
10476 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10477 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10478 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10479 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10480 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10481 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10482 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10485 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10491 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10492 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
10493 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
10494 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
10495 Args.push_back(
Op.getOperand(9));
10498 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10499 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10500 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10501 if (
Op.getOperand(6).getValueType() == MVT::i32)
10507 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10508 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10509 IndexKeyi32, Op.getOperand(7)});
10511 case Intrinsic::amdgcn_addrspacecast_nonnull:
10512 return lowerADDRSPACECAST(
Op, DAG);
10513 case Intrinsic::amdgcn_readlane:
10514 case Intrinsic::amdgcn_readfirstlane:
10515 case Intrinsic::amdgcn_writelane:
10516 case Intrinsic::amdgcn_permlane16:
10517 case Intrinsic::amdgcn_permlanex16:
10518 case Intrinsic::amdgcn_permlane64:
10519 case Intrinsic::amdgcn_set_inactive:
10520 case Intrinsic::amdgcn_set_inactive_chain_arg:
10521 case Intrinsic::amdgcn_mov_dpp8:
10522 case Intrinsic::amdgcn_update_dpp:
10524 case Intrinsic::amdgcn_dead: {
10526 for (
const EVT ValTy :
Op.getNode()->values())
10530 case Intrinsic::amdgcn_wave_shuffle:
10533 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10535 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10545 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
10546 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10552 unsigned NewOpcode)
const {
10556 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10557 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10575 M->getMemOperand());
10580 unsigned NewOpcode)
const {
10584 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10585 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10603 M->getMemOperand());
10608 unsigned IntrID =
Op.getConstantOperandVal(1);
10612 case Intrinsic::amdgcn_ds_ordered_add:
10613 case Intrinsic::amdgcn_ds_ordered_swap: {
10618 unsigned IndexOperand =
M->getConstantOperandVal(7);
10619 unsigned WaveRelease =
M->getConstantOperandVal(8);
10620 unsigned WaveDone =
M->getConstantOperandVal(9);
10622 unsigned OrderedCountIndex = IndexOperand & 0x3f;
10623 IndexOperand &= ~0x3f;
10624 unsigned CountDw = 0;
10627 CountDw = (IndexOperand >> 24) & 0xf;
10628 IndexOperand &= ~(0xf << 24);
10630 if (CountDw < 1 || CountDw > 4) {
10633 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
10634 DL.getDebugLoc()));
10639 if (IndexOperand) {
10642 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
10645 if (WaveDone && !WaveRelease) {
10649 Fn,
"ds_ordered_count: wave_done requires wave_release",
10650 DL.getDebugLoc()));
10653 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
10654 unsigned ShaderType =
10656 unsigned Offset0 = OrderedCountIndex << 2;
10657 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
10660 Offset1 |= (CountDw - 1) << 6;
10663 Offset1 |= ShaderType << 2;
10665 unsigned Offset = Offset0 | (Offset1 << 8);
10672 M->getVTList(),
Ops,
M->getMemoryVT(),
10673 M->getMemOperand());
10675 case Intrinsic::amdgcn_raw_buffer_load:
10676 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10677 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10678 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10679 case Intrinsic::amdgcn_raw_buffer_load_format:
10680 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10681 const bool IsFormat =
10682 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10683 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10685 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10686 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10700 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10702 case Intrinsic::amdgcn_struct_buffer_load:
10703 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10704 case Intrinsic::amdgcn_struct_buffer_load_format:
10705 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10706 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10707 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10708 const bool IsFormat =
10709 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10710 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10712 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10713 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10728 case Intrinsic::amdgcn_raw_tbuffer_load:
10729 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10731 EVT LoadVT =
Op.getValueType();
10732 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10733 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10749 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10751 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10752 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10755 case Intrinsic::amdgcn_struct_tbuffer_load:
10756 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10758 EVT LoadVT =
Op.getValueType();
10759 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10760 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10776 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10778 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10779 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
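  // The raw/struct buffer atomic intrinsics below map one-to-one onto the
  // corresponding AMDGPUISD::BUFFER_ATOMIC_* nodes via the shared
  // lowerRawBufferAtomicIntrin / lowerStructBufferAtomicIntrin helpers.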
10782 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10783 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10784 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10785 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10786 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10787 return lowerStructBufferAtomicIntrin(Op, DAG,
10788 AMDGPUISD::BUFFER_ATOMIC_FADD);
10789 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10790 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10791 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10792 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10793 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10794 return lowerStructBufferAtomicIntrin(Op, DAG,
10795 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10796 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10797 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10798 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10799 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10800 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10801 return lowerStructBufferAtomicIntrin(Op, DAG,
10802 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10803 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10804 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10805 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10806 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10807 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10808 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10809 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10810 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10811 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10812 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10813 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10814 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10815 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10816 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
10817 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
10818 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
10819 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
10820 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
10821 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
10822 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
10823 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
10824 case Intrinsic::amdgcn_raw_buffer_atomic_and:
10825 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
10826 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10827 case Intrinsic::amdgcn_raw_buffer_atomic_or:
10828 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
10829 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10830 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
10831 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
10832 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10833 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
10834 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
10835 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10836 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
10837 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
10838 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10839 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
10840 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
10841 return lowerStructBufferAtomicIntrin(Op, DAG,
10842 AMDGPUISD::BUFFER_ATOMIC_SWAP);
10843 case Intrinsic::amdgcn_struct_buffer_atomic_add:
10844 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
10845 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10846 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
10847 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
10848 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10849 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
10850 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
10851 return lowerStructBufferAtomicIntrin(Op, DAG,
10852 AMDGPUISD::BUFFER_ATOMIC_SMIN);
10853 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
10854 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
10855 return lowerStructBufferAtomicIntrin(Op, DAG,
10856 AMDGPUISD::BUFFER_ATOMIC_UMIN);
10857 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
10858 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
10859 return lowerStructBufferAtomicIntrin(Op, DAG,
10860 AMDGPUISD::BUFFER_ATOMIC_SMAX);
10861 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
10862 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
10863 return lowerStructBufferAtomicIntrin(Op, DAG,
10864 AMDGPUISD::BUFFER_ATOMIC_UMAX);
10865 case Intrinsic::amdgcn_struct_buffer_atomic_and:
10866 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
10867 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
10868 case Intrinsic::amdgcn_struct_buffer_atomic_or:
10869 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
10870 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
10871 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
10872 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
10873 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
10874 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
10875 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
10876 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
10877 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
10878 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
10879 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
10880 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
10881 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
10882 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
10883 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
10884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
10885 return lowerStructBufferAtomicIntrin(Op, DAG,
10886 AMDGPUISD::BUFFER_ATOMIC_CSUB);
10887 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
10888 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
10889 return lowerRawBufferAtomicIntrin(Op, DAG,
10890 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
10891 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
10892 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
10893 return lowerStructBufferAtomicIntrin(Op, DAG,
10894 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
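// Each raw/struct buffer atomic intrinsic above maps one-to-one onto an
// AMDGPUISD::BUFFER_ATOMIC_* node; the shared lowerRawBufferAtomicIntrin and
// lowerStructBufferAtomicIntrin helpers essentially differ only in whether a
// vindex operand is expected.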
10895 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
10896 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
10897 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
10898 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
10912 EVT VT = Op.getValueType();
10916 Op->getVTList(), Ops, VT,
10917 M->getMemOperand());
10919 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
10920 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
10921 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
10922 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
10936 EVT VT = Op.getValueType();
10940 Op->getVTList(), Ops, VT,
10941 M->getMemOperand());
10943 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
10944 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
10946 SDValue NodePtr = M->getOperand(2);
10947 SDValue RayExtent = M->getOperand(3);
10948 SDValue InstanceMask = M->getOperand(4);
10949 SDValue RayOrigin = M->getOperand(5);
10950 SDValue RayDir = M->getOperand(6);
10952 SDValue TDescr = M->getOperand(8);
10957 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
10962 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
10963 const unsigned NumVDataDwords = 10;
10964 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
10966 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
10967 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
10968 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
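// Both ray-tracing opcodes return 10 VGPRs of hit data; only the address
// payload differs (11 address dwords for BVH8 versus 12 for the dual
// intersect variant), which is what selects the MIMG opcode above.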
10972 Ops.push_back(NodePtr);
10975 {DAG.getBitcast(MVT::i32, RayExtent),
10976 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
10977 Ops.push_back(RayOrigin);
10978 Ops.push_back(RayDir);
10979 Ops.push_back(Offsets);
10980 Ops.push_back(TDescr);
10981 Ops.push_back(M->getChain());
10984 MachineMemOperand *MemRef = M->getMemOperand();
10988 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
10990 SDValue NodePtr = M->getOperand(2);
10991 SDValue RayExtent = M->getOperand(3);
10992 SDValue RayOrigin = M->getOperand(4);
10993 SDValue RayDir = M->getOperand(5);
10994 SDValue RayInvDir = M->getOperand(6);
10995 SDValue TDescr = M->getOperand(7);
11002 if (!Subtarget->hasGFX10_AEncoding()) {
11012 const unsigned NumVDataDwords = 4;
11013 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11014 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11015 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11018 const unsigned BaseOpcodes[2][2] = {
11019 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11020 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11021 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11025 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11026 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11027 : AMDGPU::MIMGEncGfx10NSA,
11028 NumVDataDwords, NumVAddrDwords);
11032 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11033 : AMDGPU::MIMGEncGfx10Default,
11034 NumVDataDwords, NumVAddrDwords);
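// The MIMG opcode is picked per encoding: GFX12, GFX11 NSA or GFX10 NSA when
// NSA addressing is usable, otherwise the default packed-VGPR GFX10/GFX11
// encodings selected just above.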
11040 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11043 if (Lanes[0].getValueSizeInBits() == 32) {
11044 for (unsigned I = 0; I < 3; ++I)
11051 Ops.push_back(Lanes[2]);
11063 if (UseNSA && IsGFX11Plus) {
11064 Ops.push_back(NodePtr);
11066 Ops.push_back(RayOrigin);
11071 for (unsigned I = 0; I < 3; ++I) {
11074 {DirLanes[I], InvDirLanes[I]})));
11078 Ops.push_back(RayDir);
11079 Ops.push_back(RayInvDir);
11086 Ops.push_back(NodePtr);
11089 packLanes(RayOrigin, true);
11090 packLanes(RayDir, true);
11091 packLanes(RayInvDir, false);
11096 if (NumVAddrDwords > 12) {
11104 Ops.push_back(MergedOps);
11107 Ops.push_back(TDescr);
11109 Ops.push_back(M->getChain());
11112 MachineMemOperand *MemRef = M->getMemOperand();
11116 case Intrinsic::amdgcn_global_atomic_fmin_num:
11117 case Intrinsic::amdgcn_global_atomic_fmax_num:
11118 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11119 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11126 unsigned Opcode = 0;
11128 case Intrinsic::amdgcn_global_atomic_fmin_num:
11129 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11133 case Intrinsic::amdgcn_global_atomic_fmax_num:
11134 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11141 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11142 Ops, M->getMemOperand());
11144 case Intrinsic::amdgcn_s_get_barrier_state:
11145 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11152 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11153 BarID = (BarID >> 4) & 0x3F;
11154 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11157 Ops.push_back(Chain);
11159 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11160 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11168 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11176 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11177 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11178 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11182 EVT VT = Op->getValueType(0);
11188 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11190 return lowerImage(Op, ImageDimIntr, DAG, true);
11198SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11205 EVT VT = VTList.VTs[0];
11208 bool IsTFE = VTList.NumVTs == 3;
11211 unsigned NumOpDWords = NumValueDWords + 1;
11213 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11214 MachineMemOperand *OpDWordsMMO =
11216 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11217 OpDWordsVT, OpDWordsMMO, DAG);
11222 NumValueDWords == 1
11231 if (!Subtarget->hasDwordx3LoadStores() &&
11232 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11236 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11238 WidenedMemVT, WidenedMMO);
11248 bool ImageStore) const {
11258 if (Subtarget->hasUnpackedD16VMem()) {
11272 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11283 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11289 if ((NumElements % 2) == 1) {
11291 unsigned I = Elts.size() / 2;
11307 if (NumElements == 3) {
11328 unsigned IntrinsicID = Op.getConstantOperandVal(1);
11331 switch (IntrinsicID) {
11332 case Intrinsic::amdgcn_exp_compr: {
11333 if (!Subtarget->hasCompressedExport()) {
11336 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
11358 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
11362 case Intrinsic::amdgcn_struct_tbuffer_store:
11363 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
11365 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11367 VData = handleD16VData(VData, DAG);
11368 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11369 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11383 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11384 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11387 M->getMemoryVT(), M->getMemOperand());
11390 case Intrinsic::amdgcn_raw_tbuffer_store:
11391 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
11393 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
11395 VData = handleD16VData(VData, DAG);
11396 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11397 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11411 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
11412 : AMDGPUISD::TBUFFER_STORE_FORMAT;
11415 M->getMemoryVT(), M->getMemOperand());
11418 case Intrinsic::amdgcn_raw_buffer_store:
11419 case Intrinsic::amdgcn_raw_ptr_buffer_store:
11420 case Intrinsic::amdgcn_raw_buffer_store_format:
11421 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
11422 const bool IsFormat =
11423 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
11424 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
11431 VData = handleD16VData(VData, DAG);
11441 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11442 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11456 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
11457 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11462 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
11465 M->getMemoryVT(), M->getMemOperand());
11468 case Intrinsic::amdgcn_struct_buffer_store:
11469 case Intrinsic::amdgcn_struct_ptr_buffer_store:
11470 case Intrinsic::amdgcn_struct_buffer_store_format:
11471 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
11472 const bool IsFormat =
11473 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
11474 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
11482 VData = handleD16VData(VData, DAG);
11492 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11493 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11507 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
11508 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
11512 EVT VDataType = VData.getValueType().getScalarType();
11514 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
11517 M->getMemoryVT(), M->getMemOperand());
11519 case Intrinsic::amdgcn_raw_buffer_load_lds:
11520 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
11521 case Intrinsic::amdgcn_struct_buffer_load_lds:
11522 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
11523 if (!Subtarget->hasVMemToLDSLoad())
11527 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
11528 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
11529 unsigned OpOffset = HasVIndex ? 1 : 0;
11530 SDValue VOffset = Op.getOperand(5 + OpOffset);
11532 unsigned Size = Op->getConstantOperandVal(4);
11538 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
11539 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
11540 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
11541 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
11544 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
11545 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
11546 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
11547 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
11550 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
11551 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
11552 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
11553 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
11556 if (!Subtarget->hasLDSLoadB96_B128())
11558 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
11559 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
11560 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
11561 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
11564 if (!Subtarget->hasLDSLoadB96_B128())
11566 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
11567 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
11568 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
11569 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
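// The buffer-to-LDS opcode is chosen on two axes: the transfer size (1, 2, 4,
// 12 or 16 bytes) and the addressing mode, where BOTHEN = vindex + voffset,
// IDXEN = vindex only, OFFEN = voffset only and OFFSET = neither.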
11577 if (HasVIndex && HasVOffset)
11581 else if (HasVIndex)
11582 Ops.push_back(Op.getOperand(5));
11583 else if (HasVOffset)
11584 Ops.push_back(VOffset);
11586 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11587 Ops.push_back(Rsrc);
11588 Ops.push_back(Op.getOperand(6 + OpOffset));
11589 Ops.push_back(Op.getOperand(7 + OpOffset));
11591 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
11604 MachineMemOperand *LoadMMO = M->getMemOperand();
11609 MachinePointerInfo StorePtrI = LoadPtrI;
11633 case Intrinsic::amdgcn_load_to_lds:
11634 case Intrinsic::amdgcn_global_load_lds: {
11635 if (!Subtarget->hasVMemToLDSLoad())
11639 unsigned Size = Op->getConstantOperandVal(4);
11644 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
11647 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
11650 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
11653 if (!Subtarget->hasLDSLoadB96_B128())
11655 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
11658 if (!Subtarget->hasLDSLoadB96_B128())
11660 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
11676 if (LHS->isDivergent())
11680 RHS.getOperand(0).getValueType() == MVT::i32) {
11683 VOffset = RHS.getOperand(0);
11687 Ops.push_back(Addr);
11695 Ops.push_back(VOffset);
11698 Ops.push_back(Op.getOperand(5));
11700 unsigned Aux = Op.getConstantOperandVal(6);
11708 MachineMemOperand *LoadMMO = M->getMemOperand();
11710 LoadPtrI.Offset = Op->getConstantOperandVal(5);
11711 MachinePointerInfo StorePtrI = LoadPtrI;
11730 case Intrinsic::amdgcn_end_cf:
11732 Op->getOperand(2), Chain),
11734 case Intrinsic::amdgcn_s_barrier_init:
11735 case Intrinsic::amdgcn_s_barrier_signal_var: {
11742 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
11743 ? AMDGPU::S_BARRIER_INIT_M0
11744 : AMDGPU::S_BARRIER_SIGNAL_M0;
11759 constexpr unsigned ShAmt = 16;
11766 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11771 case Intrinsic::amdgcn_s_wakeup_barrier: {
11772 if (!Subtarget->hasSWakeupBarrier())
11776 case Intrinsic::amdgcn_s_barrier_join: {
11785 switch (IntrinsicID) {
11788 case Intrinsic::amdgcn_s_barrier_join:
11789 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
11791 case Intrinsic::amdgcn_s_wakeup_barrier:
11792 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
11796 unsigned BarID = (BarVal >> 4) & 0x3F;
11799 Ops.push_back(Chain);
11801 switch (IntrinsicID) {
11804 case Intrinsic::amdgcn_s_barrier_join:
11805 Opc = AMDGPU::S_BARRIER_JOIN_M0;
11807 case Intrinsic::amdgcn_s_wakeup_barrier:
11808 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
11819 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11825 case Intrinsic::amdgcn_s_prefetch_data: {
11828 return Op.getOperand(0);
11831 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
11833 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
11840 Op->getVTList(), Ops, M->getMemoryVT(),
11841 M->getMemOperand());
11843 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
11844 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
11845 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
11854 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11856 return lowerImage(Op, ImageDimIntr, DAG, true);
11872 return PtrVT == MVT::i64;
11886std::pair<SDValue, SDValue>
11899 bool CheckNUW = Subtarget->hasGFX1250Insts();
11916 unsigned Overflow = ImmOffset & ~MaxImm;
11917 ImmOffset -= Overflow;
11918 if ((int32_t)Overflow < 0) {
11919 Overflow += ImmOffset;
11924 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
11943void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
11945 Align Alignment) const {
11947 SDLoc DL(CombinedOffset);
11949 uint32_t Imm = C->getZExtValue();
11950 uint32_t SOffset, ImmOffset;
11951 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11962 bool CheckNUW = Subtarget->hasGFX1250Insts();
11965 uint32_t SOffset, ImmOffset;
11968 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
11976 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11985SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
11988 return MaybePointer;
12002 SDValue NumRecords = Op->getOperand(3);
12008 if (Subtarget->has45BitNumRecordsBufferResource()) {
12027 SDValue ExtShiftedStrideVec =
12039 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12041 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12046 auto [LowHalf, HighHalf] =
12047 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12057 NumRecords, Flags);
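// With 45-bit num-records resources, the descriptor fields appear to be
// assembled as 64-bit halves: the stride and flag bits are shifted into place
// and OR'd with the record count, then the base pointer is split into two
// 32-bit words to form the resource vector.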
12069 bool IsTFE) const {
12074 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12075 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12078 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12090 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12091 : AMDGPUISD::BUFFER_LOAD_USHORT;
12093 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12107 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12111 Ops[1] = BufferStoreExt;
12112 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12113 : AMDGPUISD::BUFFER_STORE_SHORT;
12116 M->getMemOperand());
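// Sub-dword buffer accesses are widened here: loads select the UBYTE/USHORT
// (or *_TFE) variants and carry an i32 result, while f16 and bf16 store data
// is first extended so BUFFER_STORE_BYTE/SHORT see an i32 source.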
12141 DAGCombinerInfo &DCI) const {
12142 SelectionDAG &DAG = DCI.DAG;
12157 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12164 "unexpected vector extload");
12177 "unexpected fp extload");
12195 DCI.AddToWorklist(Cvt.getNode());
12200 DCI.AddToWorklist(Cvt.getNode());
12211 if (Info.isEntryFunction())
12212 return Info.getUserSGPRInfo().hasFlatScratchInit();
12220 EVT MemVT = Load->getMemoryVT();
12221 MachineMemOperand *MMO = Load->getMemOperand();
12233 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12261 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12262 "Custom lowering for non-i32 vectors hasn't been implemented.");
12265 unsigned AS = Load->getAddressSpace();
12266 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12273 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12277 !Subtarget->hasMultiDwordFlatScratchAddressing())
12287 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12290 Alignment >= Align(4) && NumElements < 32) {
12292 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12304 if (NumElements > 4)
12307 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12317 switch (Subtarget->getMaxPrivateElementSize()) {
12323 if (NumElements > 2)
12328 if (NumElements > 4)
12331 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12340 auto Flags = Load->getMemOperand()->getFlags();
12342 Load->getAlign(), Flags, &Fast) &&
12351 MemVT, *Load->getMemOperand())) {
12360 EVT VT = Op.getValueType();
12397 EVT VT = Op.getValueType();
12398 const SDNodeFlags Flags = Op->getFlags();
12400 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
12406 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12409 if (CLHS->isExactlyValue(1.0)) {
12422 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
12426 if (CLHS->isExactlyValue(-1.0)) {
12429 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12435 if (!AllowInaccurateRcp &&
12436 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
12450 EVT VT = Op.getValueType();
12451 const SDNodeFlags Flags = Op->getFlags();
12453 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
12454 if (!AllowInaccurateDiv)
12475 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
12485 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12489 return DAG.getNode(Opcode, SL, VTList,
12498 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
12508 Opcode = AMDGPUISD::FMA_W_CHAIN;
12512 return DAG.getNode(Opcode, SL, VTList,
12518 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12519 return FastLowered;
12522 EVT VT = Op.getValueType();
12529 if (VT == MVT::bf16) {
12552 unsigned FMADOpCode =
12556 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
12559 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12561 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
12562 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12572 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
12578 SDNodeFlags Flags = Op->getFlags();
12588 const APFloat K0Val(0x1p+96f);
12591 const APFloat K1Val(0x1p-32f);
12618 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
12619 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
12620 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
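// The mode word written by S_DENORM_MODE packs the single-precision denormal
// mode in its low two bits and the function's default double-precision mode
// shifted left by two, matching the layout of the MODE register's denorm
// field.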
12625 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
12626 return FastLowered;
12632 SDNodeFlags Flags = Op->getFlags();
12633 Flags.setNoFPExcept(true);
12641 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
12650 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12654 using namespace AMDGPU::Hwreg;
12655 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12659 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12660 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
12663 const bool HasDynamicDenormals =
12669 if (!PreservesDenormals) {
12674 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12677 if (HasDynamicDenormals) {
12681 SavedDenormMode = SDValue(GetReg, 0);
12687 SDNode *EnableDenorm;
12688 if (Subtarget->hasDenormModeInst()) {
12689 const SDValue EnableDenormValue =
12692 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12696 const SDValue EnableDenormValue =
12698 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12699 {EnableDenormValue, BitField, Glue});
12709 ApproxRcp, One, NegDivScale0, Flags);
12712 ApproxRcp, Fma0, Flags);
12718 NumeratorScaled, Mul, Flags);
12724 NumeratorScaled, Fma3, Flags);
12726 if (!PreservesDenormals) {
12727 SDNode *DisableDenorm;
12728 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12732 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
12734 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12738 assert(HasDynamicDenormals == (bool)SavedDenormMode);
12739 const SDValue DisableDenormValue =
12740 HasDynamicDenormals
12745 AMDGPU::S_SETREG_B32, SL, MVT::Other,
12756 {Fma4, Fma1, Fma3, Scale}, Flags);
12758 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
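// Overall shape of the f32 path: DIV_SCALE produces scaled numerator and
// denominator operands, a reciprocal estimate is refined with the chain of
// FMAs above (with denormals forced on around the refinement when needed),
// and DIV_FMAS plus DIV_FIXUP fold the scale and the special cases back into
// the final quotient.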
12762 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
12763 return FastLowered;
12771 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
12777 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
12795 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
12825 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
12827 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
12831 EVT VT = Op.getValueType();
12833 if (VT == MVT::f32)
12834 return LowerFDIV32(Op, DAG);
12836 if (VT == MVT::f64)
12837 return LowerFDIV64(Op, DAG);
12839 if (VT == MVT::f16 || VT == MVT::bf16)
12840 return LowerFDIV16(Op, DAG);
12849 EVT ResultExpVT = Op->getValueType(1);
12850 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12860 if (Subtarget->hasFractBug()) {
12878 EVT VT = Store->getMemoryVT();
12880 if (VT == MVT::i1) {
12884 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
12888 Store->getValue().getValueType().getScalarType() == MVT::i32);
12890 unsigned AS = Store->getAddressSpace();
12891 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12899 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12903 !Subtarget->hasMultiDwordFlatScratchAddressing())
12910 if (NumElements > 4)
12913 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12917 VT, *Store->getMemOperand()))
12923 switch (Subtarget->getMaxPrivateElementSize()) {
12927 if (NumElements > 2)
12931 if (NumElements > 4 ||
12932 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
12940 auto Flags = Store->getMemOperand()->getFlags();
12959 assert(!Subtarget->has16BitInsts());
12960 SDNodeFlags Flags = Op->getFlags();
12974 SDNodeFlags Flags = Op->getFlags();
12975 MVT VT = Op.getValueType().getSimpleVT();
13083 SDNodeFlags Flags = Op->getFlags();
13146 EVT VT = Op.getValueType();
13157 if (!V.getValueType().isVector())
13165 if (Subtarget->hasTrigReducedRange()) {
13167 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13172 switch (Op.getOpcode()) {
13174 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13177 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13183 return UnrollIfVec(TrigVal);
13203 EVT VT = Op.getValueType();
13211 Op->getVTList(), Ops, VT,
13220SITargetLowering::performUCharToFloatCombine(SDNode *N,
13221 DAGCombinerInfo &DCI) const {
13222 EVT VT = N->getValueType(0);
13224 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13227 SelectionDAG &DAG = DCI.DAG;
13231 EVT SrcVT = Src.getValueType();
13237 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13240 DCI.AddToWorklist(Cvt.getNode());
13243 if (ScalarVT != MVT::f32) {
13255 DAGCombinerInfo &DCI) const {
13266 SelectionDAG &DAG = DCI.DAG;
13285 for (unsigned I = 0; I != NumElts; ++I) {
13309 if (NewElts.size() == 1)
13331 for (unsigned I = 0; I != NumElts; ++I) {
13366SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
13368 DAGCombinerInfo &DCI) const {
13385 SelectionDAG &DAG = DCI.DAG;
13398 AM.BaseOffs = Offset.getSExtValue();
13403 EVT VT = N->getValueType(0);
13409 Flags.setNoUnsignedWrap(
13410 N->getFlags().hasNoUnsignedWrap() &&
13422 switch (N->getOpcode()) {
13433 DAGCombinerInfo &DCI) const {
13434 SelectionDAG &DAG = DCI.DAG;
13441 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
13442 N->getMemoryVT(), DCI);
13446 NewOps[PtrIdx] = NewPtr;
13455 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13456 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13465SDValue SITargetLowering::splitBinaryBitConstantOp(
13469 uint32_t ValLo = Lo_32(Val);
13470 uint32_t ValHi = Hi_32(Val);
13477 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
13491 if (V.getValueType() != MVT::i1)
13493 switch (V.getOpcode()) {
13498 case AMDGPUISD::FP_CLASS:
13510 return V.getResNo() == 1;
13512 unsigned IntrinsicID = V.getConstantOperandVal(0);
13513 switch (IntrinsicID) {
13514 case Intrinsic::amdgcn_is_shared:
13515 case Intrinsic::amdgcn_is_private:
13532 if (!(C & 0x000000ff))
13533 ZeroByteMask |= 0x000000ff;
13534 if (!(C & 0x0000ff00))
13535 ZeroByteMask |= 0x0000ff00;
13536 if (!(C & 0x00ff0000))
13537 ZeroByteMask |= 0x00ff0000;
13538 if (!(C & 0xff000000))
13539 ZeroByteMask |= 0xff000000;
13540 uint32_t NonZeroByteMask = ~ZeroByteMask;
13541 if ((NonZeroByteMask & C) != NonZeroByteMask)
13554 assert(V.getValueSizeInBits() == 32);
13556 if (V.getNumOperands() != 2)
13565 switch (V.getOpcode()) {
13570 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13575 return (0x03020100 & ~ConstMask) | ConstMask;
13582 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
13588 return uint32_t(0x0c0c0c0c03020100ull >> C);
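// These helpers build V_PERM_B32 selectors: each byte of the mask picks one
// byte out of the two source dwords (roughly, values 0-7 index into the
// concatenated sources), while 0x0c requests a constant zero byte, which is
// why 0x0c0c0c0c shows up as the "all zero" mask and 0x03020100 as identity.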
13595 DAGCombinerInfo &DCI) const {
13596 if (DCI.isBeforeLegalize())
13599 SelectionDAG &DAG = DCI.DAG;
13600 EVT VT = N->getValueType(0);
13605 if (VT == MVT::i64 && CRHS) {
13607 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
13611 if (CRHS && VT == MVT::i32) {
13621 unsigned Shift = CShift->getZExtValue();
13623 unsigned Offset = NB + Shift;
13624 if ((Offset & (Bits - 1)) == 0) {
13627 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
13648 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13650 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
13663 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
13668 if (X != LHS.getOperand(1))
13672 const ConstantFPSDNode *C1 =
13689 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
13695 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13698 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13706 (RHS.getOperand(0) == LHS.getOperand(0) &&
13707 LHS.getOperand(0) == LHS.getOperand(1))) {
13709 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
13710 : Mask->getZExtValue() & OrdMask;
13713 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
13731 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13734 if (LHSMask != ~0u && RHSMask != ~0u) {
13737 if (LHSMask > RHSMask) {
13744 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13745 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13748 if (!(LHSUsedLanes & RHSUsedLanes) &&
13751 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13757 uint32_t Mask = LHSMask & RHSMask;
13758 for (unsigned I = 0; I < 32; I += 8) {
13759 uint32_t ByteSel = 0xff << I;
13760 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13761 Mask &= (0x0c << I) & 0xffffffff;
13766 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
13769 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
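// When the two sides of the AND are byte-wise masked values whose used lanes
// do not overlap, the pair can be folded into a single V_PERM_B32 with the
// merged selector computed above.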
13819static const std::optional<ByteProvider<SDValue>>
13821 unsigned Depth = 0) {
13824 return std::nullopt;
13826 if (Op.getValueSizeInBits() < 8)
13827 return std::nullopt;
13829 if (Op.getValueType().isVector())
13832 switch (Op->getOpcode()) {
13845 NarrowVT = VTSign->getVT();
13848 return std::nullopt;
13851 if (SrcIndex >= NarrowByteWidth)
13852 return std::nullopt;
13860 return std::nullopt;
13862 uint64_t BitShift = ShiftOp->getZExtValue();
13864 if (BitShift % 8 != 0)
13865 return std::nullopt;
13867 SrcIndex += BitShift / 8;
13885static const std::optional<ByteProvider<SDValue>>
13887 unsigned StartingIndex = 0) {
13891 return std::nullopt;
13893 unsigned BitWidth = Op.getScalarValueSizeInBits();
13895 return std::nullopt;
13897 return std::nullopt;
13899 bool IsVec = Op.getValueType().isVector();
13900 switch (Op.getOpcode()) {
13903 return std::nullopt;
13908 return std::nullopt;
13912 return std::nullopt;
13915 if (!LHS->isConstantZero() && !RHS->isConstantZero())
13916 return std::nullopt;
13917 if (!LHS || LHS->isConstantZero())
13919 if (!RHS || RHS->isConstantZero())
13921 return std::nullopt;
13926 return std::nullopt;
13930 return std::nullopt;
13932 uint32_t BitMask = BitMaskOp->getZExtValue();
13934 uint32_t IndexMask = 0xFF << (Index * 8);
13936 if ((IndexMask & BitMask) != IndexMask) {
13939 if (IndexMask & BitMask)
13940 return std::nullopt;
13949 return std::nullopt;
13953 if (!ShiftOp || Op.getValueType().isVector())
13954 return std::nullopt;
13956 uint64_t BitsProvided = Op.getValueSizeInBits();
13957 if (BitsProvided % 8 != 0)
13958 return std::nullopt;
13960 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13962 return std::nullopt;
13964 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13965 uint64_t ByteShift = BitShift / 8;
13967 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13968 uint64_t BytesProvided = BitsProvided / 8;
13969 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13970 NewIndex %= BytesProvided;
13977 return std::nullopt;
13981 return std::nullopt;
13983 uint64_t BitShift = ShiftOp->getZExtValue();
13985 return std::nullopt;
13987 auto BitsProvided = Op.getScalarValueSizeInBits();
13988 if (BitsProvided % 8 != 0)
13989 return std::nullopt;
13991 uint64_t BytesProvided = BitsProvided / 8;
13992 uint64_t ByteShift = BitShift / 8;
13997 return BytesProvided - ByteShift > Index
14005 return std::nullopt;
14009 return std::nullopt;
14011 uint64_t BitShift = ShiftOp->getZExtValue();
14012 if (BitShift % 8 != 0)
14013 return std::nullopt;
14014 uint64_t ByteShift = BitShift / 8;
14020 return Index < ByteShift
14023 Depth + 1, StartingIndex);
14032 return std::nullopt;
14040 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14042 if (NarrowBitWidth % 8 != 0)
14043 return std::nullopt;
14044 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14046 if (Index >= NarrowByteWidth)
14048 ? std::optional<ByteProvider<SDValue>>(
14056 return std::nullopt;
14060 if (NarrowByteWidth >= Index) {
14065 return std::nullopt;
14072 return std::nullopt;
14078 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14079 if (NarrowBitWidth % 8 != 0)
14080 return std::nullopt;
14081 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14086 if (Index >= NarrowByteWidth) {
14088 ? std::optional<ByteProvider<SDValue>>(
14093 if (NarrowByteWidth > Index) {
14097 return std::nullopt;
14102 return std::nullopt;
14105 Depth + 1, StartingIndex);
14111 return std::nullopt;
14112 auto VecIdx = IdxOp->getZExtValue();
14113 auto ScalarSize = Op.getScalarValueSizeInBits();
14114 if (ScalarSize < 32)
14115 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14117 StartingIndex, Index);
14120 case AMDGPUISD::PERM: {
14122 return std::nullopt;
14126 return std::nullopt;
14129 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14130 if (IdxMask > 0x07 && IdxMask != 0x0c)
14131 return std::nullopt;
14133 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14134 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14136 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14142 return std::nullopt;
14157 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14164 auto MemVT = L->getMemoryVT();
14167 return L->getMemoryVT().getSizeInBits() == 16;
14177 int Low8 = Mask & 0xff;
14178 int Hi8 = (Mask & 0xff00) >> 8;
14180 assert(Low8 < 8 && Hi8 < 8);
14182 bool IsConsecutive = (Hi8 - Low8 == 1);
14187 bool Is16Aligned = !(Low8 % 2);
14189 return IsConsecutive && Is16Aligned;
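// A byte pair can be treated as a single 16-bit source only if the two
// selected bytes are consecutive and start on an even byte, i.e. they form an
// aligned half of the 32-bit source dword.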
14197 int Low16 = PermMask & 0xffff;
14198 int Hi16 = (PermMask & 0xffff0000) >> 16;
14208 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14210 if (!OtherOpIs16Bit)
14218 unsigned DWordOffset) {
14223 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14228 if (Src.getValueType().isVector()) {
14229 auto ScalarTySize = Src.getScalarValueSizeInBits();
14230 auto ScalarTy = Src.getValueType().getScalarType();
14231 if (ScalarTySize == 32) {
14235 if (ScalarTySize > 32) {
14238 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14239 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14246 assert(ScalarTySize < 32);
14247 auto NumElements = TypeSize / ScalarTySize;
14248 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14249 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14250 auto NumElementsIn32 = 32 / ScalarTySize;
14251 auto NumAvailElements = DWordOffset < Trunc32Elements
14253 : NumElements - NormalizedTrunc;
14266 auto ShiftVal = 32 * DWordOffset;
14274 [[maybe_unused]] EVT VT = N->getValueType(0);
14279 for (int i = 0; i < 4; i++) {
14281 std::optional<ByteProvider<SDValue>> P =
14284 if (!P || P->isConstantZero())
14289 if (PermNodes.size() != 4)
14292 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14293 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14295 for (size_t i = 0; i < PermNodes.size(); i++) {
14296 auto PermOp = PermNodes[i];
14299 int SrcByteAdjust = 4;
14303 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14304 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14306 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14307 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14311 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14312 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14315 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14317 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14320 SDValue Op = *PermNodes[FirstSrc.first].Src;
14322 assert(Op.getValueSizeInBits() == 32);
14326 int Low16 = PermMask & 0xffff;
14327 int Hi16 = (PermMask & 0xffff0000) >> 16;
14329 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14330 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14333 if (WellFormedLow && WellFormedHi)
14337 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
14346 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
14347 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
14352 assert(Op.getValueType().isByteSized() &&
14363 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
14370 DAGCombinerInfo &DCI) const {
14371 SelectionDAG &DAG = DCI.DAG;
14375 EVT VT = N->getValueType(0);
14376 if (VT == MVT::i1) {
14378 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14379 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14381 if (Src != RHS.getOperand(0))
14386 if (!CLHS || !CRHS)
14390 static const uint32_t MaxMask = 0x3ff;
14395 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
14404 LHS.getOpcode() == AMDGPUISD::PERM &&
14410 Sel |= LHS.getConstantOperandVal(2);
14412 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14419 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14423 auto usesCombinedOperand = [](SDNode *OrUse) {
14426 !OrUse->getValueType(0).isVector())
14430 for (auto *VUser : OrUse->users()) {
14431 if (!VUser->getValueType(0).isVector())
14438 if (VUser->getOpcode() == VectorwiseOp)
14444 if (!any_of(N->users(), usesCombinedOperand))
14450 if (LHSMask != ~0u && RHSMask != ~0u) {
14453 if (LHSMask > RHSMask) {
14460 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14461 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14464 if (!(LHSUsedLanes & RHSUsedLanes) &&
14467 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14469 LHSMask &= ~RHSUsedLanes;
14470 RHSMask &= ~LHSUsedLanes;
14472 LHSMask |= LHSUsedLanes & 0x04040404;
14474 uint32_t Sel = LHSMask | RHSMask;
14477 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14482 if (LHSMask == ~0u || RHSMask == ~0u) {
14523 return IdentitySrc;
14529 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14544 if (SrcVT == MVT::i32) {
14549 DCI.AddToWorklist(LowOr.getNode());
14550 DCI.AddToWorklist(HiBits.getNode());
14561 N->getOperand(0), CRHS))
14569 DAGCombinerInfo &DCI) const {
14570 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
14577 SelectionDAG &DAG = DCI.DAG;
14579 EVT VT = N->getValueType(0);
14580 if (CRHS && VT == MVT::i64) {
14582 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
14589 unsigned Opc = LHS.getOpcode();
14619 LHS->getOperand(0), FNegLHS, FNegRHS);
14628SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
14629 DAGCombinerInfo &DCI) const {
14630 if (!Subtarget->has16BitInsts() ||
14634 EVT VT = N->getValueType(0);
14635 if (VT != MVT::i32)
14639 if (Src.getValueType() != MVT::i16)
14642 if (!Src->hasOneUse())
14649 std::optional<ByteProvider<SDValue>> BP0 =
14651 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
14655 std::optional<ByteProvider<SDValue>> BP1 =
14657 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
14665 SelectionDAG &DAG = DCI.DAG;
14667 uint32_t PermMask = 0x0c0c0c0c;
14670 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
14675 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
14678 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
14683SITargetLowering::performSignExtendInRegCombine(SDNode *N,
14684 DAGCombinerInfo &DCI) const {
14690 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14691 VTSign->getVT() == MVT::i8) ||
14692 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14693 VTSign->getVT() == MVT::i16))) {
14694 assert(Subtarget->hasScalarSubwordLoads() &&
14695 "s_buffer_load_{u8, i8} are supported "
14696 "in GFX12 (or newer) architectures.");
14697 EVT VT = Src.getValueType();
14698 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14699 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14700 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14702 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14709 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14710 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14714 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14715 VTSign->getVT() == MVT::i8) ||
14716 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14717 VTSign->getVT() == MVT::i16)) &&
14726 Src.getOperand(6), Src.getOperand(7)};
14729 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14730 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14731 ? AMDGPUISD::BUFFER_LOAD_BYTE
14732 : AMDGPUISD::BUFFER_LOAD_SHORT;
14733 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14734 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
14735 return DCI.DAG.getMergeValues(
14736 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
14742 DAGCombinerInfo &DCI) const {
14743 SelectionDAG &DAG = DCI.DAG;
14750 if (N->getOperand(0).isUndef())
14757 DAGCombinerInfo &DCI) const {
14758 EVT VT = N->getValueType(0);
14768 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
14775 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
14783 unsigned MaxDepth) const {
14784 unsigned Opcode = Op.getOpcode();
14789 const auto &F = CFP->getValueAPF();
14790 if (F.isNaN() && F.isSignaling())
14792 if (!F.isDenormal())
14824 case AMDGPUISD::FMUL_LEGACY:
14825 case AMDGPUISD::FMAD_FTZ:
14826 case AMDGPUISD::RCP:
14827 case AMDGPUISD::RSQ:
14828 case AMDGPUISD::RSQ_CLAMP:
14829 case AMDGPUISD::RCP_LEGACY:
14830 case AMDGPUISD::RCP_IFLAG:
14831 case AMDGPUISD::LOG:
14832 case AMDGPUISD::EXP:
14833 case AMDGPUISD::DIV_SCALE:
14834 case AMDGPUISD::DIV_FMAS:
14835 case AMDGPUISD::DIV_FIXUP:
14836 case AMDGPUISD::FRACT:
14837 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14838 case AMDGPUISD::CVT_F32_UBYTE0:
14839 case AMDGPUISD::CVT_F32_UBYTE1:
14840 case AMDGPUISD::CVT_F32_UBYTE2:
14841 case AMDGPUISD::CVT_F32_UBYTE3:
14842 case AMDGPUISD::FP_TO_FP16:
14843 case AMDGPUISD::SIN_HW:
14844 case AMDGPUISD::COS_HW:
14855 if (Op.getValueType() == MVT::i32) {
14861 if (RHS->getZExtValue() == 0xffff0000) {
14871 return Op.getValueType().getScalarType() != MVT::f16;
14881 case AMDGPUISD::CLAMP:
14882 case AMDGPUISD::FMED3:
14883 case AMDGPUISD::FMAX3:
14884 case AMDGPUISD::FMIN3:
14885 case AMDGPUISD::FMAXIMUM3:
14886 case AMDGPUISD::FMINIMUM3: {
14892 if (Subtarget->supportsMinMaxDenormModes() ||
14902 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
14914 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
14941 if (Op.getValueType() == MVT::i16) {
14952 unsigned IntrinsicID = Op.getConstantOperandVal(0);
14954 switch (IntrinsicID) {
14955 case Intrinsic::amdgcn_cvt_pkrtz:
14956 case Intrinsic::amdgcn_cubeid:
14957 case Intrinsic::amdgcn_frexp_mant:
14958 case Intrinsic::amdgcn_fdot2:
14959 case Intrinsic::amdgcn_rcp:
14960 case Intrinsic::amdgcn_rsq:
14961 case Intrinsic::amdgcn_rsq_clamp:
14962 case Intrinsic::amdgcn_rcp_legacy:
14963 case Intrinsic::amdgcn_rsq_legacy:
14964 case Intrinsic::amdgcn_trig_preop:
14965 case Intrinsic::amdgcn_tanh:
14966 case Intrinsic::amdgcn_log:
14967 case Intrinsic::amdgcn_exp2:
14968 case Intrinsic::amdgcn_sqrt:
14986 unsigned MaxDepth) const {
14989 unsigned Opcode = MI->getOpcode();
14991 if (Opcode == AMDGPU::G_FCANONICALIZE)
14994 std::optional<FPValueAndVReg> FCR;
14997 if (FCR->Value.isSignaling())
14999 if (!FCR->Value.isDenormal())
15010 case AMDGPU::G_FADD:
15011 case AMDGPU::G_FSUB:
15012 case AMDGPU::G_FMUL:
15013 case AMDGPU::G_FCEIL:
15014 case AMDGPU::G_FFLOOR:
15015 case AMDGPU::G_FRINT:
15016 case AMDGPU::G_FNEARBYINT:
15017 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15018 case AMDGPU::G_INTRINSIC_TRUNC:
15019 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15020 case AMDGPU::G_FMA:
15021 case AMDGPU::G_FMAD:
15022 case AMDGPU::G_FSQRT:
15023 case AMDGPU::G_FDIV:
15024 case AMDGPU::G_FREM:
15025 case AMDGPU::G_FPOW:
15026 case AMDGPU::G_FPEXT:
15027 case AMDGPU::G_FLOG:
15028 case AMDGPU::G_FLOG2:
15029 case AMDGPU::G_FLOG10:
15030 case AMDGPU::G_FPTRUNC:
15031 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15032 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15033 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15034 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15035 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15037 case AMDGPU::G_FNEG:
15038 case AMDGPU::G_FABS:
15039 case AMDGPU::G_FCOPYSIGN:
15041 case AMDGPU::G_FMINNUM:
15042 case AMDGPU::G_FMAXNUM:
15043 case AMDGPU::G_FMINNUM_IEEE:
15044 case AMDGPU::G_FMAXNUM_IEEE:
15045 case AMDGPU::G_FMINIMUM:
15046 case AMDGPU::G_FMAXIMUM:
15047 case AMDGPU::G_FMINIMUMNUM:
15048 case AMDGPU::G_FMAXIMUMNUM: {
15049 if (Subtarget->supportsMinMaxDenormModes() ||
15056 case AMDGPU::G_BUILD_VECTOR:
15061 case AMDGPU::G_INTRINSIC:
15062 case AMDGPU::G_INTRINSIC_CONVERGENT:
15064 case Intrinsic::amdgcn_fmul_legacy:
15065 case Intrinsic::amdgcn_fmad_ftz:
15066 case Intrinsic::amdgcn_sqrt:
15067 case Intrinsic::amdgcn_fmed3:
15068 case Intrinsic::amdgcn_sin:
15069 case Intrinsic::amdgcn_cos:
15070 case Intrinsic::amdgcn_log:
15071 case Intrinsic::amdgcn_exp2:
15072 case Intrinsic::amdgcn_log_clamp:
15073 case Intrinsic::amdgcn_rcp:
15074 case Intrinsic::amdgcn_rcp_legacy:
15075 case Intrinsic::amdgcn_rsq:
15076 case Intrinsic::amdgcn_rsq_clamp:
15077 case Intrinsic::amdgcn_rsq_legacy:
15078 case Intrinsic::amdgcn_div_scale:
15079 case Intrinsic::amdgcn_div_fmas:
15080 case Intrinsic::amdgcn_div_fixup:
15081 case Intrinsic::amdgcn_fract:
15082 case Intrinsic::amdgcn_cvt_pkrtz:
15083 case Intrinsic::amdgcn_cubeid:
15084 case Intrinsic::amdgcn_cubema:
15085 case Intrinsic::amdgcn_cubesc:
15086 case Intrinsic::amdgcn_cubetc:
15087 case Intrinsic::amdgcn_frexp_mant:
15088 case Intrinsic::amdgcn_fdot2:
15089 case Intrinsic::amdgcn_trig_preop:
15090 case Intrinsic::amdgcn_tanh:
15109 if (C.isDenormal()) {
15123 if (C.isSignaling()) {
15146SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15147 DAGCombinerInfo &DCI) const {
15148 SelectionDAG &DAG = DCI.DAG;
15150 EVT VT = N->getValueType(0);
15159 EVT VT = N->getValueType(0);
15160 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15176 EVT EltVT = Lo.getValueType();
15179 for (unsigned I = 0; I != 2; ++I) {
15183 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15184 } else if (Op.isUndef()) {
15220 return AMDGPUISD::FMAX3;
15222 return AMDGPUISD::FMAXIMUM3;
15224 return AMDGPUISD::SMAX3;
15226 return AMDGPUISD::UMAX3;
15230 return AMDGPUISD::FMIN3;
15232 return AMDGPUISD::FMINIMUM3;
15234 return AMDGPUISD::SMIN3;
15236 return AMDGPUISD::UMIN3;
15257 if (!MinK || !MaxK)
15269 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15270 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15271 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15330 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15336 if (Info->getMode().DX10Clamp) {
15345 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15377 case AMDGPUISD::FMIN_LEGACY:
15378 case AMDGPUISD::FMAX_LEGACY:
15379 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
15380 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
15383 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
15384 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
15385 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
15390 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
15399 DAGCombinerInfo &DCI) const {
15400 SelectionDAG &DAG = DCI.DAG;
15432 if (SDValue Med3 = performIntMed3ImmCombine(
15437 if (SDValue Med3 = performIntMed3ImmCombine(
15443 if (SDValue Med3 = performIntMed3ImmCombine(
15448 if (SDValue Med3 = performIntMed3ImmCombine(
15461 (Opc == AMDGPUISD::FMIN_LEGACY &&
15462 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15463 (VT == MVT::f32 || VT == MVT::f64 ||
15464 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15465 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15466 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15467 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15469 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
15476 const SDNodeFlags Flags = N->getFlags();
15478 !Subtarget->hasIEEEMinimumMaximumInsts() &&
15482 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
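// The min/max combines above fall into two buckets: integer min(max(x, K0),
// K1) patterns become SMED3/UMED3 when the constants are ordered, and the
// analogous FP patterns become FMED3 subject to the type checks listed, plus
// a fallback that appears to swap the node for NewOpc when the subtarget
// lacks dedicated IEEE minimum/maximum instructions.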
15492 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15493 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15502 DAGCombinerInfo &DCI) const {
15503 EVT VT = N->getValueType(0);
15507 SelectionDAG &DAG = DCI.DAG;
15518 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15522 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
15526 if (Info->getMode().DX10Clamp) {
15539 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15546 DAGCombinerInfo &DCI) const {
15550 return DCI.DAG.getUNDEF(N->getValueType(0));
15558 bool IsDivergentIdx,
15563 unsigned VecSize = EltSize * NumElem;
15566 if (VecSize <= 64 && EltSize < 32)
15575 if (IsDivergentIdx)
15579 unsigned NumInsts = NumElem +
15580 ((EltSize + 31) / 32) * NumElem ;
15584 if (Subtarget->useVGPRIndexMode())
15585 return NumInsts <= 16;
15589 if (Subtarget->hasMovrel())
15590 return NumInsts <= 15;
15596 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
15611SITargetLowering::performExtractVectorEltCombine(SDNode *N,
15612 DAGCombinerInfo &DCI) const {
15618 EVT ResVT = N->getValueType(0);
15642 if (!C || C->getZExtValue() != 0x1f)
15658 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15686 DCI.AddToWorklist(Elt0.getNode());
15687 DCI.AddToWorklist(Elt1.getNode());
15709 if (!DCI.isBeforeLegalize())
15717 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15720 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15721 unsigned EltIdx = BitIndex / 32;
15722 unsigned LeftoverBitIdx = BitIndex % 32;
15726 DCI.AddToWorklist(Cast.getNode());
15730 DCI.AddToWorklist(Elt.getNode());
15733 DCI.AddToWorklist(Srl.getNode());
15737 DCI.AddToWorklist(Trunc.getNode());
15739 if (VecEltVT == ResVT) {
15751SITargetLowering::performInsertVectorEltCombine(SDNode *N,
15752 DAGCombinerInfo &DCI) const {
15763 SelectionDAG &DAG = DCI.DAG;
15783 Src.getOperand(0).getValueType() == MVT::f16) {
15784 return Src.getOperand(0);
15788 APFloat Val = CFP->getValueAPF();
15789 bool LosesInfo = true;
15799 DAGCombinerInfo &DCI) const {
15800 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15801 "combine only useful on gfx8");
15803 SDValue TruncSrc = N->getOperand(0);
15804 EVT VT = N->getValueType(0);
15805 if (VT != MVT::f16)
15808 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
15812 SelectionDAG &DAG = DCI.DAG;
15843unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
15845 const SDNode *N1) const {
15850 if (((VT == MVT::f32 &&
15852 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15872 EVT VT = N->getValueType(0);
15873 if (VT != MVT::i32 && VT != MVT::i64)
15879 unsigned Opc = N->getOpcode();
15934 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
15953 DAGCombinerInfo &DCI) const {
15956 SelectionDAG &DAG = DCI.DAG;
15957 EVT VT = N->getValueType(0);
15967 if (!N->isDivergent() && Subtarget->hasSMulHi())
15971 if (NumBits <= 32 || NumBits > 64)
15982 if (!Subtarget->hasFullRate64Ops()) {
15983 unsigned NumUsers = 0;
15984 for (SDNode *User : LHS->users()) {
15987 if (!User->isAnyAdd())
16011 bool MulSignedLo = false;
16012 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16021 if (VT != MVT::i64) {
16044 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16046 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16047 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16049 if (!MulLHSUnsigned32) {
16056 if (!MulRHSUnsigned32) {
16067 if (VT != MVT::i64)
16073SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16074 DAGCombinerInfo &DCI) const {
16084 SelectionDAG &DAG = DCI.DAG;
16099 unsigned Opcode = N->getOpcode();
16103 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16114 static std::optional<ByteProvider<SDValue>>
16117   if (!Byte0 || Byte0->isConstantZero()) {
16118     return std::nullopt;
16121   if (Byte1 && !Byte1->isConstantZero()) {
16122     return std::nullopt;
16128   unsigned FirstCs = First & 0x0c0c0c0c;
16129   unsigned SecondCs = Second & 0x0c0c0c0c;
16130   unsigned FirstNoCs = First & ~0x0c0c0c0c;
16131   unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16133   assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16134   assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16135   assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16136   assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16138   return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
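// Worked example for the mask merge above. In these v_perm-style selects a
// selector byte of 0x0c means "constant zero"; the asserts check that every
// byte lane has the zero selector in at least one of the two masks, so the
// merge keeps the single real selector per lane (values are illustrative):
//   First  = 0x0c0c0100, Second = 0x03020c0c
//   FirstNoCs | SecondNoCs = 0x03020100, FirstCs & SecondCs = 0
//   merged                 = 0x03020100
static unsigned mergePermMasks(unsigned First, unsigned Second) {
  unsigned FirstCs = First & 0x0c0c0c0c, SecondCs = Second & 0x0c0c0c0c;
  return ((First & ~0x0c0c0c0cu) | (Second & ~0x0c0c0c0cu)) |
         (FirstCs & SecondCs);
}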
16162   for (int BPI = 0; BPI < 2; BPI++) {
16165       BPP = {Src1, Src0};
16167     unsigned ZeroMask = 0x0c0c0c0c;
16168     unsigned FMask = 0xFF << (8 * (3 - Step));
16170     unsigned FirstMask =
16171         (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16172     unsigned SecondMask =
16173         (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16177     int FirstGroup = -1;
16178     for (int I = 0; I < 2; I++) {
16180       auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16181         return IterElt.SrcOp == *BPP.first.Src &&
16182                (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16186       if (Match != Srcs.end()) {
16187         Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16192     if (FirstGroup != -1) {
16194       auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16195         return IterElt.SrcOp == *BPP.second.Src &&
16196                (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16199       if (Match != Srcs.end()) {
16200         Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16202         Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16210   unsigned ZeroMask = 0x0c0c0c0c;
16211   unsigned FMask = 0xFF << (8 * (3 - Step));
16215       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16219       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
16228   if (Srcs.size() == 1) {
16229     auto *Elt = Srcs.begin();
16233     if (Elt->PermMask == 0x3020100)
16236     return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16240   auto *FirstElt = Srcs.begin();
16241   auto *SecondElt = std::next(FirstElt);
16248     auto FirstMask = FirstElt->PermMask;
16249     auto SecondMask = SecondElt->PermMask;
16251     unsigned FirstCs = FirstMask & 0x0c0c0c0c;
16252     unsigned FirstPlusFour = FirstMask | 0x04040404;
16255     FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
16267     FirstElt = std::next(SecondElt);
16268     if (FirstElt == Srcs.end())
16271     SecondElt = std::next(FirstElt);
16274     if (SecondElt == Srcs.end()) {
16279           DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
16280                       DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
16286   return Perms.size() == 2
16292   for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
16293     EntryMask = EntryMask >> ((4 - ChainLength) * 8);
16294     auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
16295     EntryMask += ZeroMask;
16300   auto Opcode = Op.getOpcode();
16302   return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
16303           Opcode == AMDGPUISD::MUL_I24);
16306 static std::optional<bool>
16317 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
16320 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
16322 assert(!(S0IsUnsigned && S0IsSigned));
16323 assert(!(S1IsUnsigned && S1IsSigned));
16331 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
16337 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
16338 return std::nullopt;
16350 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
16351 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
16356 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
16362 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
16363 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
16364 return std::nullopt;
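// Sketch of the signedness check used to pick sdot4 vs. udot4 above (an
// illustration, not the in-tree helper; the real code also inspects sext/zext
// sources before giving up): an operand counts as signed when its known bits
// prove a leading one, and unsigned when they prove a leading zero.
#include "llvm/Support/KnownBits.h"
#include <optional>
static std::optional<bool> pickDotSignedness(const llvm::KnownBits &K0,
                                             const llvm::KnownBits &K1) {
  bool S0Signed = K0.countMinLeadingOnes() > 0;
  bool S0Unsigned = K0.countMinLeadingZeros() > 0;
  bool S1Signed = K1.countMinLeadingOnes() > 0;
  bool S1Unsigned = K1.countMinLeadingZeros() > 0;
  if ((S0Unsigned && S1Unsigned) || (S0Signed && S1Signed))
    return S0Signed;     // both operands agree on a signedness
  if ((S0Unsigned && S1Signed) || (S0Signed && S1Unsigned))
    return std::nullopt; // provably mixed: no single dot4 form fits
  return std::nullopt;   // no proof either way: this sketch just gives up
}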
16370                                            DAGCombinerInfo &DCI) const {
16371   SelectionDAG &DAG = DCI.DAG;
16372   EVT VT = N->getValueType(0);
16378   if (Subtarget->hasMad64_32()) {
16379     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16384   if (SDValue V = reassociateScalarOps(N, DAG)) {
16388   if (VT == MVT::i64) {
16389     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16394       (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16396     std::optional<bool> IsSigned;
16402     int ChainLength = 0;
16403     for (int I = 0; I < 4; I++) {
16407       auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16410       auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16415           TempNode->getOperand(MulIdx), *Src0, *Src1,
16416           TempNode->getOperand(MulIdx)->getOperand(0),
16417           TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16421         IsSigned = *IterIsSigned;
16422       if (*IterIsSigned != *IsSigned)
16425       auto AddIdx = 1 - MulIdx;
16428       if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
16429         Src2s.push_back(TempNode->getOperand(AddIdx));
16439             TempNode->getOperand(AddIdx), *Src0, *Src1,
16440             TempNode->getOperand(AddIdx)->getOperand(0),
16441             TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16445         if (*IterIsSigned != *IsSigned)
16449         ChainLength = I + 2;
16453       TempNode = TempNode->getOperand(AddIdx);
16455       ChainLength = I + 1;
16456       if (TempNode->getNumOperands() < 2)
16458       LHS = TempNode->getOperand(0);
16459       RHS = TempNode->getOperand(1);
16462     if (ChainLength < 2)
16468     if (ChainLength < 4) {
16478     bool UseOriginalSrc = false;
16479     if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
16480         Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
16481         Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
16482         Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
16483       SmallVector<unsigned, 4> SrcBytes;
16484       auto Src0Mask = Src0s.begin()->PermMask;
16485       SrcBytes.push_back(Src0Mask & 0xFF000000);
16486       bool UniqueEntries = true;
16487       for (auto I = 1; I < 4; I++) {
16488         auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
16491           UniqueEntries = false;
16497       if (UniqueEntries) {
16498         UseOriginalSrc = true;
16500         auto *FirstElt = Src0s.begin();
16504         auto *SecondElt = Src1s.begin();
16506                                             SecondElt->DWordOffset);
16515     if (!UseOriginalSrc) {
16522         DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16525                                 : Intrinsic::amdgcn_udot4,
16535   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16540   unsigned Opc = LHS.getOpcode();
16552   auto Cond = RHS.getOperand(0);
16557   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
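// Reference semantics for the dot4 rewrite above (illustrative): an add chain
// of four byte-sized products plus an accumulator, where the four a-bytes come
// from one packed dword and the four b-bytes from another, collapses into a
// single llvm.amdgcn.udot4 (or sdot4 for the signed form):
#include <cstdint>
static uint32_t udot4_reference(uint32_t A, uint32_t B, uint32_t Acc) {
  uint32_t Sum = Acc;
  for (int I = 0; I < 4; ++I)
    Sum += ((A >> (8 * I)) & 0xFF) * ((B >> (8 * I)) & 0xFF);
  return Sum;
}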
16574                                               DAGCombinerInfo &DCI) const {
16575   SelectionDAG &DAG = DCI.DAG;
16577   EVT VT = N->getValueType(0);
16590     SDNodeFlags ShlFlags = N1->getFlags();
16594     SDNodeFlags NewShlFlags =
16599     DCI.AddToWorklist(Inner.getNode());
16606   if (Subtarget->hasMad64_32()) {
16607     if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16616   if (VT == MVT::i64) {
16617     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16630   if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16631       Y->isDivergent() != Z->isDivergent()) {
16640     if (Y->isDivergent())
16643     SDNodeFlags ReassocFlags =
16646     DCI.AddToWorklist(UniformInner.getNode());
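// A minimal sketch (assumed names, not the in-tree code) of the reassociation
// above: keep the two uniform operands together so the inner add can be
// selected to a scalar instruction, and add the divergent operand last.
#include "llvm/CodeGen/SelectionDAG.h"
static llvm::SDValue reassocForUniformity(llvm::SelectionDAG &DAG,
                                          const llvm::SDLoc &SL, llvm::EVT VT,
                                          llvm::SDValue X /*uniform*/,
                                          llvm::SDValue Y, llvm::SDValue Z) {
  llvm::SDValue Divergent = Y->isDivergent() ? Y : Z;
  llvm::SDValue Uniform = Y->isDivergent() ? Z : Y;
  llvm::SDValue Inner =
      DAG.getNode(llvm::ISD::ADD, SL, VT, X, Uniform);            // SALU part
  return DAG.getNode(llvm::ISD::ADD, SL, VT, Inner, Divergent);   // VALU add
}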
16654                                           DAGCombinerInfo &DCI) const {
16655   SelectionDAG &DAG = DCI.DAG;
16656   EVT VT = N->getValueType(0);
16658   if (VT == MVT::i64) {
16659     if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16663   if (VT != MVT::i32)
16672   unsigned Opc = RHS.getOpcode();
16679   auto Cond = RHS.getOperand(0);
16684   SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16702 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16703                                                  DAGCombinerInfo &DCI) const {
16705   if (N->getValueType(0) != MVT::i32)
16711   SelectionDAG &DAG = DCI.DAG;
16716   unsigned LHSOpc = LHS.getOpcode();
16717   unsigned Opc = N->getOpcode();
16721   return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16727                                            DAGCombinerInfo &DCI) const {
16731   SelectionDAG &DAG = DCI.DAG;
16732   EVT VT = N->getValueType(0);
16744     if (A == LHS.getOperand(1)) {
16745       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16746       if (FusedOp != 0) {
16748         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16756     if (A == RHS.getOperand(1)) {
16757       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16758       if (FusedOp != 0) {
16760         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
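// Arithmetic shape of the fused-add rewrite above (illustration only): when
// one fadd operand is itself (a + a), the node becomes the fused opcode
// returned by getFusedOpcode (FMA or FMAD) with a constant 2.0, which is why
// the calls above pass (A, Two, RHS/LHS):
//     (a + a) + b  ==>  fma(a, 2.0, b)
static float fusedTwoTimesPlus(float A, float B) {
  return A * 2.0f + B; // a single v_fma_f32 / v_mad_f32 on the target
}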
16769                                            DAGCombinerInfo &DCI) const {
16773   SelectionDAG &DAG = DCI.DAG;
16775   EVT VT = N->getValueType(0);
16788     if (A == LHS.getOperand(1)) {
16789       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16790       if (FusedOp != 0) {
16794         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16803     if (A == RHS.getOperand(1)) {
16804       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16805       if (FusedOp != 0) {
16807         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16816                                            DAGCombinerInfo &DCI) const {
16817   SelectionDAG &DAG = DCI.DAG;
16819   EVT VT = N->getValueType(0);
16828   SDNodeFlags Flags = N->getFlags();
16829   SDNodeFlags RHSFlags = RHS->getFlags();
16835   bool IsNegative = false;
16836   if (CLHS->isExactlyValue(1.0) ||
16837       (IsNegative = CLHS->isExactlyValue(-1.0))) {
16843           DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
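// Shape of the fdiv rewrite above, shown as plain arithmetic (illustration
// only; the real combine also checks the fast-math flags gathered above):
//      1.0 / sqrt(x)  ==>   rsq(x)
//     -1.0 / sqrt(x)  ==>  -rsq(x)   (the IsNegative path)
#include <cmath>
static float rsqReference(float X) {
  return 1.0f / std::sqrt(X); // what AMDGPUISD::RSQ computes in one instruction
}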
16853                                            DAGCombinerInfo &DCI) const {
16854   SelectionDAG &DAG = DCI.DAG;
16855   EVT VT = N->getValueType(0);
16859   if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16860       (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16875   if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16880   const ConstantFPSDNode *FalseNode =
16890   if (ScalarVT == MVT::f32 &&
16896   if (TrueNodeExpVal == INT_MIN)
16899   if (FalseNodeExpVal == INT_MIN)
16919                                           DAGCombinerInfo &DCI) const {
16920   SelectionDAG &DAG = DCI.DAG;
16921   EVT VT = N->getValueType(0);
16924   if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16942       (N->getFlags().hasAllowContract() &&
16943        FMA->getFlags().hasAllowContract())) {
16977   if (Vec1 == Vec2 || Vec3 == Vec4)
16983   if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16984     return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
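// Reference semantics for the FDOT2 contraction above (illustrative): a pair
// of fma ops over the two halves of two v2f16 vectors plus an f32 accumulator
// becomes one v_dot2_f32_f16, i.e. A0*B0 + A1*B1 + Acc evaluated with fused
// multiplies:
static float fdot2_reference(const float A[2], const float B[2], float Acc) {
  return A[0] * B[0] + (A[1] * B[1] + Acc);
}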
16992                                            DAGCombinerInfo &DCI) const {
16993   SelectionDAG &DAG = DCI.DAG;
16998   EVT VT = LHS.getValueType();
17027       return LHS.getOperand(0);
17041     const APInt &CT = LHS.getConstantOperandAPInt(1);
17042     const APInt &CF = LHS.getConstantOperandAPInt(2);
17047         return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
17050         return LHS.getOperand(0);
17083         DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
17088         {Op0Hi, Op1Hi, CarryInHi});
17098     DCI.CombineTo(LHS.getNode(), Result);
17102   if (VT != MVT::f32 && VT != MVT::f64 &&
17103       (!Subtarget->has16BitInsts() || VT != MVT::f16))
17118   const unsigned IsInfMask =
17120   const unsigned IsFiniteMask =
17125   return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
17134 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
17135                                               DAGCombinerInfo &DCI) const {
17136   SelectionDAG &DAG = DCI.DAG;
17138   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
17157     unsigned ShiftOffset = 8 * Offset;
17159       ShiftOffset -= C->getZExtValue();
17161       ShiftOffset += C->getZExtValue();
17163     if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
17164       return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
17165                          MVT::f32, Shifted);
17176   DCI.AddToWorklist(N);
17183   return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
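// Worked example of the byte-offset folding above (illustrative): the node
// cvt_f32_ubyte1(srl x, 8) reads byte 1 of (x >> 8), which is byte 2 of x, so
// it is rewritten to cvt_f32_ubyte2(x) and the shift disappears:
//   Offset      = 1                (from CVT_F32_UBYTE1)
//   ShiftOffset = 8 * 1 + 8 = 16   (the srl case adds the shift amount)
//   16 < 32 and 16 % 8 == 0, so NewOp = CVT_F32_UBYTE0 + 16 / 8 = CVT_F32_UBYTE2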
17189                                             DAGCombinerInfo &DCI) const {
17194   const MachineFunction &MF = DCI.DAG.getMachineFunction();
17198       (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
17199     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
17202   APFloat One(F.getSemantics(), "1.0");
17204     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
17210                                              DAGCombinerInfo &DCI) const {
17231   bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
17232   bool isInteger = LHS.getValueType().isInteger();
17235   if (!isFloatingPoint && !isInteger)
17240   if (!isEquality && !isNonEquality)
17257   if (isFloatingPoint) {
17259     if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
17270   if (!(isEquality && TrueVal == ConstVal) &&
17271       !(isNonEquality && FalseVal == ConstVal))
17278                      SelectLHS, SelectRHS);
17283   switch (N->getOpcode()) {
17299     if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
17309   switch (N->getOpcode()) {
17311     return performAddCombine(N, DCI);
17313     return performPtrAddCombine(N, DCI);
17315     return performSubCombine(N, DCI);
17318     return performAddCarrySubCarryCombine(N, DCI);
17320     return performFAddCombine(N, DCI);
17322     return performFSubCombine(N, DCI);
17324     return performFDivCombine(N, DCI);
17326     return performFMulCombine(N, DCI);
17328     return performSetCCCombine(N, DCI);
17330     if (auto Res = performSelectCombine(N, DCI))
17345   case AMDGPUISD::FMIN_LEGACY:
17346   case AMDGPUISD::FMAX_LEGACY:
17347     return performMinMaxCombine(N, DCI);
17349     return performFMACombine(N, DCI);
17351     return performAndCombine(N, DCI);
17353     return performOrCombine(N, DCI);
17356     if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17357         TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17363     return performXorCombine(N, DCI);
17366     return performZeroOrAnyExtendCombine(N, DCI);
17368     return performSignExtendInRegCombine(N, DCI);
17369   case AMDGPUISD::FP_CLASS:
17370     return performClassCombine(N, DCI);
17372     return performFCanonicalizeCombine(N, DCI);
17373   case AMDGPUISD::RCP:
17374     return performRcpCombine(N, DCI);
17376   case AMDGPUISD::FRACT:
17377   case AMDGPUISD::RSQ:
17378   case AMDGPUISD::RCP_LEGACY:
17379   case AMDGPUISD::RCP_IFLAG:
17380   case AMDGPUISD::RSQ_CLAMP: {
17389     return performUCharToFloatCombine(N, DCI);
17391     return performFCopySignCombine(N, DCI);
17392   case AMDGPUISD::CVT_F32_UBYTE0:
17393   case AMDGPUISD::CVT_F32_UBYTE1:
17394   case AMDGPUISD::CVT_F32_UBYTE2:
17395   case AMDGPUISD::CVT_F32_UBYTE3:
17396     return performCvtF32UByteNCombine(N, DCI);
17397   case AMDGPUISD::FMED3:
17398     return performFMed3Combine(N, DCI);
17399   case AMDGPUISD::CVT_PKRTZ_F16_F32:
17400     return performCvtPkRTZCombine(N, DCI);
17401   case AMDGPUISD::CLAMP:
17402     return performClampCombine(N, DCI);
17405     EVT VT = N->getValueType(0);
17408     if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17411       EVT EltVT = Src.getValueType();
17412       if (EltVT != MVT::i16)
17422     return performExtractVectorEltCombine(N, DCI);
17424     return performInsertVectorEltCombine(N, DCI);
17426     return performFPRoundCombine(N, DCI);
17435     return performMemSDNodeCombine(MemNode, DCI);
17466   unsigned Opcode = Node->getMachineOpcode();
17469   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17470   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17473   SDNode *Users[5] = {nullptr};
17475   unsigned DmaskIdx =
17476       AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17477   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17478   unsigned NewDmask = 0;
17479   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17480   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17481   bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17482                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17483   unsigned TFCLane = 0;
17484   bool HasChain = Node->getNumValues() > 1;
17486   if (OldDmask == 0) {
17494     TFCLane = OldBitsSet;
17498   for (SDUse &Use : Node->uses()) {
17501     if (Use.getResNo() != 0)
17504     SDNode *User = Use.getUser();
17507     if (!User->isMachineOpcode() ||
17508         User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17520     if (UsesTFC && Lane == TFCLane) {
17525     for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17527       Dmask &= ~(1 << Comp);
17535     NewDmask |= 1 << Comp;
17540   bool NoChannels = !NewDmask;
17547     if (OldBitsSet == 1)
17553   if (NewDmask == OldDmask)
17562   unsigned NewChannels = BitsSet + UsesTFC;
17566   assert(NewOpcode != -1 &&
17567          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17568          "failed to find equivalent MIMG op");
17576   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17578   MVT ResultVT = NewChannels == 1
17581                      : NewChannels == 5 ? 8
17583   SDVTList NewVTList =
17586   MachineSDNode *NewNode =
17595   if (NewChannels == 1) {
17605   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17610     if (i || !NoChannels)
17615     if (NewUser != User) {
17625       Idx = AMDGPU::sub1;
17628       Idx = AMDGPU::sub2;
17631       Idx = AMDGPU::sub3;
17634       Idx = AMDGPU::sub4;
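// Worked example of the dmask shrinking above (illustrative): an image load
// with dmask = 0b1111 returns four channels, but if only its first and third
// results are ever extracted the dmask becomes 0b0101 and a 2-channel MIMG
// opcode is chosen; the Comp/Lane bookkeeping above then rewires each old
// extract to the corresponding lane of the smaller result.
static unsigned shrinkDmask(unsigned OldDmask, unsigned UsedLanes /*bitmask*/) {
  unsigned NewDmask = 0, Comp = 0;
  for (unsigned Bit = 0; Bit < 4; ++Bit) {
    if (!(OldDmask & (1u << Bit)))
      continue;                   // this channel was never loaded
    if (UsedLanes & (1u << Comp)) // Comp = position among the loaded channels
      NewDmask |= 1u << Bit;      // keep only channels that are actually read
    ++Comp;
  }
  return NewDmask; // e.g. shrinkDmask(0b1111, 0b0101) == 0b0101
}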
17645     Op = Op.getOperand(0);
17666       MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17670       Node->getOperand(0), SL, VReg, SrcVal,
17676   return ToResultReg.getNode();
17681   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17683       Ops.push_back(Node->getOperand(i));
17689                                Node->getOperand(i).getValueType(),
17690                                Node->getOperand(i)),
17702   unsigned Opcode = Node->getMachineOpcode();
17704   if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17705       !TII->isGather4(Opcode) &&
17707     return adjustWritemask(Node, DAG);
17710   if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17716   case AMDGPU::V_DIV_SCALE_F32_e64:
17717   case AMDGPU::V_DIV_SCALE_F64_e64: {
17727         (Src0 == Src1 || Src0 == Src2))
17783       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17784   unsigned InitIdx = 0;
17786   if (TII->isImage(MI)) {
17794     unsigned TFEVal = TFE ? TFE->getImm() : 0;
17795     unsigned LWEVal = LWE ? LWE->getImm() : 0;
17796     unsigned D16Val = D16 ? D16->getImm() : 0;
17798     if (!TFEVal && !LWEVal)
17809     assert(MO_Dmask && "Expected dmask operand in instruction");
17811     unsigned dmask = MO_Dmask->getImm();
17816     bool Packed = !Subtarget->hasUnpackedD16VMem();
17818     InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17825     uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17826     if (DstSize < InitIdx)
17830     InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17838   Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17839   unsigned NewDst = 0;
17844   unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17845   unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17848   for (; SizeLeft; SizeLeft--, CurrIdx++) {
17849     NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17869   MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17881   if (TII->isVOP3(MI.getOpcode())) {
17883     TII->legalizeOperandsVOP3(MRI, MI);
17885   if (TII->isMAI(MI)) {
17890     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17891                                              AMDGPU::OpName::scale_src0);
17892     if (Src0Idx != -1) {
17893       int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17894                                                AMDGPU::OpName::scale_src1);
17895       if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17896           TII->usesConstantBus(MRI, MI, Src1Idx))
17897         TII->legalizeOpWithMove(MI, Src1Idx);
17904   if (TII->isImage(MI))
17905     TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17979 std::pair<unsigned, const TargetRegisterClass *>
17986   if (Constraint.size() == 1) {
17990     if (VT == MVT::Other)
17993     switch (Constraint[0]) {
18000         RC = &AMDGPU::SReg_32RegClass;
18003         RC = &AMDGPU::SGPR_64RegClass;
18008         return std::pair(0U, nullptr);
18015         return std::pair(0U, nullptr);
18017         RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
18018                                              : &AMDGPU::VGPR_32_Lo256RegClass;
18021         RC = Subtarget->has1024AddressableVGPRs()
18022                  ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
18025           return std::pair(0U, nullptr);
18030       if (!Subtarget->hasMAIInsts())
18034         return std::pair(0U, nullptr);
18036         RC = &AMDGPU::AGPR_32RegClass;
18041           return std::pair(0U, nullptr);
18046   } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
18050       RC = &AMDGPU::AV_32RegClass;
18053       RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
18055         return std::pair(0U, nullptr);
18064     return std::pair(0U, RC);
18067   if (Kind != '\0') {
18069       RC = &AMDGPU::VGPR_32_Lo256RegClass;
18070     } else if (Kind == 's') {
18071       RC = &AMDGPU::SGPR_32RegClass;
18072     } else if (Kind == 'a') {
18073       RC = &AMDGPU::AGPR_32RegClass;
18079       return std::pair(0U, nullptr);
18085       return std::pair(0U, nullptr);
18089       RC = TRI->getVGPRClassForBitWidth(Width);
18091       RC = TRI->getSGPRClassForBitWidth(Width);
18093       RC = TRI->getAGPRClassForBitWidth(Width);
18095       Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
18100         return std::pair(0U, nullptr);
18102     return std::pair(Reg, RC);
18108     return std::pair(0U, nullptr);
18109   if (Idx < RC->getNumRegs())
18111     return std::pair(0U, nullptr);
18117   Ret.second = TRI->getPhysRegBaseClass(Ret.first);
18123   if (Constraint.size() == 1) {
18124     switch (Constraint[0]) {
18134   } else if (Constraint == "DA" || Constraint == "DB") {
18142   if (Constraint.size() == 1) {
18143     switch (Constraint[0]) {
18151   } else if (Constraint.size() == 2) {
18152     if (Constraint == "VA")
18170                                                     std::vector<SDValue> &Ops,
18185   unsigned Size = Op.getScalarValueSizeInBits();
18189   if (Size == 16 && !Subtarget->has16BitInsts())
18193     Val = C->getSExtValue();
18197     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18201   if (Size != 16 || Op.getNumOperands() != 2)
18203   if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
18206     Val = C->getSExtValue();
18210     Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
18220   if (Constraint.size() == 1) {
18221     switch (Constraint[0]) {
18236   } else if (Constraint.size() == 2) {
18237     if (Constraint == "DA") {
18238       int64_t HiBits = static_cast<int32_t>(Val >> 32);
18239       int64_t LoBits = static_cast<int32_t>(Val);
18243     if (Constraint == "DB") {
18251                                               unsigned MaxSize) const {
18252   unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
18253   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
18255   MVT VT = Op.getSimpleValueType();
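// Usage-side illustration of the constraint letters handled above (a hedged
// example, not taken from this file): in GCN inline assembly "v" requests a
// VGPR, "s" an SGPR and "a" an AGPR, which is what the register-class
// selection above implements, while the two-letter "DA"/"DB" constraints go
// through the immediate checks that follow.
static int readThroughVGPR(int X) {
  int Y;
  __asm__("v_mov_b32 %0, %1" : "=v"(Y) : "v"(X)); // "v" -> a VGPR_32 register
  return Y;
}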
18280 switch (UnalignedClassID) {
18281 case AMDGPU::VReg_64RegClassID:
18282 return AMDGPU::VReg_64_Align2RegClassID;
18283 case AMDGPU::VReg_96RegClassID:
18284 return AMDGPU::VReg_96_Align2RegClassID;
18285 case AMDGPU::VReg_128RegClassID:
18286 return AMDGPU::VReg_128_Align2RegClassID;
18287 case AMDGPU::VReg_160RegClassID:
18288 return AMDGPU::VReg_160_Align2RegClassID;
18289 case AMDGPU::VReg_192RegClassID:
18290 return AMDGPU::VReg_192_Align2RegClassID;
18291 case AMDGPU::VReg_224RegClassID:
18292 return AMDGPU::VReg_224_Align2RegClassID;
18293 case AMDGPU::VReg_256RegClassID:
18294 return AMDGPU::VReg_256_Align2RegClassID;
18295 case AMDGPU::VReg_288RegClassID:
18296 return AMDGPU::VReg_288_Align2RegClassID;
18297 case AMDGPU::VReg_320RegClassID:
18298 return AMDGPU::VReg_320_Align2RegClassID;
18299 case AMDGPU::VReg_352RegClassID:
18300 return AMDGPU::VReg_352_Align2RegClassID;
18301 case AMDGPU::VReg_384RegClassID:
18302 return AMDGPU::VReg_384_Align2RegClassID;
18303 case AMDGPU::VReg_512RegClassID:
18304 return AMDGPU::VReg_512_Align2RegClassID;
18305 case AMDGPU::VReg_1024RegClassID:
18306 return AMDGPU::VReg_1024_Align2RegClassID;
18307 case AMDGPU::AReg_64RegClassID:
18308 return AMDGPU::AReg_64_Align2RegClassID;
18309 case AMDGPU::AReg_96RegClassID:
18310 return AMDGPU::AReg_96_Align2RegClassID;
18311 case AMDGPU::AReg_128RegClassID:
18312 return AMDGPU::AReg_128_Align2RegClassID;
18313 case AMDGPU::AReg_160RegClassID:
18314 return AMDGPU::AReg_160_Align2RegClassID;
18315 case AMDGPU::AReg_192RegClassID:
18316 return AMDGPU::AReg_192_Align2RegClassID;
18317 case AMDGPU::AReg_256RegClassID:
18318 return AMDGPU::AReg_256_Align2RegClassID;
18319 case AMDGPU::AReg_512RegClassID:
18320 return AMDGPU::AReg_512_Align2RegClassID;
18321 case AMDGPU::AReg_1024RegClassID:
18322 return AMDGPU::AReg_1024_Align2RegClassID;
18338   if (Info->isEntryFunction()) {
18345     unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18347             ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18348             : TRI->getAlignedHighSGPRForRC(MF, 2,
18349                                            &AMDGPU::SGPR_64RegClass);
18350     Info->setSGPRForEXECCopy(SReg);
18352   assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18353                              Info->getStackPtrOffsetReg()));
18354   if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18355     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18359   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18360     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18362   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18363     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18365   Info->limitOccupancy(MF);
18367   if (ST.isWave32() && !MF.empty()) {
18368     for (auto &MBB : MF) {
18369       for (auto &MI : MBB) {
18370         TII->fixImplicitOperands(MI);
18380   if (ST.needsAlignedVGPRs()) {
18381     for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18387       if (NewClassID != -1)
18388         MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18397                                                      const APInt &DemandedElts,
18399                                                      unsigned Depth) const {
18401   unsigned Opc = Op.getOpcode();
18404     unsigned IID = Op.getConstantOperandVal(0);
18406     case Intrinsic::amdgcn_mbcnt_lo:
18407     case Intrinsic::amdgcn_mbcnt_hi: {
18413           IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18423       Op, Known, DemandedElts, DAG, Depth);
18439   unsigned MaxValue =
18446                              unsigned BFEWidth, bool SExt, unsigned Depth) {
18450   unsigned Src1Cst = 0;
18451   if (Src1.isImm()) {
18452     Src1Cst = Src1.getImm();
18453   } else if (Src1.isReg()) {
18457     Src1Cst = Cst->Value.getZExtValue();
18468   if (Width >= BFEWidth)
18477     Known = Known.sext(BFEWidth);
18479     Known = Known.zext(BFEWidth);
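// Illustrative sketch of the S_BFE known-bits computation above, assuming the
// scalar BFE encoding where src1 packs the bit offset in [5:0] and the field
// width in [22:16]: the result's known bits are the source bits of the field,
// sign- or zero-extended back to the operation width.
#include "llvm/Support/KnownBits.h"
static llvm::KnownBits knownBitsForBFESketch(llvm::KnownBits Src,
                                             unsigned Src1Cst,
                                             unsigned BFEWidth, bool SExt) {
  unsigned Offset = Src1Cst & 0x3F;        // low 6 bits: first bit of the field
  unsigned Width = (Src1Cst >> 16) & 0x7F; // bits [22:16]: field width
  if (Width == 0 || Width >= BFEWidth)
    return Src; // degenerate cases are handled more carefully in the real code
  llvm::KnownBits Field = Src.extractBits(Width, Offset);
  return SExt ? Field.sext(BFEWidth) : Field.zext(BFEWidth);
}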
18485                                                         unsigned Depth) const {
18488   switch (MI->getOpcode()) {
18489 case AMDGPU::S_BFE_I32:
18492 case AMDGPU::S_BFE_U32:
18495 case AMDGPU::S_BFE_I64:
18498 case AMDGPU::S_BFE_U64:
18501 case AMDGPU::G_INTRINSIC:
18502 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18505 case Intrinsic::amdgcn_workitem_id_x:
18508 case Intrinsic::amdgcn_workitem_id_y:
18511 case Intrinsic::amdgcn_workitem_id_z:
18514 case Intrinsic::amdgcn_mbcnt_lo:
18515 case Intrinsic::amdgcn_mbcnt_hi: {
18527 case Intrinsic::amdgcn_groupstaticsize: {
18538 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18541 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18544 case AMDGPU::G_AMDGPU_SMED3:
18545 case AMDGPU::G_AMDGPU_UMED3: {
18546     auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18573                                                    unsigned Depth) const {
18580   AttributeList Attrs =
18582   if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18609   if (Header->getAlignment() != PrefAlign)
18610     return Header->getAlignment();
18612   unsigned LoopSize = 0;
18617     LoopSize += MBB->getAlignment().value() / 2;
18620       LoopSize += TII->getInstSizeInBytes(MI);
18621       if (LoopSize > 192)
18626   if (LoopSize <= 64)
18629   if (LoopSize <= 128)
18630     return CacheLineAlign;
18636     auto I = Exit->getFirstNonDebugInstr();
18637     if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18638       return CacheLineAlign;
18647     if (PreTerm == Pre->begin() ||
18648         std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18652     auto ExitHead = Exit->getFirstNonDebugInstr();
18653     if (ExitHead == Exit->end() ||
18654         ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18659   return CacheLineAlign;
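// Condensed form of the size thresholds used above (illustrative; assumes the
// cache line the CacheLineAlign constant stands for): the scan above bails out
// once the size estimate exceeds 192 bytes, small loops keep the default
// preferred alignment, and mid-sized loops get cache-line aligned; larger
// loops fall through to the S_INST_PREFETCH handling before the final
// `return CacheLineAlign;`.
static unsigned prefLoopAlignSketch(unsigned LoopSizeBytes, unsigned PrefAlign,
                                    unsigned CacheLineAlign) {
  if (LoopSizeBytes <= 64)
    return PrefAlign;      // small enough that the default already works
  if (LoopSizeBytes <= 128)
    return CacheLineAlign; // one realignment is enough, no prefetch needed
  return CacheLineAlign;   // prefetch-insertion path shown above
}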
18667     N = N->getOperand(0).getNode();
18677   switch (N->getOpcode()) {
18685     if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18686       return !TRI->isSGPRReg(MRI, Reg);
18692     return !TRI->isSGPRReg(MRI, Reg);
18696     unsigned AS = L->getAddressSpace();
18706 case AMDGPUISD::ATOMIC_CMP_SWAP:
18707 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18708 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18709 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18710 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18711 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18712 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18713 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18714 case AMDGPUISD::BUFFER_ATOMIC_AND:
18715 case AMDGPUISD::BUFFER_ATOMIC_OR:
18716 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18717 case AMDGPUISD::BUFFER_ATOMIC_INC:
18718 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18719 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18720 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18721 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18722 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18728     return A->readMem() && A->writeMem();
18749   switch (Ty.getScalarSizeInBits()) {
18761                                                    const APInt &DemandedElts,
18764                                                    unsigned Depth) const {
18765   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18769     if (Info->getMode().DX10Clamp)
18781   if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18801          << "Hardware instruction generated for atomic "
18803          << " operation at memory scope " << MemScope;
18808     Type *EltTy = VT->getElementType();
18809     return VT->getNumElements() == 2 &&
18829     unsigned BW = IT->getBitWidth();
18830     return BW == 32 || BW == 64;
18844     unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18845     return BW == 32 || BW == 64;
18848   if (Ty->isFloatTy() || Ty->isDoubleTy())
18852   return VT->getNumElements() == 2 &&
18853          VT->getElementType()->getPrimitiveSizeInBits() == 16;
18863                                        bool HasSystemScope) {
18870   if (HasSystemScope) {
18871     if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
18874     if (Subtarget.hasEmulatedSystemScopeAtomics())
18876   } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
18879   return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18892   const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18900   return STI.hasGloballyAddressableScratch()
18918       DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18931   bool HasSystemScope =
18963     if (!IT || IT->getBitWidth() != 32)
18969     if (Subtarget->hasEmulatedSystemScopeAtomics())
18985     if (!HasSystemScope &&
18986         Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
18998     if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
19006         ConstVal && ConstVal->isNullValue())
19044   if (Ty->isFloatTy()) {
19049   if (Ty->isDoubleTy()) {
19070     if (Ty->isFloatTy() &&
19071         !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
19084       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19088       if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
19092       if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
19097       if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
19102     if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
19106     if (Ty->isFloatTy()) {
19109       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19112       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19117           Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
19125     if (Subtarget->hasFlatAtomicFaddF32Inst())
19134     if (Subtarget->hasLDSFPAtomicAddF32()) {
19135       if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
19137       if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
19165     if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
19167     if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
19171     if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
19173     if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
19227   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
19228     return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
19229                                  : &AMDGPU::SReg_32RegClass;
19230   if (!TRI->isSGPRClass(RC) && !isDivergent)
19231     return TRI->getEquivalentSGPRClass(RC);
19232   if (TRI->isSGPRClass(RC) && isDivergent) {
19233     if (Subtarget->hasGFX90AInsts())
19234       return TRI->getEquivalentAVClass(RC);
19235     return TRI->getEquivalentVGPRClass(RC);
19248                       unsigned WaveSize) {
19253   if (!IT || IT->getBitWidth() != WaveSize)
19258   if (!Visited.insert(V).second)
19260   bool Result = false;
19261   for (const auto *U : V->users()) {
19263       if (V == U->getOperand(1)) {
19268         case Intrinsic::amdgcn_if_break:
19269         case Intrinsic::amdgcn_if:
19270         case Intrinsic::amdgcn_else:
19275       if (V == U->getOperand(0)) {
19280         case Intrinsic::amdgcn_end_cf:
19281         case Intrinsic::amdgcn_loop:
19287     Result = hasCFUser(U, Visited, WaveSize);
19296                                                 const Value *V) const {
19298   if (CI->isInlineAsm()) {
19307     for (auto &TC : TargetConstraints) {
19321   return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19349   return MRI.hasOneNonDBGUse(N0);
19356   if (I.getMetadata("amdgpu.noclobber"))
19358   if (I.getMetadata("amdgpu.last.use"))
19422   Alignment = RMW->getAlign();
19435   bool FullFlatEmulation =
19437       ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19438        (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19439         RMW->getType()->isDoubleTy()));
19442   bool ReturnValueIsUsed = !AI->use_empty();
19451   if (FullFlatEmulation) {
19462   std::prev(BB->end())->eraseFromParent();
19463   Builder.SetInsertPoint(BB);
19465   Value *LoadedShared = nullptr;
19466   if (FullFlatEmulation) {
19467     CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19468                                                  {Addr}, nullptr, "is.shared");
19469     Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19470     Builder.SetInsertPoint(SharedBB);
19471     Value *CastToLocal = Builder.CreateAddrSpaceCast(
19477     LoadedShared = Clone;
19479     Builder.CreateBr(PhiBB);
19480     Builder.SetInsertPoint(CheckPrivateBB);
19483   CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19484                                                 {Addr}, nullptr, "is.private");
19485   Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19487   Builder.SetInsertPoint(PrivateBB);
19489   Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19492   Value *LoadedPrivate;
19494     LoadedPrivate = Builder.CreateAlignedLoad(
19495         RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19498                                  LoadedPrivate, RMW->getValOperand());
19500     Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19502     auto [ResultLoad, Equal] =
19508     LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19511   Builder.CreateBr(PhiBB);
19513   Builder.SetInsertPoint(GlobalBB);
19517   if (FullFlatEmulation) {
19518     Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19527   if (!FullFlatEmulation) {
19532     MDNode *RangeNotPrivate =
19535     LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19539   Builder.CreateBr(PhiBB);
19541   Builder.SetInsertPoint(PhiBB);
19543   if (ReturnValueIsUsed) {
19546     if (FullFlatEmulation)
19547       Loaded->addIncoming(LoadedShared, SharedBB);
19548     Loaded->addIncoming(LoadedPrivate, PrivateBB);
19549     Loaded->addIncoming(LoadedGlobal, GlobalBB);
19550     Loaded->takeName(AI);
19553   Builder.CreateBr(ExitBB);
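// Shape of the control flow the expansion above builds for a flat atomicrmw
// that may alias LDS or scratch (a condensed illustration; block names match
// the fragment):
//
//   entry:          br (llvm.amdgcn.is.shared p), SharedBB, CheckPrivateBB
//   SharedBB:       cast p to addrspace(3), redo the atomic there, br PhiBB
//   CheckPrivateBB: br (llvm.amdgcn.is.private p), PrivateBB, GlobalBB
//   PrivateBB:      cast p to addrspace(5), load + scalar update + store, br PhiBB
//   GlobalBB:       cast p to addrspace(1) and issue the real atomic, br PhiBB
//   PhiBB:          phi of the three loaded values when the result is used
//
// When FullFlatEmulation is false only the is.private split is emitted and the
// global path keeps the flat pointer, tagged with !noalias.addrspace metadata.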
19557                                              unsigned PtrOpIdx) {
19558   Value *PtrOp = I->getOperand(PtrOpIdx);
19565   I->setOperand(PtrOpIdx, ASCast);
19577       ConstVal && ConstVal->isNullValue()) {
19607          "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19615          "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19630   LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
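An illustrative sketch (not from the documented source) of how a lowering hook might inspect an AtomicRMWInst with the accessors and BinOp enumerators listed above; the predicate itself is a made-up example:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True for the wrapping unsigned increment/decrement operations.
static bool isWrappingUnsignedRMW(const AtomicRMWInst &RMW) {
  switch (RMW.getOperation()) {
  case AtomicRMWInst::UIncWrap:
  case AtomicRMWInst::UDecWrap:
    return true;
  default:
    return false;
  }
}

// Address space of the pointer operand (e.g. to distinguish global from
// local memory on a GPU target).
static unsigned rmwAddrSpace(const AtomicRMWInst &RMW) {
  return RMW.getPointerAddressSpace();
}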
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
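A sketch of the usual shape of formal-argument analysis with CCState, as LowerFormalArguments implementations tend to do (illustrative only; AssignFn stands in for a target's CCAssignFn):

#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// Fill ArgLocs with a register or stack location for every incoming value.
static void analyzeFormals(CallingConv::ID CC, bool IsVarArg,
                           MachineFunction &MF,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           CCAssignFn *AssignFn,
                           SmallVectorImpl<CCValAssign> &ArgLocs) {
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
}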
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
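An illustrative sketch (not from the documented source) of querying a couple of the subtarget predicates listed above; the combination is a made-up policy, and the include path assumes a file inside the AMDGPU target directory:

#include "GCNSubtarget.h" // AMDGPU target-private header
using namespace llvm;

// Largest scratch (private) access size, in bytes, for this subtarget.
static unsigned maxPrivateEltSize(const GCNSubtarget &ST) {
  return ST.getMaxPrivateElementSize(/*ForBufferRSrc=*/false);
}

// A made-up policy combining two of the predicates listed above.
static bool preferExamplePath(const GCNSubtarget &ST) {
  return ST.hasMin3Max3_16() && ST.supportsWaveWideBPermute();
}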
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
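A small sketch of the LLT constructors and queries listed above (illustrative only; the address space number is arbitrary, and the header path is the one used in current LLVM trees):

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltDemo() {
  LLT S32 = LLT::scalar(32);                         // 32-bit bag of bits
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  LLT S16 = S32.changeElementSize(16);               // a 16-bit scalar
  (void)S16;
  (void)S32.getSizeInBits();                         // TypeSize of 32 bits
  (void)P1.getScalarSizeInBits();                    // 64 for this pointer
}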
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
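An illustrative sketch of the MVT helpers listed above (not from the documented source; widths are arbitrary, and the header path is the one used in current LLVM trees):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static bool mvtDemo() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, /*NumElements=*/4);
  return V4I32.isVector() &&
         V4I32.getVectorNumElements() == 4 &&
         V4I32.getScalarType() == MVT::i32 &&
         V4I32.getStoreSize().getFixedValue() == 16 &&  // 4 x 4 bytes
         MVT::getIntegerVT(16) == MVT::i16;
}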
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
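A sketch of the MachineInstrBuilder chaining pattern listed above (illustrative only; the opcode description and registers are supplied by the caller, so nothing here is target-specific):

#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Emit "Dst = <MovDesc> Src" before iterator I.
static MachineInstr *emitCopyLike(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  const DebugLoc &DL,
                                  const MCInstrDesc &MovDesc, Register Dst,
                                  Register Src) {
  return BuildMI(MBB, I, DL, MovDesc, Dst) // adds Dst as a def operand
      .addReg(Src)                         // use of Src
      .getInstr();
}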
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
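An illustrative sketch combining MachineFunction::getMachineMemOperand (listed earlier) with the MachineMemOperand flags listed above; the flag choice and size are examples only:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// Allocate an MMO describing a 4-byte, 4-aligned, dereferenceable load.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
}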
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
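A sketch of the typical shape of a small custom-lowering helper built from the SelectionDAG factory methods listed above (illustrative only; the transformation itself, splitting a 64-bit value and ORing the halves, is just an example):

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Split a 64-bit integer value into 32-bit halves, OR them together, and
// extend the result back to the original type.
static SDValue lowerExampleOr64(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();   // assumed to be MVT::i64 in this sketch
  auto [Lo, Hi] = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
  SDValue OrLoHi = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
  return DAG.getZExtOrTrunc(OrLoHi, DL, VT);
}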
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
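An illustrative sketch of the StringSwitch pattern listed above, as commonly used when parsing constraint strings; the cases and return values are placeholders:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a constraint letter to an arbitrary numeric class; unknown strings
// fall through to 0.
static int classifyExampleConstraint(StringRef Constraint) {
  return StringSwitch<int>(Constraint)
      .Case("s", 1)
      .Case("v", 2)
      .Default(0);
}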
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
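A minimal sketch of the promote-and-retype idiom these two calls support (a common pattern across targets, shown only to illustrate the API, not this file's exact choices):
// Legalize v2i16 loads by reusing the i32 load handling.
setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);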
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
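For example, a target constructor might opt into combines for a handful of generic nodes (the opcode choice here is illustrative):
// PerformDAGCombine will now be consulted for these node kinds.
setTargetDAGCombine({ISD::ADD, ISD::FADD, ISD::SELECT_CC});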
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
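A sketch of how a custom load-lowering path might fall back to this helper; lowerUnsupportedLoad is a hypothetical wrapper, not a function in this file:
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

SDValue lowerUnsupportedLoad(SDValue Op, SelectionDAG &DAG,
                             const TargetLowering &TLI) {
  LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
  // Split the unaligned access into smaller aligned loads plus shifts/ors.
  auto [Val, Chain] = TLI.expandUnalignedLoad(LD, DAG);
  return DAG.getMergeValues({Val, Chain}, SDLoc(Op));
}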
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
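A hedged sketch of iterating a register class with these accessors; findUnreservedReg is a hypothetical helper:
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

MCPhysReg findUnreservedReg(const TargetRegisterClass &RC,
                            const MachineRegisterInfo &MRI) {
  // begin()/end() make the class usable in a range-based for loop.
  for (MCPhysReg Reg : RC)
    if (!MRI.isReserved(Reg))
      return Reg;
  return 0; // nothing free
}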
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
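These Value methods are typically used together in the standard replace-and-erase idiom (assuming llvm/IR/Instruction.h); OldInst and NewInst are assumed Instruction pointers already created:
NewInst->takeName(OldInst);           // keep the readable IR name
OldInst->replaceAllUsesWith(NewInst); // redirect every use to the new value
OldInst->eraseFromParent();           // the old instruction is now dead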
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point use.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
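A small sketch of querying the inline-literal helpers (assuming the in-tree Utils/AMDGPUBaseInfo.h header; fitsInline is a hypothetical wrapper):
bool fitsInline(int32_t Imm32, int64_t Imm64, bool HasInv2Pi) {
  // HasInv2Pi is true on subtargets that also accept 1/(2*pi) as an inline value.
  return AMDGPU::isInlinableLiteral32(Imm32, HasInv2Pi) &&
         AMDGPU::isInlinableLiteral64(Imm64, HasInv2Pi);
}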
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
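A minimal IR-level example of the pattern matchers listed above (llvm/IR/PatternMatch.h); isIncrement is a hypothetical helper:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true and captures X when V is `add X, 1`.
bool isIncrement(Value *V, Value *&X) {
  return match(V, m_Add(m_Value(X), m_One()));
}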
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
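A short sketch exercising a few of these integer utilities (llvm/Support/MathExtras.h and llvm/ADT/bit.h); splitWord is a hypothetical helper:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

void splitWord(uint64_t V) {
  uint32_t Hi = Hi_32(V);            // upper 32 bits
  uint32_t Lo = Lo_32(V);            // lower 32 bits
  bool FitsU16 = isUInt<16>(Lo);     // fits in 16 unsigned bits?
  bool Pow2 = isPowerOf2_32(Lo);     // power-of-two test
  int TZ = countr_zero(Lo);          // trailing zero count
  (void)Hi; (void)FitsU16; (void)Pow2; (void)TZ;
}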
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
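A hedged sketch of building and querying an EVT (llvm/CodeGen/ValueTypes.h); evtExample is a hypothetical function:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void evtExample(LLVMContext &Ctx) {
  EVT VecVT = EVT::getVectorVT(Ctx, MVT::f16, 4); // v4f16
  EVT EltVT = VecVT.getVectorElementType();       // f16
  EVT IntVT = VecVT.changeTypeToInteger();        // v4i16
  bool SameSize = VecVT.bitsEq(IntVT);            // true: both are 64 bits
  (void)EltVT; (void)SameSize;
}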
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
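A small sketch combining these KnownBits queries (llvm/Support/KnownBits.h); knownLeadingZerosOfSum is a hypothetical helper:
#include "llvm/Support/KnownBits.h"
using namespace llvm;

unsigned knownLeadingZerosOfSum() {
  KnownBits LHS(32), RHS(32);
  LHS.Zero.setHighBits(24); // LHS is known to fit in the low 8 bits
  RHS.Zero.setHighBits(24); // RHS likewise
  KnownBits Sum = KnownBits::add(LHS, RHS);
  // The sum fits in 9 bits, so at least 23 leading zeros are known.
  return Sum.countMinLeadingZeros();
}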
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const