#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
                   TRI->getDefaultVectorSuperClassForBitWidth(32);
                   TRI->getDefaultVectorSuperClassForBitWidth(64);
                   TRI->getDefaultVectorSuperClassForBitWidth(320));
                   TRI->getDefaultVectorSuperClassForBitWidth(352));
                   TRI->getDefaultVectorSuperClassForBitWidth(384));
                   TRI->getDefaultVectorSuperClassForBitWidth(512));
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {
                   TRI->getDefaultVectorSuperClassForBitWidth(1024));
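  // Register classes for wide vector values are picked purely by bit width;
  // the 16-bit paths depend on whether true16 instructions are in use. The
  // calls that follow assign per-type operation actions for these vector MVTs.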
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
                {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
                 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
                 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
                 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
                 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
                 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
                 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
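  // Each 64-bit-element vector type is configured in turn by the loops below.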
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasMadMacF32Insts())
  if (Subtarget->hasIntClamp())
  if (Subtarget->hasAddNoCarry())

                     {MVT::f32, MVT::f64}, Custom);
                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())
  if (Subtarget->has16BitInsts()) {
  if (Subtarget->hasBF16TransInsts())

                {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
                 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
                 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
                     {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                      MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                      MVT::v32f16, MVT::v32bf16},
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
                 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {
                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
                     {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                      MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                      MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
                     {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})

  if (Subtarget->hasPackedFP32Ops()) {
                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {
                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVectorMulU64())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                     {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())
  if (Subtarget->hasMinimum3Maximum3PKF16()) {
    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())
                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {
  if (Subtarget->hasBF16PackedInsts()) {
  if (Subtarget->hasBF16TransInsts()) {
  if (Subtarget->hasCvtPkF16F32Inst()) {
                     {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
static const MCPhysReg RCRegs[] = {AMDGPU::MODE};

                                   EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                   LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&

  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

  if (Size == 16 && Subtarget->has16BitInsts())
    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
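  // Register counting for the calling convention: 16-bit elements pack two per
  // 32-bit register ((NumElts + 1) / 2), while wider elements each occupy
  // (Size + 31) / 32 registers, i.e. the element size rounded up to whole
  // 32-bit registers.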
                                   unsigned &NumIntermediates, MVT &RegisterVT) const {
  if (Size == 16 && Subtarget->has16BitInsts()) {
    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
                                   unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);
  unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                   unsigned MaxNumLanes) {
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;
      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;
       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
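  // The async LDS copies and cooperative atomics above are grouped by access
  // width; each group reports a memory width matching its b8/b32/b64/b128
  // (or 4B/8B/16B) suffix.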
                                   unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  if (RsrcIntr->IsImage) {
    Info.ptrVal = RsrcArg;

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
                              std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
                            std::numeric_limits<unsigned>::max());
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                       ->getElementType(0));
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {
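    // The monitored, transposed and cluster loads grouped above all behave
    // like ordinary loads for memory-operand purposes: a single pointer
    // operand and a read-only access of the intrinsic's result width.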
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {
  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                   Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();

                                   unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
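  // Flat addressing modes allow no scaled index; base register plus immediate
  // offset is legal only if the offset fits the FLAT offset field for this
  // address space and flat variant.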
  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
  if (AM.HasBaseReg) {
    return isLegalMUBUFAddressingMode(AM);

  if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->enableFlatScratch()
               : isLegalMUBUFAddressingMode(AM);

                                   unsigned Size, unsigned AddrSpace,
                                   Align Alignment,
    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
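    // For DS accesses the IsFast out-parameter encodes the access width (in
    // bits) that is considered fast, and 0 when the access would be slow or
    // need splitting.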
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                   unsigned *IsFast) const {
                                   Alignment, Flags, IsFast);

                                   const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                   unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                   unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {

  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                   const SDLoc &SL) const {
                                   const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(
  MachinePointerInfo PtrInfo =
    int64_t OffsetDiff = Offset - AlignDownOffset;
    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                   const SDLoc &SL) const {
      ExtType, SL, VA.getLocVT(), Chain, FIN,
  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;

SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&
      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;
      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;
        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;
      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
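  // When the cluster dimensions are fixed at compile time, the cluster-local
  // workgroup ids and max ids above fold to constants via LoadConstant instead
  // of being read from an SGPR.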
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);

  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                                 unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);

  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
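  // Walk the formal arguments in order; inreg arguments are preloaded into
  // user SGPRs until the first argument that cannot be preloaded ends the
  // sequence.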
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                   bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);

  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

    if (ST.getFrameLowering()->hasFP(MF)) {
      Info.setFrameOffsetReg(AMDGPU::SGPR33);

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)

  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if (!Subtarget->enableFlatScratch())
        !Subtarget->hasArchitectedSGPRs())
      assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
             !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();
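  // Pixel shaders must have at least one PS input enabled; if the checks
  // below find none enabled, input 0 is force-enabled so the hardware has
  // something to provide.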
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->enableFlatScratch())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
                                  {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                     TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                      PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    Val = convertABITypeToValueType(DAG, Val, VA, DL);

    auto &ArgUsageInfo =
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
            .getCachedResult<AMDGPUArgumentUsageAnalysis>(M);
      ArgUsageInfo->setFuncArgInfo(Fn, Info->getArgInfo());

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
                                   const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

    unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
    unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
    for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
      if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  unsigned Opc = AMDGPUISD::ENDPGM;
    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;

    auto &ArgUsageInfo =
        &ArgUsageInfo.getArgUsageInfo().lookupFuncArgInfo(*CalleeFunc);
  } else if (auto *MFAM = DAG.getMFAM()) {
    auto *ArgUsageInfo =
      CalleeArgInfo = &ArgUsageInfo->lookupFuncArgInfo(*CalleeFunc);

    const auto [OutgoingArg, ArgRC, ArgTy] =
    const auto [IncomingArg, IncomingArgRC, Ty] =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                         : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
        dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

      if (FlagsValue.isZero()) {
        if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
              "no additional args allowed if flags == 0");
        if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
        if (!Subtarget->isWave32()) {
              CLI, InVals, "dynamic VGPR mode is only supported for wave32");
        UsesDynamicVGPRs = true;
        std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                      CLI.Args.end(), PushNodeOrTargetConstant);

  bool IsSibCall = false;
        "unsupported call to variadic function ");
        "unsupported required tail call to function ");
                                           Outs, OutVals, Ins, DAG);
           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)
  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);

  if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                       MVT::Glue, GlueOps),
    Ops.push_back(InGlue);

  unsigned OPC = AMDGPUISD::TC_RETURN;
    OPC = AMDGPUISD::TC_RETURN_GFX;
    OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                           : AMDGPUISD::TC_RETURN_CHAIN;
  if (Info->isWholeWaveFunction())
    OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
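  // Dynamic stack allocation operates on per-lane scratch, so sizes and
  // alignments are scaled by the wavefront size, hence the shifts by
  // getWavefrontSizeLog2() above.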
  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);
                  Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =
                                   TableEntry, EnumOffset);
          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {
    SDValue RoundModeTimesNumBits =
    SDValue RoundModeTimesNumBits =
      NewMode = TruncTable;
                          ReadFirstLaneID, NewMode);
                  IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())
  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)
                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)
                          ReadFirstLaneID, NewModeReg);
                          ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =
                  IntrinID, ModeHwRegImm, NewModeReg);
                  IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:
  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  MBB.addSuccessor(LoopBB);
  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);
    Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                         unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                         unsigned InitSaveExecReg, int Offset,
                         bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
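  // Waterfall loop body: read the index from the first active lane, enable
  // exactly the lanes whose index matches it, run one iteration for them, and
  // repeat until every active lane has been handled.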
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

                         unsigned InitResultReg, unsigned PhiReg, int Offset,
                         bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
  return std::pair(AMDGPU::sub0, Offset);

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);
                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
  MI.eraseFromParent();

  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_SUB_F32_e64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:
        "Unexpected opcode in getIdentityValueFor32BitWaveReduction");

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
        "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;

  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64;

  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

  case AMDGPU::S_MIN_U32:
  case AMDGPU::S_MIN_I32:
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::S_MAX_U32:
  case AMDGPU::S_MAX_I32:
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32: {
  case AMDGPU::V_CMP_LT_U64_e64:
  case AMDGPU::V_CMP_LT_I64_e64:
  case AMDGPU::V_CMP_GT_U64_e64:
  case AMDGPU::V_CMP_GT_I64_e64:
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64: {
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::V_SUB_F32_e64: {

    Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    unsigned BitCountOpc =
        IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;

    auto NewAccumulator =

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64: {
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          .addReg(NewAccumulator->getOperand(0).getReg())

      if (Opc == AMDGPU::S_XOR_B32) {
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
            TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
            MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5658 case AMDGPU::S_SUB_I32: {
5659 Register NegatedVal =
MRI.createVirtualRegister(DstRegClass);
5667 .
addReg(NewAccumulator->getOperand(0).getReg());
5670 case AMDGPU::S_ADD_I32: {
5673 .
addReg(NewAccumulator->getOperand(0).getReg());
5676 case AMDGPU::S_ADD_U64_PSEUDO:
5677 case AMDGPU::S_SUB_U64_PSEUDO: {
5678 Register DestSub0 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5679 Register DestSub1 =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5681 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5683 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5684 Register CarryReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5685 Register AddReg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5687 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5689 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5693 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
5696 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
5698 MI,
MRI,
MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
5700 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5703 .
addReg(NewAccumulator->getOperand(0).getReg())
5713 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
5715 : NewAccumulator->getOperand(0).getReg();
5726 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5732 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5738 BuildMI(BB,
MI,
DL,
TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
5745 case AMDGPU::V_ADD_F32_e64:
5746 case AMDGPU::V_SUB_F32_e64: {
5748 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5749 Register DstVreg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5753 .
addReg(NewAccumulator->getOperand(0).getReg())
5758 unsigned srcMod =
Opc == AMDGPU::V_SUB_F32_e64 ? 1 : 0;
5766 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
5795 Register LoopIterator =
MRI.createVirtualRegister(WaveMaskRegClass);
5796 Register IdentityValReg =
MRI.createVirtualRegister(DstRegClass);
5797 Register AccumulatorReg =
MRI.createVirtualRegister(DstRegClass);
5798 Register ActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5799 Register NewActiveBitsReg =
MRI.createVirtualRegister(WaveMaskRegClass);
5800 Register FF1Reg =
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5801 Register LaneValueReg =
MRI.createVirtualRegister(DstRegClass);
5803 bool IsWave32 = ST.isWave32();
5804 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5805 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
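  // General reduction loop: the accumulator starts at the identity value,
  // then the loop below walks the set bits of EXEC, reads each active lane's
  // value with V_READLANE (S_FF1 finds the next lane), folds it into the
  // accumulator, clears that bit with S_BITSET0, and branches back while any
  // bits remain (S_CMP_LG + S_CBRANCH_SCC1).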
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)

  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
  I = ComputeLoop->end();

  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

            MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
        Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),

        NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                                 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)

    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
          .addReg(LaneValue->getOperand(0).getReg())
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
                                    VregClass, AMDGPU::sub0, VSubRegClass);
                                    VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);

      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)

      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());

  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)

  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);

  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();

  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
  case AMDGPU::S_UADDO_PSEUDO:
6037 case AMDGPU::S_USUBO_PSEUDO: {
6043 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6045 : AMDGPU::S_SUB_U32;
6053 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6056 MI.eraseFromParent();
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
case AMDGPU::V_ADD_U64_PSEUDO:
case AMDGPU::V_SUB_U64_PSEUDO: {
  bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
  if (ST.hasAddSubU64Insts()) {
    TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  if (IsAdd && ST.hasLshlAddU64Inst()) {
    TII->legalizeOperands(*Add);
    MI.eraseFromParent();
  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
  : &AMDGPU::VReg_64RegClass;
  : &AMDGPU::VReg_64RegClass;
  TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
  TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
  IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  TII->legalizeOperands(*LoHalf);
  TII->legalizeOperands(*HiHalf);
  MI.eraseFromParent();
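// Editor's annotation (sketch, not original source): when the subtarget has
// no native 64-bit VALU add/sub, the pseudo is expanded into two 32-bit
// halves linked through the carry bit: V_ADD_CO_U32_e64 produces the low
// dword plus a carry in a wave-mask register, V_ADDC_U32_e64 consumes that
// carry while producing the high dword, and REG_SEQUENCE recombines them.
//
//   (DestSub0, Carry) = V_ADD_CO_U32 (Src0.sub0, Src1.sub0)
//   (DestSub1, dead)  = V_ADDC_U32   (Src0.sub1, Src1.sub1, Carry)
//   Dest              = REG_SEQUENCE (DestSub0, sub0, DestSub1, sub1)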
case AMDGPU::S_ADD_CO_PSEUDO:
case AMDGPU::S_SUB_CO_PSEUDO: {
  Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
  Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
  Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
  if (ST.isWave64()) {
    if (ST.hasScalarCompareEq64()) {
    TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
    MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
    MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
    Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
  unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO ? AMDGPU::S_ADDC_U32
                                                           : AMDGPU::S_SUBB_U32;
  ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
  MI.eraseFromParent();
case AMDGPU::SI_INIT_M0: {
  TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
  MI.eraseFromParent();
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
  TII->get(AMDGPU::S_CMP_EQ_U32))
case AMDGPU::GET_GROUPSTATICSIZE: {
      .add(MI.getOperand(0))
  MI.eraseFromParent();
case AMDGPU::GET_SHADERCYCLESHILO: {
  Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
  Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
  Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
  Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      .add(MI.getOperand(0))
  MI.eraseFromParent();
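// Editor's annotation (summary, not original text): GET_SHADERCYCLESHILO
// reads the 64-bit shader-cycles counter as HI, LO, HI. If the two HI reads
// agree, no carry from LO into HI occurred between them and the LO read is
// valid; otherwise the expansion falls back to a value consistent with the
// second HI read. This avoids needing an atomic 64-bit counter read.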
case AMDGPU::SI_INDIRECT_SRC_V1:
case AMDGPU::SI_INDIRECT_SRC_V2:
case AMDGPU::SI_INDIRECT_SRC_V3:
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V5:
case AMDGPU::SI_INDIRECT_SRC_V6:
case AMDGPU::SI_INDIRECT_SRC_V7:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V9:
case AMDGPU::SI_INDIRECT_SRC_V10:
case AMDGPU::SI_INDIRECT_SRC_V11:
case AMDGPU::SI_INDIRECT_SRC_V12:
case AMDGPU::SI_INDIRECT_SRC_V16:
case AMDGPU::SI_INDIRECT_SRC_V32:
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V3:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V5:
case AMDGPU::SI_INDIRECT_DST_V6:
case AMDGPU::SI_INDIRECT_DST_V7:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V9:
case AMDGPU::SI_INDIRECT_DST_V10:
case AMDGPU::SI_INDIRECT_DST_V11:
case AMDGPU::SI_INDIRECT_DST_V12:
case AMDGPU::SI_INDIRECT_DST_V16:
case AMDGPU::SI_INDIRECT_DST_V32:
case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
case AMDGPU::SI_KILL_I1_PSEUDO:
case AMDGPU::V_CNDMASK_B64_PSEUDO: {
  Register SrcCond = MI.getOperand(3).getReg();
  Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const auto *CondRC = TRI->getWaveMaskRegClass();
  Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
  : &AMDGPU::VReg_64RegClass;
  : &AMDGPU::VReg_64RegClass;
  TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
  TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
  MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
  MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
  MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
  MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
  MI.eraseFromParent();
case AMDGPU::SI_BR_UNDEF: {
      .add(MI.getOperand(0));
  MI.eraseFromParent();
case AMDGPU::ADJCALLSTACKUP:
case AMDGPU::ADJCALLSTACKDOWN: {
case AMDGPU::SI_CALL_ISEL: {
  unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
  MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
  MI.eraseFromParent();
case AMDGPU::V_ADD_CO_U32_e32:
case AMDGPU::V_SUB_CO_U32_e32:
case AMDGPU::V_SUBREV_CO_U32_e32: {
  unsigned Opc = MI.getOpcode();
  bool NeedClampOperand = false;
  if (TII->pseudoToMCOpcode(Opc) == -1) {
    NeedClampOperand = true;
  if (TII->isVOP3(*I)) {
  I.add(MI.getOperand(1)).add(MI.getOperand(2));
  if (NeedClampOperand)
  TII->legalizeOperands(*I);
  MI.eraseFromParent();
case AMDGPU::V_ADDC_U32_e32:
case AMDGPU::V_SUBB_U32_e32:
case AMDGPU::V_SUBBREV_U32_e32:
  TII->legalizeOperands(MI);
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
case AMDGPU::DS_GWS_SEMA_V:
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
case AMDGPU::S_SETREG_B32: {
  const unsigned SetMask = WidthMask << Offset;
  unsigned SetDenormOp = 0;
  unsigned SetRoundOp = 0;
  SetRoundOp = AMDGPU::S_ROUND_MODE;
  SetDenormOp = AMDGPU::S_DENORM_MODE;
  SetRoundOp = AMDGPU::S_ROUND_MODE;
  SetDenormOp = AMDGPU::S_DENORM_MODE;
  if (SetRoundOp || SetDenormOp) {
    if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
      unsigned ImmVal = Def->getOperand(1).getImm();
      MI.eraseFromParent();
  MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
case AMDGPU::S_INVERSE_BALLOT_U32:
case AMDGPU::S_INVERSE_BALLOT_U64:
  MI.setDesc(TII->get(AMDGPU::COPY));
case AMDGPU::ENDPGM_TRAP: {
  MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
  MI.eraseFromParent();
case AMDGPU::SIMULATED_TRAP: {
  assert(Subtarget->hasPrivEnabledTrap2NopBug());
  TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
  MI.eraseFromParent();
case AMDGPU::SI_TCRETURN_GFX_WholeWave:
case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
  assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
  Register OriginalExec = Setup->getOperand(0).getReg();
  MI.getOperand(0).setReg(OriginalExec);
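// Editor's annotation (summary): for whole-wave function returns, the
// expansion locates the matching SI_SETUP_WHOLE_WAVE_FUNC and feeds the EXEC
// mask saved there back into the return, restoring the caller's original
// lane mask on exit.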
return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
if (!Subtarget->hasMadMacF32Insts())
  return Subtarget->hasFastFMAF32();
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
switch (Ty.getScalarSizeInBits()) {
if (Ty.getScalarSizeInBits() == 16)
if (Ty.getScalarSizeInBits() == 32)
  return Subtarget->hasMadMacF32Insts() &&
EVT VT = N->getValueType(0);
return Subtarget->hasMadMacF32Insts() &&
if (VT == MVT::f16) {
  return Subtarget->hasMadF16() &&
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
       VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
       VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
       VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
       VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
       VT == MVT::v32bf16);
[[maybe_unused]] EVT VT = Op.getValueType();
assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
        VT == MVT::v16i32) &&
       "Unexpected ValueType.");
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
       VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
       VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
       VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
       VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
       VT == MVT::v32bf16);
DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
       VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
       VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
       VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
       VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
       VT == MVT::v32bf16);
: std::pair(Op0, Op0);
DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
switch (Op.getOpcode()) {
  return LowerBRCOND(Op, DAG);
  return LowerRETURNADDR(Op, DAG);
  assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
         "Load should return a value and a chain");
  EVT VT = Op.getValueType();
  return lowerFSQRTF32(Op, DAG);
  return lowerFSQRTF64(Op, DAG);
  return LowerTrig(Op, DAG);
  return LowerSELECT(Op, DAG);
  return LowerFDIV(Op, DAG);
  return LowerFFREXP(Op, DAG);
  return LowerATOMIC_CMP_SWAP(Op, DAG);
  return LowerSTORE(Op, DAG);
  return LowerGlobalAddress(MFI, Op, DAG);
  return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  return LowerINTRINSIC_W_CHAIN(Op, DAG);
  return LowerINTRINSIC_VOID(Op, DAG);
  return lowerADDRSPACECAST(Op, DAG);
  return lowerINSERT_SUBVECTOR(Op, DAG);
  return lowerINSERT_VECTOR_ELT(Op, DAG);
  return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  return lowerVECTOR_SHUFFLE(Op, DAG);
  return lowerSCALAR_TO_VECTOR(Op, DAG);
  return lowerBUILD_VECTOR(Op, DAG);
  return lowerFP_ROUND(Op, DAG);
  return lowerTRAP(Op, DAG);
  return lowerDEBUGTRAP(Op, DAG);
  return lowerFMINNUM_FMAXNUM(Op, DAG);
  return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
  return lowerFMINIMUM_FMAXIMUM(Op, DAG);
  return lowerFLDEXP(Op, DAG);
  Op.getValueType() == MVT::i16 &&
      Op.getOperand(0).getValueType() == MVT::f32) {
  return lowerFCOPYSIGN(Op, DAG);
  return lowerMUL(Op, DAG);
  return lowerXMULO(Op, DAG);
  return lowerXMUL_LOHI(Op, DAG);
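// Editor's annotation (sketch, not original source): the returns above form
// the body of SITargetLowering::LowerOperation, which dispatches each node
// marked Custom to a dedicated lowering helper; the case labels are the ISD
// opcodes registered as Custom during constructor setup. A typical pair of
// entries in such a dispatch looks like:
//
//   case ISD::BRCOND:
//     return LowerBRCOND(Op, DAG);
//   case ISD::FP_ROUND:
//     return lowerFP_ROUND(Op, DAG);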
EVT FittingLoadVT = LoadVT;
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
  M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
                 : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
                          : AMDGPUISD::BUFFER_LOAD;
  return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
  return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
  M->getMemOperand(), DAG);
EVT VT = N->getValueType(0);
unsigned CondCode = N->getConstantOperandVal(3);
EVT CmpVT = LHS.getValueType();
if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
  unsigned PromoteOp =
EVT VT = N->getValueType(0);
unsigned CondCode = N->getConstantOperandVal(3);
if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
EVT VT = N->getValueType(0);
Exec = AMDGPU::EXEC_LO;
Exec = AMDGPU::EXEC;
EVT VT = N->getValueType(0);
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                    IID == Intrinsic::amdgcn_permlanex16;
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                     IID == Intrinsic::amdgcn_set_inactive_chain_arg;
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
    ST->hasDPALU_DPP() &&
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_update_dpp:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_mov_dpp8:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_permlane64:
std::reverse(Operands.begin(), Operands.end());
if (SDNode *GL = N->getGluedNode()) {
  GL = GL->getOperand(0).getNode();
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
    IID == Intrinsic::amdgcn_mov_dpp8 ||
    IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
  Src1 = N->getOperand(2);
  if (IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
    Src2 = N->getOperand(3);
if (ValSize == SplitSize) {
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
  if (IID == Intrinsic::amdgcn_writelane) {
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
  return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
if (ValSize % SplitSize != 0)
EVT VT = N->getValueType(0);
unsigned NumOperands = N->getNumOperands();
SDNode *GL = N->getGluedNode();
for (unsigned i = 0; i != NE; ++i) {
  for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
    SDValue Operand = N->getOperand(j);
    Operands[j] = Operand;
Operands[NumOperands - 1] =
if (SplitSize == 32) {
  return unrollLaneOp(LaneOp.getNode());
unsigned SubVecNumElt =
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
  if (IID == Intrinsic::amdgcn_writelane)
  IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
      ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
      : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
  EltIdx += SubVecNumElt;
if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
if (IID == Intrinsic::amdgcn_writelane)
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
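// Editor's annotation (summary): lowerLaneOp handles the read/write-lane,
// permlane, DPP and set_inactive intrinsics for arbitrarily wide values. A
// value that already matches SplitSize (32 bits, or 64 when DPALU DPP is
// available for update_dpp) is lowered directly; wider values are bitcast to
// a vector of SplitSize-wide elements, the lane op is applied per element
// (or unrolled via unrollLaneOp for the 32-bit case), and the pieces are
// reassembled into the original type.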
switch (N->getOpcode()) {
  unsigned IID = N->getConstantOperandVal(0);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
  case Intrinsic::amdgcn_cvt_pkrtz: {
    DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
      Opcode = AMDGPUISD::CVT_PK_U16_U32;
    EVT VT = N->getValueType(0);
  case Intrinsic::amdgcn_s_buffer_load: {
    if (!Subtarget->hasScalarSubwordLoads())
    EVT VT = Op.getValueType();
    assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
    if (!Offset->isDivergent()) {
    LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  case Intrinsic::amdgcn_dead: {
    for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
for (unsigned I = 0; I < Res.getNumOperands(); I++) {
  Results.push_back(Res.getOperand(I));
Results.push_back(Res.getValue(1));
EVT VT = N->getValueType(0);
EVT SelectVT = NewVT;
if (NewVT.bitsLT(MVT::i32)) {
  SelectVT = MVT::i32;
if (NewVT != SelectVT)
if (N->getValueType(0) != MVT::v2f16)
if (N->getValueType(0) != MVT::v2f16)
if (N->getValueType(0) != MVT::f16)
if (U.get() != Value)
if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  case Intrinsic::amdgcn_if:
    return AMDGPUISD::IF;
  case Intrinsic::amdgcn_else:
    return AMDGPUISD::ELSE;
  case Intrinsic::amdgcn_loop:
    return AMDGPUISD::LOOP;
  case Intrinsic::amdgcn_end_cf:
if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
SDNode *Intr = BRCOND.getOperand(1).getNode();
Intr = LHS.getNode();
assert(BR && "brcond missing unconditional branch user");
unsigned CFNode = isCFIntrinsic(Intr);
Ops.push_back(Target);
for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
MVT VT = Op.getSimpleValueType();
if (Op.getConstantOperandVal(0) != 0)
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
if (Info->isEntryFunction())
return Op.getValueType().bitsLE(VT)
EVT DstVT = Op.getValueType();
unsigned Opc = Op.getOpcode();
EVT SrcVT = Src.getValueType();
EVT DstVT = Op.getValueType();
assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
if (DstVT == MVT::f16) {
if (!Subtarget->has16BitInsts()) {
if (Op->getFlags().hasApproximateFuncs()) {
"custom lower FP_ROUND for f16 or bf16");
assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
EVT VT = Op.getValueType();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
bool IsIEEEMode = Info->getMode().IEEE;
if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
EVT VT = Op.getValueType();
assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
       !Subtarget->hasMinimum3Maximum3F16() &&
       Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
       "should not need to widen f16 minimum/maximum to v2f16");
DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
EVT VT = Op.getValueType();
EVT ExpVT = Exp.getValueType();
if (ExpVT == MVT::i16)
{Op.getOperand(0), Op.getOperand(1), TruncExp});
switch (Op->getOpcode()) {
DAGCombinerInfo &DCI) const {
const unsigned Opc = Op.getOpcode();
: Op->getOperand(0).getValueType();
if (DCI.isBeforeLegalizeOps() ||
auto &DAG = DCI.DAG;
LHS = Op->getOperand(1);
RHS = Op->getOperand(2);
LHS = Op->getOperand(0);
RHS = Op->getOperand(1);
if (MagVT == SignVT)
EVT VT = Op.getValueType();
assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
if (Op->isDivergent())
if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
  DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
if (Op0SignBits >= 33 && Op1SignBits >= 33)
  DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
EVT VT = Op.getValueType();
const APInt &C = RHSC->getAPIntValue();
if (C.isPowerOf2()) {
  bool UseArithShift = isSigned && !C.isMinSignedValue();
if (Op->isDivergent()) {
  if (Subtarget->hasSMulHi()) {
if (!Subtarget->isTrapHandlerEnabled() ||
  return lowerTrapEndpgm(Op, DAG);
return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                          : lowerTrapHsaQueuePtr(Op, DAG);
return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
  MachinePointerInfo PtrInfo =
loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
if (UserSGPR == AMDGPU::NoRegister) {
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
if (Subtarget->hasPrivEnabledTrap2NopBug())
  return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
if (!Subtarget->isTrapHandlerEnabled() ||
"debugtrap handler not supported",
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
  if (Subtarget->hasApertureRegs()) {
    ? AMDGPU::SRC_SHARED_BASE
    : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
  return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {
const AMDGPUTargetMachine &TM =
unsigned DestAS, SrcAS;
bool IsNonNull = false;
SrcAS = ASC->getSrcAddressSpace();
Src = ASC->getOperand(0);
DestAS = ASC->getDestAddressSpace();
Op.getConstantOperandVal(0) == Intrinsic::amdgcn_addrspacecast_nonnull);
Src = Op->getOperand(1);
SrcAS = Op->getConstantOperandVal(2);
DestAS = Op->getConstantOperandVal(3);
Subtarget->hasGloballyAddressableScratch()) {
AMDGPU::S_MOV_B32, SL, MVT::i32,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
unsigned NullVal = TM.getNullPointerValue(DestAS);
Subtarget->hasGloballyAddressableScratch()) {
if (Subtarget->isWave64())
57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
AMDGPU::S_MOV_B64, SL, MVT::i64,
DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
unsigned NullVal = TM.getNullPointerValue(SrcAS);
Op.getValueType() == MVT::i64) {
const SIMachineFunctionInfo *Info =
if (Info->get32BitAddressHighBits() == 0)
Src.getValueType() == MVT::i64)
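// Editor's annotation (sketch, not original source): lowerADDRSPACECAST
// handles LOCAL/PRIVATE <-> FLAT casts. A segment -> flat cast combines the
// 32-bit segment offset with the aperture base returned by
// getSegmentAperture (an aperture register, or an implicit kernel argument /
// queue-pointer field on older targets), and guards the segment null value:
//
//   hi   = getSegmentAperture(SrcAS)
//   flat = build_pair(src32, hi)
//   res  = (src32 == segment_null) ? flat_null : flat
//
// The reverse direction truncates to 32 bits and performs the matching null
// check against the flat null value.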
assert(InsNumElts % 2 == 0 && "expect legal vector types");
EVT NewInsVT = InsNumElts == 2 ? MVT::i32
MVT::i32, InsNumElts / 2);
for (unsigned I = 0; I != InsNumElts / 2; ++I) {
  if (InsNumElts == 2) {
for (unsigned I = 0; I != InsNumElts; ++I) {
if (NumElts == 4 && EltSize == 16 && KIdx) {
  unsigned Idx = KIdx->getZExtValue();
  bool InsertLo = Idx < 2;
  DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
  : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
EVT ResultVT = Op.getValueType();
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
  if (VecSize == 128) {
  } else if (VecSize == 256) {
    for (unsigned P = 0; P < 4; ++P) {
    Parts[0], Parts[1]));
    Parts[2], Parts[3]));
    for (unsigned P = 0; P < 8; ++P) {
    Parts[0], Parts[1], Parts[2], Parts[3]));
    Parts[4], Parts[5], Parts[6], Parts[7]));
Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
       !(Mask[Elt + 1] & 1);
EVT ResultVT = Op.getValueType();
const int NewSrcNumElts = 2;
int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
if (ShouldUseConsecutiveExtract &&
  int VecIdx = Idx < SrcNumElts ? 0 : 1;
  int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
if (Idx0 >= SrcNumElts) {
if (Idx1 >= SrcNumElts) {
int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
int NewMaskIdx0 = Idx0 - AlignedIdx0;
int NewMaskIdx1 = Idx1 - AlignedIdx1;
if (SubVec0 != SubVec1) {
  NewMaskIdx1 += NewSrcNumElts;
{NewMaskIdx0, NewMaskIdx1});
int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
EVT ResultVT = Op.getValueType();
EVT VT = Op.getValueType();
if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
  assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
for (unsigned P = 0; P < NumParts; ++P) {
  PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
8890 if (!Subtarget->isAmdHsaOS())
8933 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
8942 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
8950 EVT PtrVT =
Op.getValueType();
8952 const GlobalValue *GV = GSD->
getGlobal();
8966 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
8981 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
8984 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
8985 if (Subtarget->has64BitLiterals()) {
9016 MachinePointerInfo PtrInfo =
9044 SDValue Param = lowerKernargMemParameter(
9055 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
9063 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
9071 unsigned NumElts = Elts.
size();
9073 if (NumElts <= 12) {
9082 for (
unsigned i = 0; i < Elts.
size(); ++i) {
9088 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
9098 EVT SrcVT = Src.getValueType();
9119 bool Unpacked,
bool IsD16,
int DMaskPop,
9120 int NumVDataDwords,
bool IsAtomicPacked16Bit,
9124 EVT ReqRetVT = ResultTypes[0];
9126 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
9127 ? (ReqRetNumElts + 1) / 2
9130 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
9141 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
9152 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
9154 NumDataDwords - MaskPopDwords);
9159 EVT LegalReqRetVT = ReqRetVT;
9161 if (!
Data.getValueType().isInteger())
9163 Data.getValueType().changeTypeToInteger(),
Data);
9184 if (Result->getNumValues() == 1)
9191 SDValue *LWE,
bool &IsTexFail) {
9211 unsigned DimIdx,
unsigned EndIdx,
9212 unsigned NumGradients) {
9214 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
9222 if (((
I + 1) >= EndIdx) ||
9223 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
9224 I == DimIdx + NumGradients - 1))) {
9246 !
Op.getNode()->hasAnyUseOfValue(0))
9248 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
9258 ResultTypes.erase(&ResultTypes[0]);
9264 int NumVDataDwords = 0;
9265 bool AdjustRetType =
false;
9266 bool IsAtomicPacked16Bit =
false;
9269 const unsigned ArgOffset = WithChain ? 2 : 1;
9272 unsigned DMaskLanes = 0;
9274 if (BaseOpcode->
Atomic) {
9275 VData =
Op.getOperand(2);
9277 IsAtomicPacked16Bit =
9278 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
9279 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
9280 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
9281 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
9292 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
9294 DMask = Is64Bit ? 0xf : 0x3;
9295 NumVDataDwords = Is64Bit ? 4 : 2;
9297 DMask = Is64Bit ? 0x3 : 0x1;
9298 NumVDataDwords = Is64Bit ? 2 : 1;
9301 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
9304 if (BaseOpcode->
Store) {
9305 VData =
Op.getOperand(2);
9309 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9313 VData = handleD16VData(VData, DAG,
true);
9316 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
9317 }
else if (!BaseOpcode->
NoReturn) {
9322 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
9330 (!LoadVT.
isVector() && DMaskLanes > 1))
9336 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
9337 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
9338 NumVDataDwords = (DMaskLanes + 1) / 2;
9340 NumVDataDwords = DMaskLanes;
9342 AdjustRetType =
true;
9346 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
9353 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9354 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9356 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
9358 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
9359 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
9363 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
9369 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
9373 "Bias needs to be converted to 16 bit in A16 mode");
9378 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
9382 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
9383 "require 16 bit args for both gradients and addresses");
9388 if (!
ST->hasA16()) {
9389 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
9390 "support 16 bit addresses\n");
9400 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
9402 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
9404 IntrOpcode = G16MappingInfo->
G16;
9427 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
9445 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
9446 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
9447 const bool UseNSA =
ST->hasNSAEncoding() &&
9448 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
9449 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
9450 const bool UsePartialNSA =
9451 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
9454 if (UsePartialNSA) {
9456 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
9457 }
else if (!UseNSA) {
9467 uint64_t UnormConst =
9468 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
9470 Unorm = UnormConst ? True : False;
9476 bool IsTexFail =
false;
9477 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
9488 NumVDataDwords += 1;
9489 AdjustRetType =
true;
9494 if (AdjustRetType) {
9497 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
9506 MVT::i32, NumVDataDwords)
9509 ResultTypes[0] = NewVT;
9510 if (ResultTypes.size() == 3) {
9514 ResultTypes.erase(&ResultTypes[1]);
9528 Ops.push_back(VData);
9529 if (UsePartialNSA) {
9531 Ops.push_back(VAddr);
9535 Ops.push_back(VAddr);
9538 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
9540 Ops.push_back(Rsrc);
9545 Ops.push_back(Samp);
9550 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9551 Ops.push_back(Unorm);
9553 Ops.push_back(IsA16 &&
9554 ST->hasFeature(AMDGPU::FeatureR128A16)
9558 Ops.push_back(IsA16 ? True : False);
9560 if (!Subtarget->hasGFX90AInsts())
9565 "TFE is not supported on this GPU",
DL.getDebugLoc()));
9568 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
9571 Ops.push_back(DimInfo->
DA ? True : False);
9573 Ops.push_back(IsD16 ? True : False);
9575 Ops.push_back(
Op.getOperand(0));
9577 int NumVAddrDwords =
9583 NumVDataDwords, NumVAddrDwords);
9584 }
else if (IsGFX11Plus) {
9586 UseNSA ? AMDGPU::MIMGEncGfx11NSA
9587 : AMDGPU::MIMGEncGfx11Default,
9588 NumVDataDwords, NumVAddrDwords);
9589 }
else if (IsGFX10Plus) {
9591 UseNSA ? AMDGPU::MIMGEncGfx10NSA
9592 : AMDGPU::MIMGEncGfx10Default,
9593 NumVDataDwords, NumVAddrDwords);
9595 if (Subtarget->hasGFX90AInsts()) {
9597 NumVDataDwords, NumVAddrDwords);
9601 "requested image instruction is not supported on this GPU",
9606 for (EVT VT : OrigResultTypes) {
9607 if (VT == MVT::Other)
9608 RetValues[Idx++] =
Op.getOperand(0);
9619 NumVDataDwords, NumVAddrDwords);
9622 NumVDataDwords, NumVAddrDwords);
9629 MachineMemOperand *MemRef = MemOp->getMemOperand();
9648 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
9649 NumVDataDwords, IsAtomicPacked16Bit,
DL);
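// Editor's annotation (summary): lowerImage builds the MIMG node for an image
// intrinsic: it derives the DMask and the number of VData/VAddr dwords,
// packs 16-bit addresses and gradients (A16/G16) into pairs, decides between
// NSA and contiguous VAddr encodings based on subtarget limits, selects the
// per-generation MIMG opcode, and finally widens or truncates the returned
// value through constructRetValue to match the requested result type,
// including the optional TFE status dword.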
9662 MachinePointerInfo(),
9667 if (!
Offset->isDivergent()) {
9674 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9683 !Subtarget->hasScalarDwordx3Loads()) {
9687 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
9710 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
9712 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
9716 unsigned NumLoads = 1;
9722 if (NumElts == 8 || NumElts == 16) {
9723 NumLoads = NumElts / 4;
9727 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
9732 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
9734 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
9735 for (
unsigned i = 0; i < NumLoads; ++i) {
9737 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
9741 if (NumElts == 8 || NumElts == 16)
9749 if (!Subtarget->hasArchitectedSGPRs())
9754 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
9761 unsigned Width)
const {
9763 using namespace AMDGPU::Hwreg;
9765 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
9804 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
9806 EVT VT =
Op.getValueType();
9808 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
9812 switch (IntrinsicID) {
9813 case Intrinsic::amdgcn_implicit_buffer_ptr: {
9816 return getPreloadedValue(DAG, *MFI, VT,
9819 case Intrinsic::amdgcn_dispatch_ptr:
9820 case Intrinsic::amdgcn_queue_ptr: {
9821 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
9823 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
9828 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
9831 return getPreloadedValue(DAG, *MFI, VT, RegID);
9833 case Intrinsic::amdgcn_implicitarg_ptr: {
9835 return getImplicitArgPtr(DAG,
DL);
9836 return getPreloadedValue(DAG, *MFI, VT,
9839 case Intrinsic::amdgcn_kernarg_segment_ptr: {
9845 return getPreloadedValue(DAG, *MFI, VT,
9848 case Intrinsic::amdgcn_dispatch_id: {
9851 case Intrinsic::amdgcn_rcp:
9852 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
9853 case Intrinsic::amdgcn_rsq:
9854 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
9855 case Intrinsic::amdgcn_rsq_legacy:
9859 case Intrinsic::amdgcn_rcp_legacy:
9862 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
9863 case Intrinsic::amdgcn_rsq_clamp: {
9865 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
9877 case Intrinsic::r600_read_ngroups_x:
9878 if (Subtarget->isAmdHsaOS())
9881 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9884 case Intrinsic::r600_read_ngroups_y:
9885 if (Subtarget->isAmdHsaOS())
9888 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9891 case Intrinsic::r600_read_ngroups_z:
9892 if (Subtarget->isAmdHsaOS())
9895 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
9898 case Intrinsic::r600_read_local_size_x:
9899 if (Subtarget->isAmdHsaOS())
9902 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9904 case Intrinsic::r600_read_local_size_y:
9905 if (Subtarget->isAmdHsaOS())
9908 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9910 case Intrinsic::r600_read_local_size_z:
9911 if (Subtarget->isAmdHsaOS())
9914 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
9916 case Intrinsic::amdgcn_workgroup_id_x:
9917 return lowerWorkGroupId(DAG, *MFI, VT,
9921 case Intrinsic::amdgcn_workgroup_id_y:
9922 return lowerWorkGroupId(DAG, *MFI, VT,
9926 case Intrinsic::amdgcn_workgroup_id_z:
9927 return lowerWorkGroupId(DAG, *MFI, VT,
9931 case Intrinsic::amdgcn_cluster_id_x:
9932 return Subtarget->hasClusters()
9933 ? getPreloadedValue(DAG, *MFI, VT,
9935 : DAG.getPOISON(VT);
9936 case Intrinsic::amdgcn_cluster_id_y:
9937 return Subtarget->hasClusters()
9938 ? getPreloadedValue(DAG, *MFI, VT,
9941 case Intrinsic::amdgcn_cluster_id_z:
9942 return Subtarget->hasClusters()
9943 ? getPreloadedValue(DAG, *MFI, VT,
9946 case Intrinsic::amdgcn_cluster_workgroup_id_x:
9947 return Subtarget->hasClusters()
9948 ? getPreloadedValue(
9952 case Intrinsic::amdgcn_cluster_workgroup_id_y:
9953 return Subtarget->hasClusters()
9954 ? getPreloadedValue(
9958 case Intrinsic::amdgcn_cluster_workgroup_id_z:
9959 return Subtarget->hasClusters()
9960 ? getPreloadedValue(
9964 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
9965 return Subtarget->hasClusters()
9968 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
9969 return Subtarget->hasClusters()
9970 ? getPreloadedValue(
9974 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
9975 return Subtarget->hasClusters()
9976 ? getPreloadedValue(
9980 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
9981 return Subtarget->hasClusters()
9982 ? getPreloadedValue(
9986 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
9987 return Subtarget->hasClusters()
9988 ? getPreloadedValue(
9992 case Intrinsic::amdgcn_wave_id:
9993 return lowerWaveID(DAG,
Op);
9994 case Intrinsic::amdgcn_lds_kernel_id: {
9996 return getLDSKernelId(DAG,
DL);
9997 return getPreloadedValue(DAG, *MFI, VT,
10000 case Intrinsic::amdgcn_workitem_id_x:
10001 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
10002 case Intrinsic::amdgcn_workitem_id_y:
10003 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
10004 case Intrinsic::amdgcn_workitem_id_z:
10005 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
10006 case Intrinsic::amdgcn_wavefrontsize:
10008 SDLoc(
Op), MVT::i32);
10009 case Intrinsic::amdgcn_s_buffer_load: {
10010 unsigned CPol =
Op.getConstantOperandVal(3);
10017 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
10018 Op.getOperand(3), DAG);
10020 case Intrinsic::amdgcn_fdiv_fast:
10021 return lowerFDIV_FAST(
Op, DAG);
10022 case Intrinsic::amdgcn_sin:
10023 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
10025 case Intrinsic::amdgcn_cos:
10026 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
10028 case Intrinsic::amdgcn_mul_u24:
10029 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
10031 case Intrinsic::amdgcn_mul_i24:
10032 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
10035 case Intrinsic::amdgcn_log_clamp: {
10041 case Intrinsic::amdgcn_fract:
10042 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
10044 case Intrinsic::amdgcn_class:
10045 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
10047 case Intrinsic::amdgcn_div_fmas:
10048 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
10049 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10051 case Intrinsic::amdgcn_div_fixup:
10052 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
10053 Op.getOperand(2),
Op.getOperand(3));
10055 case Intrinsic::amdgcn_div_scale: {
10061 SDValue Denominator =
Op.getOperand(2);
10068 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
10070 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
10071 Denominator, Numerator);
10073 case Intrinsic::amdgcn_icmp: {
10075 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
10076 Op.getConstantOperandVal(2) == 0 &&
10081 case Intrinsic::amdgcn_fcmp: {
10084 case Intrinsic::amdgcn_ballot:
10086 case Intrinsic::amdgcn_fmed3:
10087 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
10088 Op.getOperand(2),
Op.getOperand(3));
10089 case Intrinsic::amdgcn_fdot2:
10090 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
10091 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
10092 case Intrinsic::amdgcn_fmul_legacy:
10093 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
10095 case Intrinsic::amdgcn_sffbh:
10096 return DAG.
getNode(AMDGPUISD::FFBH_I32,
DL, VT,
Op.getOperand(1));
10097 case Intrinsic::amdgcn_sbfe:
10098 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
10099 Op.getOperand(2),
Op.getOperand(3));
10100 case Intrinsic::amdgcn_ubfe:
10101 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
10102 Op.getOperand(2),
Op.getOperand(3));
10103 case Intrinsic::amdgcn_cvt_pkrtz:
10104 case Intrinsic::amdgcn_cvt_pknorm_i16:
10105 case Intrinsic::amdgcn_cvt_pknorm_u16:
10106 case Intrinsic::amdgcn_cvt_pk_i16:
10107 case Intrinsic::amdgcn_cvt_pk_u16: {
10109 EVT VT =
Op.getValueType();
10112 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10113 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10114 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
10115 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
10116 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
10117 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
10118 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
10119 Opcode = AMDGPUISD::CVT_PK_I16_I32;
10121 Opcode = AMDGPUISD::CVT_PK_U16_U32;
10124 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
10127 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
10130 case Intrinsic::amdgcn_fmad_ftz:
10131 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
10132 Op.getOperand(2),
Op.getOperand(3));
10134 case Intrinsic::amdgcn_if_break:
10136 Op->getOperand(1),
Op->getOperand(2)),
10139 case Intrinsic::amdgcn_groupstaticsize: {
10145 const GlobalValue *GV =
10151 case Intrinsic::amdgcn_is_shared:
10152 case Intrinsic::amdgcn_is_private: {
10159 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
10163 Subtarget->hasGloballyAddressableScratch()) {
10166 AMDGPU::S_MOV_B32,
DL, MVT::i32,
10167 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
10176 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
10179 case Intrinsic::amdgcn_perm:
10180 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
10181 Op.getOperand(2),
Op.getOperand(3));
10182 case Intrinsic::amdgcn_reloc_constant: {
10192 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
10193 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
10194 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
10195 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
10196 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
10197 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
10198 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
10199 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
10200 if (
Op.getOperand(4).getValueType() == MVT::i32)
10206 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
10207 Op.getOperand(3), IndexKeyi32);
10209 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
10210 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
10211 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
10212 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
10213 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
10214 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
10215 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
10216 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
10217 if (
Op.getOperand(4).getValueType() == MVT::i64)
10223 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10224 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
10225 Op.getOperand(6)});
10227 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
10228 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
10229 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
10230 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
10231 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
10232 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
10233 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
10236 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
10242 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10243 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10244 IndexKey, Op.getOperand(7),
10245 Op.getOperand(8)});
10247 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
10248 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
10249 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
10250 if (
Op.getOperand(6).getValueType() == MVT::i32)
10256 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
10257 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
10258 IndexKeyi32, Op.getOperand(7)});
10260 case Intrinsic::amdgcn_addrspacecast_nonnull:
10261 return lowerADDRSPACECAST(
Op, DAG);
10262 case Intrinsic::amdgcn_readlane:
10263 case Intrinsic::amdgcn_readfirstlane:
10264 case Intrinsic::amdgcn_writelane:
10265 case Intrinsic::amdgcn_permlane16:
10266 case Intrinsic::amdgcn_permlanex16:
10267 case Intrinsic::amdgcn_permlane64:
10268 case Intrinsic::amdgcn_set_inactive:
10269 case Intrinsic::amdgcn_set_inactive_chain_arg:
10270 case Intrinsic::amdgcn_mov_dpp8:
10271 case Intrinsic::amdgcn_update_dpp:
10273 case Intrinsic::amdgcn_dead: {
10275 for (
const EVT ValTy :
Op.getNode()->values())
10280 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10282 return lowerImage(
Op, ImageDimIntr, DAG,
false);
10293 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
10299 unsigned NewOpcode)
const {
10303 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10304 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10322 M->getMemOperand());
10327 unsigned NewOpcode)
const {
10331 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
10332 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
10350 M->getMemOperand());
10355 unsigned IntrID =
Op.getConstantOperandVal(1);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
  unsigned IndexOperand = M->getConstantOperandVal(7);
  unsigned WaveRelease = M->getConstantOperandVal(8);
  unsigned WaveDone = M->getConstantOperandVal(9);
  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;
  CountDw = (IndexOperand >> 24) & 0xf;
  IndexOperand &= ~(0xf << 24);
  if (CountDw < 1 || CountDw > 4) {
    Fn, "ds_ordered_count: dword count must be between 1 and 4",
    DL.getDebugLoc()));
  if (IndexOperand) {
    Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
  if (WaveDone && !WaveRelease) {
    Fn, "ds_ordered_count: wave_done requires wave_release",
    DL.getDebugLoc()));
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType =
  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
  Offset1 |= (CountDw - 1) << 6;
  Offset1 |= ShaderType << 2;
  unsigned Offset = Offset0 | (Offset1 << 8);
  M->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand());
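// Editor's annotation (summary): the ds_ordered_count offset field packs the
// intrinsic's operands into a single immediate. Offset0 carries the ordered
// count index (shifted by 2), while Offset1 packs wave_release (bit 0),
// wave_done (bit 1), the shader type (bits 2-3), the add-vs-swap selection
// (bit 4) and, where supported, the dword count minus one (bits 6-7); the
// final immediate is Offset0 | (Offset1 << 8).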
10422 case Intrinsic::amdgcn_raw_buffer_load:
10423 case Intrinsic::amdgcn_raw_ptr_buffer_load:
10424 case Intrinsic::amdgcn_raw_atomic_buffer_load:
10425 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
10426 case Intrinsic::amdgcn_raw_buffer_load_format:
10427 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
10428 const bool IsFormat =
10429 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
10430 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
10432 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10433 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10447 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
10449 case Intrinsic::amdgcn_struct_buffer_load:
10450 case Intrinsic::amdgcn_struct_ptr_buffer_load:
10451 case Intrinsic::amdgcn_struct_buffer_load_format:
10452 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
10453 case Intrinsic::amdgcn_struct_atomic_buffer_load:
10454 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
10455 const bool IsFormat =
10456 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
10457 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
10459 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10460 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10475 case Intrinsic::amdgcn_raw_tbuffer_load:
10476 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
10478 EVT LoadVT =
Op.getValueType();
10479 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10480 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
10496 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10498 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10499 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10502 case Intrinsic::amdgcn_struct_tbuffer_load:
10503 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
10505 EVT LoadVT =
Op.getValueType();
10506 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
10507 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
10523 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
10525 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
10526 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
10529 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
10530 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
10531 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
10532 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
10533 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
10534 return lowerStructBufferAtomicIntrin(
Op, DAG,
10535 AMDGPUISD::BUFFER_ATOMIC_FADD);
10536 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
10537 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
10538 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
10539 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
10540 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
10541 return lowerStructBufferAtomicIntrin(
Op, DAG,
10542 AMDGPUISD::BUFFER_ATOMIC_FMIN);
10543 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
10544 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
10545 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
10546 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
10547 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
10548 return lowerStructBufferAtomicIntrin(
Op, DAG,
10549 AMDGPUISD::BUFFER_ATOMIC_FMAX);
10550 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
10551 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
10552 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
10553 case Intrinsic::amdgcn_raw_buffer_atomic_add:
10554 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
10555 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
10556 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
10557 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
10558 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
10559 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
10560 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
10561 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
10562 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
10563 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
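  // Every raw_* / struct_* buffer atomic intrinsic above lowers to the
  // matching AMDGPUISD::BUFFER_ATOMIC_* memory node through the shared
  // lowerRawBufferAtomicIntrin / lowerStructBufferAtomicIntrin helpers; only
  // the node opcode differs. Illustrative IR that reaches this path (sketch;
  // the exact operand list depends on the intrinsic variant):
  //   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(
  //              i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)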
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VT = Op.getValueType();
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
    EVT VT = Op.getValueType();
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
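  // amdgcn_image_bvh_dual_intersect_ray / amdgcn_image_bvh8_intersect_ray are
  // selected straight to the GFX12 IMAGE_BVH_DUAL / IMAGE_BVH8 MIMG opcodes
  // (guarded by hasBVHDualAndBVH8Insts()): the node pointer, the ray extent
  // packed together with the instance mask into one dword, the origin, the
  // direction, the offsets and the texture descriptor are flattened into the
  // VADDR operand list below.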
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue InstanceMask = M->getOperand(4);
    SDValue RayOrigin = M->getOperand(5);
    SDValue RayDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(8);

    if (!Subtarget->hasBVHDualAndBVH8Insts()) {

    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
        IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
               : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

    Ops.push_back(NodePtr);
        {DAG.getBitcast(MVT::i32, RayExtent),
         DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
    Ops.push_back(RayOrigin);
    Ops.push_back(RayDir);
    Ops.push_back(Offsets);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());

    MachineMemOperand *MemRef = M->getMemOperand();
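  // amdgcn_image_bvh_intersect_ray (the pre-GFX12 form below) picks its MIMG
  // opcode from BaseOpcodes[Is64][IsA16]: 32- vs 64-bit node pointer and
  // full- vs 16-bit ray coordinates, encoded with the GFX10/GFX11 NSA
  // encoding when UseNSA holds and the default encoding otherwise.
  // packLanes() flattens the origin / direction / inverse-direction vectors
  // into the VADDR list, padding the operand list to 16 dwords with undef
  // when more than 12 VADDR dwords are required.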
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(7);

    if (!Subtarget->hasGFX10_AEncoding()) {

    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
          IsGFX12Plus ? AMDGPU::MIMGEncGfx12
          : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                      : AMDGPU::MIMGEncGfx10NSA,
          NumVDataDwords, NumVAddrDwords);
          IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                  : AMDGPU::MIMGEncGfx10Default,
          NumVDataDwords, NumVAddrDwords);

    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
        Ops.push_back(Lanes[2]);

    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      Ops.push_back(RayOrigin);
        for (unsigned I = 0; I < 3; ++I) {
                                     {DirLanes[I], InvDirLanes[I]})));
        Ops.push_back(RayDir);
        Ops.push_back(RayInvDir);
      Ops.push_back(NodePtr);
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);

      if (NumVAddrDwords > 12) {
        Ops.append(16 - Ops.size(), Undef);
        Ops.push_back(MergedOps);

    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());

    MachineMemOperand *MemRef = M->getMemOperand();
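  // amdgcn_{global,flat}_atomic_fmin_num / fmax_num lower through the
  // DAG.getAtomic call below; the inner switch only chooses the atomic opcode
  // for the fmin vs fmax flavour.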
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);

    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
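// getMemIntrinsicNode widens the requested result type to a whole number of
// dwords (when a TFE status result is present, i.e. VTList.NumVTs == 3, one
// extra dword is added), emits the memory intrinsic with the widened type,
// and then narrows back to the original VT; v3i32/v3f32 results are
// additionally round-tripped through a wider type on subtargets without
// dwordx3 load/store support.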
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                             WidenedMemVT, WidenedMMO);

                                              bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
      for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      if ((NumElements % 2) == 1) {
        unsigned I = Elts.size() / 2;
  if (NumElements == 3) {
  unsigned IntrinsicID = Op.getConstantOperandVal(1);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
                                   M->getMemoryVT(), M->getMemOperand());
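  // The plain buffer stores below follow the same recipe as the tbuffer
  // stores above: 16-bit payloads are repacked with handleD16VData, the
  // resource operand is normalized with bufferRsrcPtrToVector, the combined
  // offset is split with splitBufferOffsets, and byte/short stores are
  // diverted to handleByteShortBufferStores; everything else becomes an
  // AMDGPUISD::BUFFER_STORE[_FORMAT][_D16] memory node.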
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
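  // The *_buffer_load_lds intrinsics select BUFFER_LOAD_*_LDS machine
  // instructions directly. The opcode is chosen by the transfer size
  // (1/2/4/12/16 bytes; the 96/128-bit forms require hasLDSLoadB96_B128())
  // and by which of the VGPR index / VGPR offset operands are present
  // (BOTHEN / IDXEN / OFFEN / OFFSET addressing modes).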
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;

    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);

    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
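  // amdgcn_load_to_lds / amdgcn_global_load_lds use the GLOBAL_LOAD_LDS_*
  // opcodes in the same way; when the address is a uniform base plus a
  // divergent 32-bit part, the divergent part is peeled off into a separate
  // voffset operand so the base can stay in scalar registers.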
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;

    if (LHS->isDivergent())
        RHS.getOperand(0).getValueType() == MVT::i32) {
      VOffset = RHS.getOperand(0);
      Ops.push_back(Addr);
      Ops.push_back(VOffset);

    Ops.push_back(Op.getOperand(5));
    unsigned Aux = Op.getConstantOperandVal(6);

    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                                      Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!Subtarget->hasSWakeupBarrier())
  case Intrinsic::amdgcn_s_barrier_join: {
      switch (IntrinsicID) {
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      switch (IntrinsicID) {
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
      Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);

  return PtrVT == MVT::i64;
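// splitBufferOffsets / setBufferOffsets (below) split a combined buffer
// offset into the pieces MUBUF can encode: a register soffset/voffset part
// plus a small immediate. Whatever does not fit the immediate field
// ("Overflow") is moved back into the register operand via
// TII->splitMUBUFOffset. Illustrative example (sketch, assuming the usual
// 12-bit immediate field): a constant combined offset of 0x1234 could be
// encoded as a register part of 0x1000 plus an immediate of 0x234.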
11633std::pair<SDValue, SDValue>
11663 unsigned Overflow = ImmOffset & ~MaxImm;
11664 ImmOffset -= Overflow;
11665 if ((int32_t)Overflow < 0) {
11666 Overflow += ImmOffset;
11671 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
11690void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
11692 Align Alignment)
const {
11694 SDLoc
DL(CombinedOffset);
11696 uint32_t
Imm =
C->getZExtValue();
11697 uint32_t SOffset, ImmOffset;
11698 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
11708 uint32_t SOffset, ImmOffset;
11711 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
11719 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
11728SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
11731 return MaybePointer;
11745 SDValue NumRecords =
Op->getOperand(3);
11751 if (Subtarget->has45BitNumRecordsBufferResource()) {
11770 SDValue ExtShiftedStrideVec =
11782 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
11784 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
11789 auto [LowHalf, HighHalf] =
11790 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
11800 NumRecords, Flags);
11812 bool IsTFE)
const {
11817 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
11818 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
11821 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
11833 ? AMDGPUISD::BUFFER_LOAD_UBYTE
11834 : AMDGPUISD::BUFFER_LOAD_USHORT;
11836 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
11850 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
11854 Ops[1] = BufferStoreExt;
11855 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
11856 : AMDGPUISD::BUFFER_STORE_SHORT;
11859 M->getMemOperand());
11884 DAGCombinerInfo &DCI)
const {
11885 SelectionDAG &DAG = DCI.DAG;
11900 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
11907 "unexpected vector extload");
11920 "unexpected fp extload");
11938 DCI.AddToWorklist(Cvt.
getNode());
11943 DCI.AddToWorklist(Cvt.
getNode());
11954 if (
Info.isEntryFunction())
11955 return Info.getUserSGPRInfo().hasFlatScratchInit();
11963 EVT MemVT =
Load->getMemoryVT();
11964 MachineMemOperand *MMO =
Load->getMemOperand();
11976 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12004 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
12005 "Custom lowering for non-i32 vectors hasn't been implemented.");
12008 unsigned AS =
Load->getAddressSpace();
12015 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12019 !Subtarget->hasMultiDwordFlatScratchAddressing())
12029 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
12032 Alignment >=
Align(4) && NumElements < 32) {
12034 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12046 if (NumElements > 4)
12049 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12059 switch (Subtarget->getMaxPrivateElementSize()) {
12065 if (NumElements > 2)
12070 if (NumElements > 4)
12073 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12082 auto Flags =
Load->getMemOperand()->getFlags();
12084 Load->getAlign(), Flags, &
Fast) &&
12093 MemVT, *
Load->getMemOperand())) {
12102 EVT VT =
Op.getValueType();
12139 EVT VT =
Op.getValueType();
12140 const SDNodeFlags
Flags =
Op->getFlags();
12142 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
12148 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
12151 if (CLHS->isExactlyValue(1.0)) {
12164 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
12168 if (CLHS->isExactlyValue(-1.0)) {
12171 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
12177 if (!AllowInaccurateRcp &&
12178 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
12192 EVT VT =
Op.getValueType();
12193 const SDNodeFlags
Flags =
Op->getFlags();
12195 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
12196 if (!AllowInaccurateDiv)
12217 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
12227 Opcode = AMDGPUISD::FMUL_W_CHAIN;
12231 return DAG.
getNode(Opcode, SL, VTList,
12240 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
12250 Opcode = AMDGPUISD::FMA_W_CHAIN;
12254 return DAG.
getNode(Opcode, SL, VTList,
12260 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12261 return FastLowered;
12264 EVT VT =
Op.getValueType();
12271 if (VT == MVT::bf16) {
12294 unsigned FMADOpCode =
12298 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
12301 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12303 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
12304 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
12314 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
12320 SDNodeFlags
Flags =
Op->getFlags();
12330 const APFloat K0Val(0x1p+96f);
12333 const APFloat K1Val(0x1p-32f);
12360 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
12361 uint32_t DPDenormModeDefault =
Info->getMode().fpDenormModeDPValue();
12362 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
12367 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
12368 return FastLowered;
12374 SDNodeFlags
Flags =
Op->getFlags();
12375 Flags.setNoFPExcept(
true);
12383 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
12392 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
12396 using namespace AMDGPU::Hwreg;
12397 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
12401 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
12402 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
12405 const bool HasDynamicDenormals =
12411 if (!PreservesDenormals) {
12416 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12419 if (HasDynamicDenormals) {
12423 SavedDenormMode =
SDValue(GetReg, 0);
12429 SDNode *EnableDenorm;
12430 if (Subtarget->hasDenormModeInst()) {
12431 const SDValue EnableDenormValue =
12434 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
12438 const SDValue EnableDenormValue =
12440 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
12441 {EnableDenormValue,
BitField, Glue});
12451 ApproxRcp, One, NegDivScale0, Flags);
12454 ApproxRcp, Fma0, Flags);
12460 NumeratorScaled,
Mul, Flags);
12466 NumeratorScaled, Fma3, Flags);
12468 if (!PreservesDenormals) {
12469 SDNode *DisableDenorm;
12470 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
12474 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
12476 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
12480 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
12481 const SDValue DisableDenormValue =
12482 HasDynamicDenormals
12487 AMDGPU::S_SETREG_B32, SL, MVT::Other,
                             {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS,
                     Flags);
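// f64 division below mirrors the f32 path above: DIV_SCALE produces scaled
// numerator/denominator, RCP seeds an FMA-based refinement of the reciprocal,
// DIV_FMAS applies the final fused steps under the scale condition, and
// DIV_FIXUP restores the result for the special inputs (zeros, infinities,
// NaNs).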
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {

      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);

  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
12591 EVT ResultExpVT =
Op->getValueType(1);
12592 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
12602 if (Subtarget->hasFractBug()) {
12620 EVT VT =
Store->getMemoryVT();
12622 if (VT == MVT::i1) {
12626 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
12630 Store->getValue().getValueType().getScalarType() == MVT::i32);
12632 unsigned AS =
Store->getAddressSpace();
12640 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
12644 !Subtarget->hasMultiDwordFlatScratchAddressing())
12651 if (NumElements > 4)
12654 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
12658 VT, *
Store->getMemOperand()))
12664 switch (Subtarget->getMaxPrivateElementSize()) {
12668 if (NumElements > 2)
12672 if (NumElements > 4 ||
12673 (NumElements == 3 && !Subtarget->enableFlatScratch()))
12681 auto Flags =
Store->getMemOperand()->getFlags();
12700 assert(!Subtarget->has16BitInsts());
12701 SDNodeFlags
Flags =
Op->getFlags();
12715 SDNodeFlags
Flags =
Op->getFlags();
12716 MVT VT =
Op.getValueType().getSimpleVT();
12824 SDNodeFlags
Flags =
Op->getFlags();
12887 EVT VT =
Op.getValueType();
12897 if (Subtarget->hasTrigReducedRange()) {
12899 TrigVal = DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags);
12904 switch (
Op.getOpcode()) {
12906 return DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
12908 return DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
12931 EVT VT =
Op.getValueType();
12939 Op->getVTList(),
Ops, VT,
12948SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
12949 DAGCombinerInfo &DCI)
const {
12950 EVT VT =
N->getValueType(0);
12952 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
12955 SelectionDAG &DAG = DCI.DAG;
12959 EVT SrcVT = Src.getValueType();
12965 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
12968 DCI.AddToWorklist(Cvt.
getNode());
12971 if (ScalarVT != MVT::f32) {
12983 DAGCombinerInfo &DCI)
const {
12994 SelectionDAG &DAG = DCI.DAG;
13013 for (
unsigned I = 0;
I != NumElts; ++
I) {
13037 if (NewElts.
size() == 1)
13059 for (
unsigned I = 0;
I != NumElts; ++
I) {
13094SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
13096 DAGCombinerInfo &DCI)
const {
13113 SelectionDAG &DAG = DCI.DAG;
13126 AM.BaseOffs =
Offset.getSExtValue();
13131 EVT VT =
N->getValueType(0);
13137 Flags.setNoUnsignedWrap(
13138 N->getFlags().hasNoUnsignedWrap() &&
13150 switch (
N->getOpcode()) {
13161 DAGCombinerInfo &DCI)
const {
13162 SelectionDAG &DAG = DCI.DAG;
13169 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
13170 N->getMemoryVT(), DCI);
13174 NewOps[PtrIdx] = NewPtr;
13183 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
13184 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
13193SDValue SITargetLowering::splitBinaryBitConstantOp(
13197 uint32_t ValLo =
Lo_32(Val);
13198 uint32_t ValHi =
Hi_32(Val);
13205 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
13219 if (V.getValueType() != MVT::i1)
13221 switch (V.getOpcode()) {
13226 case AMDGPUISD::FP_CLASS:
13238 return V.getResNo() == 1;
13240 unsigned IntrinsicID = V.getConstantOperandVal(0);
13241 switch (IntrinsicID) {
13242 case Intrinsic::amdgcn_is_shared:
13243 case Intrinsic::amdgcn_is_private:
13260 if (!(
C & 0x000000ff))
13261 ZeroByteMask |= 0x000000ff;
13262 if (!(
C & 0x0000ff00))
13263 ZeroByteMask |= 0x0000ff00;
13264 if (!(
C & 0x00ff0000))
13265 ZeroByteMask |= 0x00ff0000;
13266 if (!(
C & 0xff000000))
13267 ZeroByteMask |= 0xff000000;
13268 uint32_t NonZeroByteMask = ~ZeroByteMask;
13269 if ((NonZeroByteMask &
C) != NonZeroByteMask)
13282 assert(V.getValueSizeInBits() == 32);
13284 if (V.getNumOperands() != 2)
13293 switch (V.getOpcode()) {
13298 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
13303 return (0x03020100 & ~ConstMask) | ConstMask;
13310 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
13316 return uint32_t(0x0c0c0c0c03020100ull >>
C);
13323 DAGCombinerInfo &DCI)
const {
13324 if (DCI.isBeforeLegalize())
13327 SelectionDAG &DAG = DCI.DAG;
13328 EVT VT =
N->getValueType(0);
13333 if (VT == MVT::i64 && CRHS) {
13335 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
13339 if (CRHS && VT == MVT::i32) {
13349 unsigned Shift = CShift->getZExtValue();
13351 unsigned Offset = NB + Shift;
13352 if ((
Offset & (Bits - 1)) == 0) {
13355 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
13376 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
13378 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
13391 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
13396 if (
X !=
LHS.getOperand(1))
13400 const ConstantFPSDNode *C1 =
13417 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
13423 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
13426 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
13434 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
13435 LHS.getOperand(0) ==
LHS.getOperand(1))) {
13437 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
13438 :
Mask->getZExtValue() & OrdMask;
13441 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
13459 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
13462 if (LHSMask != ~0u && RHSMask != ~0u) {
13465 if (LHSMask > RHSMask) {
13472 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13473 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
13476 if (!(LHSUsedLanes & RHSUsedLanes) &&
13479 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
13485 uint32_t
Mask = LHSMask & RHSMask;
13486 for (
unsigned I = 0;
I < 32;
I += 8) {
13487 uint32_t ByteSel = 0xff <<
I;
13488 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
13489 Mask &= (0x0c <<
I) & 0xffffffff;
13494 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
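// calculateSrcByte / calculateByteProvider (below) trace, for each byte of a
// 32-bit value, which source operand and byte offset provides it, looking
// through shifts, extensions, masks, extract_vector_elt and existing PERM
// nodes. The or-combine uses that information to rebuild the whole expression
// as a single V_PERM_B32; selector byte 0x0c denotes a constant zero byte.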
13547static const std::optional<ByteProvider<SDValue>>
13549 unsigned Depth = 0) {
13552 return std::nullopt;
13554 if (
Op.getValueSizeInBits() < 8)
13555 return std::nullopt;
13557 if (
Op.getValueType().isVector())
13560 switch (
Op->getOpcode()) {
13572 NarrowVT = VTSign->getVT();
13575 return std::nullopt;
13578 if (SrcIndex >= NarrowByteWidth)
13579 return std::nullopt;
13587 return std::nullopt;
13589 uint64_t BitShift = ShiftOp->getZExtValue();
13591 if (BitShift % 8 != 0)
13592 return std::nullopt;
13594 SrcIndex += BitShift / 8;
13612static const std::optional<ByteProvider<SDValue>>
13614 unsigned StartingIndex = 0) {
13618 return std::nullopt;
13620 unsigned BitWidth =
Op.getScalarValueSizeInBits();
13622 return std::nullopt;
13624 return std::nullopt;
13626 bool IsVec =
Op.getValueType().isVector();
13627 switch (
Op.getOpcode()) {
13630 return std::nullopt;
13635 return std::nullopt;
13639 return std::nullopt;
13642 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
13643 return std::nullopt;
13644 if (!
LHS ||
LHS->isConstantZero())
13646 if (!
RHS ||
RHS->isConstantZero())
13648 return std::nullopt;
13653 return std::nullopt;
13657 return std::nullopt;
13659 uint32_t BitMask = BitMaskOp->getZExtValue();
13661 uint32_t IndexMask = 0xFF << (Index * 8);
13663 if ((IndexMask & BitMask) != IndexMask) {
13666 if (IndexMask & BitMask)
13667 return std::nullopt;
13676 return std::nullopt;
13680 if (!ShiftOp ||
Op.getValueType().isVector())
13681 return std::nullopt;
13683 uint64_t BitsProvided =
Op.getValueSizeInBits();
13684 if (BitsProvided % 8 != 0)
13685 return std::nullopt;
13687 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
13689 return std::nullopt;
13691 uint64_t ConcatSizeInBytes = BitsProvided / 4;
13692 uint64_t ByteShift = BitShift / 8;
13694 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
13695 uint64_t BytesProvided = BitsProvided / 8;
13696 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
13697 NewIndex %= BytesProvided;
13704 return std::nullopt;
13708 return std::nullopt;
13710 uint64_t BitShift = ShiftOp->getZExtValue();
13712 return std::nullopt;
13714 auto BitsProvided =
Op.getScalarValueSizeInBits();
13715 if (BitsProvided % 8 != 0)
13716 return std::nullopt;
13718 uint64_t BytesProvided = BitsProvided / 8;
13719 uint64_t ByteShift = BitShift / 8;
13724 return BytesProvided - ByteShift > Index
13732 return std::nullopt;
13736 return std::nullopt;
13738 uint64_t BitShift = ShiftOp->getZExtValue();
13739 if (BitShift % 8 != 0)
13740 return std::nullopt;
13741 uint64_t ByteShift = BitShift / 8;
13747 return Index < ByteShift
13750 Depth + 1, StartingIndex);
13759 return std::nullopt;
13767 NarrowBitWidth = VTSign->getVT().getSizeInBits();
13769 if (NarrowBitWidth % 8 != 0)
13770 return std::nullopt;
13771 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13773 if (Index >= NarrowByteWidth)
13775 ? std::optional<ByteProvider<SDValue>>(
13783 return std::nullopt;
13787 if (NarrowByteWidth >= Index) {
13792 return std::nullopt;
13799 return std::nullopt;
13805 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
13806 if (NarrowBitWidth % 8 != 0)
13807 return std::nullopt;
13808 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
13813 if (Index >= NarrowByteWidth) {
13815 ? std::optional<ByteProvider<SDValue>>(
13820 if (NarrowByteWidth > Index) {
13824 return std::nullopt;
13829 return std::nullopt;
13832 Depth + 1, StartingIndex);
13838 return std::nullopt;
13839 auto VecIdx = IdxOp->getZExtValue();
13840 auto ScalarSize =
Op.getScalarValueSizeInBits();
13841 if (ScalarSize < 32)
13842 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
13844 StartingIndex, Index);
13847 case AMDGPUISD::PERM: {
13849 return std::nullopt;
13853 return std::nullopt;
13856 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
13857 if (IdxMask > 0x07 && IdxMask != 0x0c)
13858 return std::nullopt;
13860 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
13861 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
13863 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
13869 return std::nullopt;
13884 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
13891 auto MemVT = L->getMemoryVT();
13894 return L->getMemoryVT().getSizeInBits() == 16;
13904 int Low8 = Mask & 0xff;
13905 int Hi8 = (Mask & 0xff00) >> 8;
13907 assert(Low8 < 8 && Hi8 < 8);
13909 bool IsConsecutive = (Hi8 - Low8 == 1);
13914 bool Is16Aligned = !(Low8 % 2);
13916 return IsConsecutive && Is16Aligned;
13924 int Low16 = PermMask & 0xffff;
13925 int Hi16 = (PermMask & 0xffff0000) >> 16;
13935 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
13937 if (!OtherOpIs16Bit)
13945 unsigned DWordOffset) {
13950 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
13955 if (Src.getValueType().isVector()) {
13956 auto ScalarTySize = Src.getScalarValueSizeInBits();
13957 auto ScalarTy = Src.getValueType().getScalarType();
13958 if (ScalarTySize == 32) {
13962 if (ScalarTySize > 32) {
13965 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
13966 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
13973 assert(ScalarTySize < 32);
13974 auto NumElements =
TypeSize / ScalarTySize;
13975 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
13976 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
13977 auto NumElementsIn32 = 32 / ScalarTySize;
13978 auto NumAvailElements = DWordOffset < Trunc32Elements
13980 : NumElements - NormalizedTrunc;
13993 auto ShiftVal = 32 * DWordOffset;
14001 [[maybe_unused]]
EVT VT =
N->getValueType(0);
14006 for (
int i = 0; i < 4; i++) {
14008 std::optional<ByteProvider<SDValue>>
P =
14011 if (!
P ||
P->isConstantZero())
14016 if (PermNodes.
size() != 4)
14019 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
14020 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
14022 for (
size_t i = 0; i < PermNodes.
size(); i++) {
14023 auto PermOp = PermNodes[i];
14026 int SrcByteAdjust = 4;
14030 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
14031 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
14033 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
14034 ((PermOp.SrcOffset / 4) != SecondSrc->second))
14038 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
14039 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
14042 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
14044 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
14047 SDValue Op = *PermNodes[FirstSrc.first].Src;
14049 assert(
Op.getValueSizeInBits() == 32);
14053 int Low16 = PermMask & 0xffff;
14054 int Hi16 = (PermMask & 0xffff0000) >> 16;
14056 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
14057 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
14060 if (WellFormedLow && WellFormedHi)
14064 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
14073 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
14074 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
14079 assert(
Op.getValueType().isByteSized() &&
14090 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
14097 DAGCombinerInfo &DCI)
const {
14098 SelectionDAG &DAG = DCI.DAG;
14102 EVT VT =
N->getValueType(0);
14103 if (VT == MVT::i1) {
14105 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14106 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
14108 if (Src !=
RHS.getOperand(0))
14113 if (!CLHS || !CRHS)
14117 static const uint32_t MaxMask = 0x3ff;
14122 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
14131 LHS.getOpcode() == AMDGPUISD::PERM &&
14137 Sel |=
LHS.getConstantOperandVal(2);
14139 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14146 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14150 auto usesCombinedOperand = [](SDNode *OrUse) {
14153 !OrUse->getValueType(0).isVector())
14157 for (
auto *VUser : OrUse->users()) {
14158 if (!VUser->getValueType(0).isVector())
14165 if (VUser->getOpcode() == VectorwiseOp)
14171 if (!
any_of(
N->users(), usesCombinedOperand))
14177 if (LHSMask != ~0u && RHSMask != ~0u) {
14180 if (LHSMask > RHSMask) {
14187 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14188 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14191 if (!(LHSUsedLanes & RHSUsedLanes) &&
14194 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14196 LHSMask &= ~RHSUsedLanes;
14197 RHSMask &= ~LHSUsedLanes;
14199 LHSMask |= LHSUsedLanes & 0x04040404;
14201 uint32_t Sel = LHSMask | RHSMask;
14204 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14209 if (LHSMask == ~0u || RHSMask == ~0u) {
14250 return IdentitySrc;
14256 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
14271 if (SrcVT == MVT::i32) {
14276 DCI.AddToWorklist(LowOr.
getNode());
14277 DCI.AddToWorklist(HiBits.getNode());
14288 N->getOperand(0), CRHS))
14296 DAGCombinerInfo &DCI)
const {
14297 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
14304 SelectionDAG &DAG = DCI.DAG;
14306 EVT VT =
N->getValueType(0);
14307 if (CRHS && VT == MVT::i64) {
14309 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
14316 unsigned Opc =
LHS.getOpcode();
14346 LHS->getOperand(0), FNegLHS, FNegRHS);
14355 DAGCombinerInfo &DCI)
const {
14356 if (!Subtarget->has16BitInsts() ||
14360 EVT VT =
N->getValueType(0);
14361 if (VT != MVT::i32)
14365 if (Src.getValueType() != MVT::i16)
14372SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
14373 DAGCombinerInfo &DCI)
const {
14379 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
14380 VTSign->getVT() == MVT::i8) ||
14381 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
14382 VTSign->getVT() == MVT::i16))) {
14383 assert(Subtarget->hasScalarSubwordLoads() &&
14384 "s_buffer_load_{u8, i8} are supported "
14385 "in GFX12 (or newer) architectures.");
14386 EVT VT = Src.getValueType();
14387 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
14388 ? AMDGPUISD::SBUFFER_LOAD_BYTE
14389 : AMDGPUISD::SBUFFER_LOAD_SHORT;
14391 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
14398 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
14399 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14403 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
14404 VTSign->getVT() == MVT::i8) ||
14405 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
14406 VTSign->getVT() == MVT::i16)) &&
14415 Src.getOperand(6), Src.getOperand(7)};
14418 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
14419 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
14420 ? AMDGPUISD::BUFFER_LOAD_BYTE
14421 : AMDGPUISD::BUFFER_LOAD_SHORT;
14422 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
14423 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
14424 return DCI.DAG.getMergeValues(
14425 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
14431 DAGCombinerInfo &DCI)
const {
14432 SelectionDAG &DAG = DCI.DAG;
14439 if (
N->getOperand(0).isUndef())
14446 DAGCombinerInfo &DCI)
const {
14447 EVT VT =
N->getValueType(0);
14457 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(
N), VT, N0,
14464 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
14472 unsigned MaxDepth)
const {
14473 unsigned Opcode =
Op.getOpcode();
14478 const auto &
F = CFP->getValueAPF();
14479 if (
F.isNaN() &&
F.isSignaling())
14481 if (!
F.isDenormal())
14513 case AMDGPUISD::FMUL_LEGACY:
14514 case AMDGPUISD::FMAD_FTZ:
14515 case AMDGPUISD::RCP:
14516 case AMDGPUISD::RSQ:
14517 case AMDGPUISD::RSQ_CLAMP:
14518 case AMDGPUISD::RCP_LEGACY:
14519 case AMDGPUISD::RCP_IFLAG:
14520 case AMDGPUISD::LOG:
14521 case AMDGPUISD::EXP:
14522 case AMDGPUISD::DIV_SCALE:
14523 case AMDGPUISD::DIV_FMAS:
14524 case AMDGPUISD::DIV_FIXUP:
14525 case AMDGPUISD::FRACT:
14526 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14527 case AMDGPUISD::CVT_F32_UBYTE0:
14528 case AMDGPUISD::CVT_F32_UBYTE1:
14529 case AMDGPUISD::CVT_F32_UBYTE2:
14530 case AMDGPUISD::CVT_F32_UBYTE3:
14531 case AMDGPUISD::FP_TO_FP16:
14532 case AMDGPUISD::SIN_HW:
14533 case AMDGPUISD::COS_HW:
14544 if (
Op.getValueType() == MVT::i32) {
14550 if (RHS->getZExtValue() == 0xffff0000) {
14560 return Op.getValueType().getScalarType() != MVT::f16;
14570 case AMDGPUISD::CLAMP:
14571 case AMDGPUISD::FMED3:
14572 case AMDGPUISD::FMAX3:
14573 case AMDGPUISD::FMIN3:
14574 case AMDGPUISD::FMAXIMUM3:
14575 case AMDGPUISD::FMINIMUM3: {
14581 if (Subtarget->supportsMinMaxDenormModes() ||
14591 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
14603 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
14630 if (
Op.getValueType() == MVT::i16) {
14641 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
14643 switch (IntrinsicID) {
14644 case Intrinsic::amdgcn_cvt_pkrtz:
14645 case Intrinsic::amdgcn_cubeid:
14646 case Intrinsic::amdgcn_frexp_mant:
14647 case Intrinsic::amdgcn_fdot2:
14648 case Intrinsic::amdgcn_rcp:
14649 case Intrinsic::amdgcn_rsq:
14650 case Intrinsic::amdgcn_rsq_clamp:
14651 case Intrinsic::amdgcn_rcp_legacy:
14652 case Intrinsic::amdgcn_rsq_legacy:
14653 case Intrinsic::amdgcn_trig_preop:
14654 case Intrinsic::amdgcn_tanh:
14655 case Intrinsic::amdgcn_log:
14656 case Intrinsic::amdgcn_exp2:
14657 case Intrinsic::amdgcn_sqrt:
14675 unsigned MaxDepth)
const {
14678 unsigned Opcode =
MI->getOpcode();
14680 if (Opcode == AMDGPU::G_FCANONICALIZE)
14683 std::optional<FPValueAndVReg> FCR;
14686 if (FCR->Value.isSignaling())
14688 if (!FCR->Value.isDenormal())
14699 case AMDGPU::G_FADD:
14700 case AMDGPU::G_FSUB:
14701 case AMDGPU::G_FMUL:
14702 case AMDGPU::G_FCEIL:
14703 case AMDGPU::G_FFLOOR:
14704 case AMDGPU::G_FRINT:
14705 case AMDGPU::G_FNEARBYINT:
14706 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14707 case AMDGPU::G_INTRINSIC_TRUNC:
14708 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14709 case AMDGPU::G_FMA:
14710 case AMDGPU::G_FMAD:
14711 case AMDGPU::G_FSQRT:
14712 case AMDGPU::G_FDIV:
14713 case AMDGPU::G_FREM:
14714 case AMDGPU::G_FPOW:
14715 case AMDGPU::G_FPEXT:
14716 case AMDGPU::G_FLOG:
14717 case AMDGPU::G_FLOG2:
14718 case AMDGPU::G_FLOG10:
14719 case AMDGPU::G_FPTRUNC:
14720 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14721 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14722 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14723 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14724 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14726 case AMDGPU::G_FNEG:
14727 case AMDGPU::G_FABS:
14728 case AMDGPU::G_FCOPYSIGN:
14730 case AMDGPU::G_FMINNUM:
14731 case AMDGPU::G_FMAXNUM:
14732 case AMDGPU::G_FMINNUM_IEEE:
14733 case AMDGPU::G_FMAXNUM_IEEE:
14734 case AMDGPU::G_FMINIMUM:
14735 case AMDGPU::G_FMAXIMUM:
14736 case AMDGPU::G_FMINIMUMNUM:
14737 case AMDGPU::G_FMAXIMUMNUM: {
14738 if (Subtarget->supportsMinMaxDenormModes() ||
14745 case AMDGPU::G_BUILD_VECTOR:
14750 case AMDGPU::G_INTRINSIC:
14751 case AMDGPU::G_INTRINSIC_CONVERGENT:
14753 case Intrinsic::amdgcn_fmul_legacy:
14754 case Intrinsic::amdgcn_fmad_ftz:
14755 case Intrinsic::amdgcn_sqrt:
14756 case Intrinsic::amdgcn_fmed3:
14757 case Intrinsic::amdgcn_sin:
14758 case Intrinsic::amdgcn_cos:
14759 case Intrinsic::amdgcn_log:
14760 case Intrinsic::amdgcn_exp2:
14761 case Intrinsic::amdgcn_log_clamp:
14762 case Intrinsic::amdgcn_rcp:
14763 case Intrinsic::amdgcn_rcp_legacy:
14764 case Intrinsic::amdgcn_rsq:
14765 case Intrinsic::amdgcn_rsq_clamp:
14766 case Intrinsic::amdgcn_rsq_legacy:
14767 case Intrinsic::amdgcn_div_scale:
14768 case Intrinsic::amdgcn_div_fmas:
14769 case Intrinsic::amdgcn_div_fixup:
14770 case Intrinsic::amdgcn_fract:
14771 case Intrinsic::amdgcn_cvt_pkrtz:
14772 case Intrinsic::amdgcn_cubeid:
14773 case Intrinsic::amdgcn_cubema:
14774 case Intrinsic::amdgcn_cubesc:
14775 case Intrinsic::amdgcn_cubetc:
14776 case Intrinsic::amdgcn_frexp_mant:
14777 case Intrinsic::amdgcn_fdot2:
14778 case Intrinsic::amdgcn_trig_preop:
14779 case Intrinsic::amdgcn_tanh:
14798 if (
C.isDenormal()) {
14812 if (
C.isSignaling()) {
14835SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
14836 DAGCombinerInfo &DCI)
const {
14837 SelectionDAG &DAG = DCI.DAG;
14839 EVT VT =
N->getValueType(0);
14848 EVT VT =
N->getValueType(0);
14849 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
14865 EVT EltVT =
Lo.getValueType();
14868 for (
unsigned I = 0;
I != 2; ++
I) {
14872 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
14873 }
else if (
Op.isUndef()) {
    return AMDGPUISD::FMAX3;
    return AMDGPUISD::FMAXIMUM3;
    return AMDGPUISD::SMAX3;
    return AMDGPUISD::UMAX3;
    return AMDGPUISD::FMIN3;
    return AMDGPUISD::FMINIMUM3;
    return AMDGPUISD::SMIN3;
    return AMDGPUISD::UMIN3;

  if (!MinK || !MaxK)

  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
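  // performIntMed3ImmCombine / performFPMed3ImmCombine fold a clamp-style
  // min/max pair into a single med3 node, e.g. (sketch)
  //   min(max(x, K0), K1)  -->  med3(x, K0, K1)    with K0 <= K1,
  // choosing SMED3/UMED3 (or FMED3) from the signedness and type; the
  // i16/f16 variants are only formed on subtargets with hasMed3_16().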
15019 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15025 if (
Info->getMode().DX10Clamp) {
15034 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
15066 case AMDGPUISD::FMIN_LEGACY:
15067 case AMDGPUISD::FMAX_LEGACY:
15068 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
15079 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
15088 DAGCombinerInfo &DCI)
const {
15089 SelectionDAG &DAG = DCI.DAG;
15121 if (
SDValue Med3 = performIntMed3ImmCombine(
15126 if (
SDValue Med3 = performIntMed3ImmCombine(
15132 if (
SDValue Med3 = performIntMed3ImmCombine(
15137 if (
SDValue Med3 = performIntMed3ImmCombine(
15150 (
Opc == AMDGPUISD::FMIN_LEGACY &&
15151 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
15152 (VT == MVT::f32 || VT == MVT::f64 ||
15153 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
15154 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
15155 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
15156 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
15158 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1))
15165 const SDNodeFlags
Flags =
N->getFlags();
15167 !Subtarget->hasIEEEMinimumMaximumInsts() &&
Flags.hasNoNaNs()) {
15170 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
15180 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
15181 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
15190 DAGCombinerInfo &DCI)
const {
15191 EVT VT =
N->getValueType(0);
15195 SelectionDAG &DAG = DCI.DAG;
15206 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
15210 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
15214 if (
Info->getMode().DX10Clamp) {
15227 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
15234 DAGCombinerInfo &DCI)
const {
15238 return DCI.DAG.getUNDEF(
N->getValueType(0));
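  // Heuristic for dynamic vector indexing: expanding to a compare/select
  // chain is only considered profitable while the estimated instruction
  // count (one per element plus the per-element register moves) stays within
  // what the VGPR-index-mode (<= 16) or movrel (<= 15) expansion would cost.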
                                                  bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
15284 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
15299SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
15300 DAGCombinerInfo &DCI)
const {
15306 EVT ResVT =
N->getValueType(0);
15330 if (!
C ||
C->getZExtValue() != 0x1f)
15346 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
15374 DCI.AddToWorklist(Elt0.
getNode());
15375 DCI.AddToWorklist(Elt1.
getNode());
15397 if (!DCI.isBeforeLegalize())
15405 VecSize > 32 && VecSize % 32 == 0 && Idx) {
15408 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
15409 unsigned EltIdx = BitIndex / 32;
15410 unsigned LeftoverBitIdx = BitIndex % 32;
15414 DCI.AddToWorklist(Cast.
getNode());
15418 DCI.AddToWorklist(Elt.
getNode());
15421 DCI.AddToWorklist(Srl.
getNode());
15425 DCI.AddToWorklist(Trunc.
getNode());
15427 if (VecEltVT == ResVT) {
15439SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
15440 DAGCombinerInfo &DCI)
const {
15451 SelectionDAG &DAG = DCI.DAG;
15471 Src.getOperand(0).getValueType() == MVT::f16) {
15472 return Src.getOperand(0);
15476 APFloat Val = CFP->getValueAPF();
15477 bool LosesInfo =
true;
15487 DAGCombinerInfo &DCI)
const {
15488 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
15489 "combine only useful on gfx8");
15491 SDValue TruncSrc =
N->getOperand(0);
15492 EVT VT =
N->getValueType(0);
15493 if (VT != MVT::f16)
15496 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
15500 SelectionDAG &DAG = DCI.DAG;
15531unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
15533 const SDNode *N1)
const {
15538 if (((VT == MVT::f32 &&
15540 (VT == MVT::f16 && Subtarget->hasMadF16() &&
15560 EVT VT =
N->getValueType(0);
15561 if (VT != MVT::i32 && VT != MVT::i64)
15567 unsigned Opc =
N->getOpcode();
15622 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
15641 DAGCombinerInfo &DCI)
const {
15644 SelectionDAG &DAG = DCI.DAG;
15645 EVT VT =
N->getValueType(0);
15655 if (!
N->isDivergent() && Subtarget->hasSMulHi())
15659 if (NumBits <= 32 || NumBits > 64)
15670 if (!Subtarget->hasFullRate64Ops()) {
15671 unsigned NumUsers = 0;
15672 for (SDNode *User :
LHS->
users()) {
15675 if (!
User->isAnyAdd())
15699 bool MulSignedLo =
false;
15700 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
15709 if (VT != MVT::i64) {
15732 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
15734 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
15735 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
15737 if (!MulLHSUnsigned32) {
15744 if (!MulRHSUnsigned32) {
15755 if (VT != MVT::i64)
15761SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
15762 DAGCombinerInfo &DCI)
const {
15772 SelectionDAG &DAG = DCI.DAG;
15787 unsigned Opcode =
N->getOpcode();
15791 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
15802static std::optional<ByteProvider<SDValue>>
15805 if (!Byte0 || Byte0->isConstantZero()) {
15806 return std::nullopt;
15809 if (Byte1 && !Byte1->isConstantZero()) {
15810 return std::nullopt;
15816 unsigned FirstCs =
First & 0x0c0c0c0c;
15817 unsigned SecondCs = Second & 0x0c0c0c0c;
15818 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
15819 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
15821 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
15822 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
15823 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
15824 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
15826 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
15850 for (
int BPI = 0; BPI < 2; BPI++) {
15853 BPP = {Src1, Src0};
15855 unsigned ZeroMask = 0x0c0c0c0c;
15856 unsigned FMask = 0xFF << (8 * (3 - Step));
15858 unsigned FirstMask =
15859 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15860 unsigned SecondMask =
15861 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
15865 int FirstGroup = -1;
15866 for (
int I = 0;
I < 2;
I++) {
15868 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
15869 return IterElt.SrcOp == *BPP.first.Src &&
15870 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
15874 if (Match != Srcs.
end()) {
15875 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
15880 if (FirstGroup != -1) {
15882 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
15883 return IterElt.SrcOp == *BPP.second.Src &&
15884 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
15887 if (Match != Srcs.
end()) {
15888 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
15890 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
15898 unsigned ZeroMask = 0x0c0c0c0c;
15899 unsigned FMask = 0xFF << (8 * (3 - Step));
15903 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15907 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
15916 if (Srcs.
size() == 1) {
15917 auto *Elt = Srcs.
begin();
15921 if (Elt->PermMask == 0x3020100)
15924 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15928 auto *FirstElt = Srcs.
begin();
15929 auto *SecondElt = std::next(FirstElt);
15936 auto FirstMask = FirstElt->PermMask;
15937 auto SecondMask = SecondElt->PermMask;
15939 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
15940 unsigned FirstPlusFour = FirstMask | 0x04040404;
15943 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
15955 FirstElt = std::next(SecondElt);
15956 if (FirstElt == Srcs.
end())
15959 SecondElt = std::next(FirstElt);
15962 if (SecondElt == Srcs.
end()) {
15967 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
15968 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
15974 return Perms.
size() == 2
15980 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
15981 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
15982 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
15983 EntryMask += ZeroMask;
15988 auto Opcode =
Op.getOpcode();
15990 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
15991 Opcode == AMDGPUISD::MUL_I24);
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;

  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));

  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
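// The add combine below looks for a chain of byte-wise multiplies feeding an
// add; when checkDot4MulSignedness agrees on one signedness for every operand
// pair, the chain is rewritten as a single amdgcn_sdot4 / amdgcn_udot4 call,
// with the byte position of each source expressed through the same perm-mask
// encoding used by the or-combine.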
16058 DAGCombinerInfo &DCI)
const {
16059 SelectionDAG &DAG = DCI.DAG;
16060 EVT VT =
N->getValueType(0);
16066 if (Subtarget->hasMad64_32()) {
16067 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
16072 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
16076 if (VT == MVT::i64) {
16077 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
16082 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
16084 std::optional<bool> IsSigned;
16090 int ChainLength = 0;
16091 for (
int I = 0;
I < 4;
I++) {
16095 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
16098 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
16103 TempNode->getOperand(MulIdx), *Src0, *Src1,
16104 TempNode->getOperand(MulIdx)->getOperand(0),
16105 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
16109 IsSigned = *IterIsSigned;
16110 if (*IterIsSigned != *IsSigned)
16113 auto AddIdx = 1 - MulIdx;
16116 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
16117 Src2s.
push_back(TempNode->getOperand(AddIdx));
16127 TempNode->getOperand(AddIdx), *Src0, *Src1,
16128 TempNode->getOperand(AddIdx)->getOperand(0),
16129 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
16133 if (*IterIsSigned != *IsSigned)
16137 ChainLength =
I + 2;
16141 TempNode = TempNode->getOperand(AddIdx);
16143 ChainLength =
I + 1;
16144 if (TempNode->getNumOperands() < 2)
16146 LHS = TempNode->getOperand(0);
16147 RHS = TempNode->getOperand(1);
16150 if (ChainLength < 2)
16156 if (ChainLength < 4) {
16166 bool UseOriginalSrc =
false;
16167 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
16168 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
16169 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
16170 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
16171 SmallVector<unsigned, 4> SrcBytes;
16172 auto Src0Mask = Src0s.
begin()->PermMask;
16173 SrcBytes.
push_back(Src0Mask & 0xFF000000);
16174 bool UniqueEntries =
true;
16175 for (
auto I = 1;
I < 4;
I++) {
16176 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
16179 UniqueEntries =
false;
16185 if (UniqueEntries) {
16186 UseOriginalSrc = true;
16188 auto *FirstElt = Src0s.begin();
16192 auto *SecondElt = Src1s.begin();
16194 SecondElt->DWordOffset);
16203 if (!UseOriginalSrc) {
16210 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
16213 : Intrinsic::amdgcn_udot4,
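For reference, the scalar computation the matched chain is rewritten into (udot4 shown; the sdot4 case is the same with sign-extended bytes) can be modelled in plain C++. This is only a semantic sketch of the intrinsic, not compiler code:
#include <cstdint>
static uint32_t udot4Reference(uint32_t A, uint32_t B, uint32_t Acc) {
  uint32_t Sum = Acc;
  for (unsigned I = 0; I < 4; ++I) {
    uint32_t ABy = (A >> (8 * I)) & 0xFFu; // byte I of the first source
    uint32_t BBy = (B >> (8 * I)) & 0xFFu; // byte I of the second source
    Sum += ABy * BBy;                      // accumulate the byte product
  }
  return Sum;
}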
16223 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
16228 unsigned Opc = LHS.getOpcode();
16240 auto Cond = RHS.getOperand(0);
16245 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16262 DAGCombinerInfo &DCI) const {
16263 SelectionDAG &DAG = DCI.DAG;
16265 EVT VT = N->getValueType(0);
16278 SDNodeFlags ShlFlags = N1->getFlags();
16282 SDNodeFlags NewShlFlags =
16287 DCI.AddToWorklist(Inner.getNode());
16294 if (Subtarget->hasMad64_32()) {
16295 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
16304 if (VT == MVT::i64) {
16305 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16318 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
16319 Y->isDivergent() != Z->isDivergent()) {
16328 if (Y->isDivergent())
16331 SDNodeFlags ReassocFlags =
16334 DCI.AddToWorklist(UniformInner.getNode());
16342 DAGCombinerInfo &DCI) const {
16343 SelectionDAG &DAG = DCI.DAG;
16344 EVT VT = N->getValueType(0);
16346 if (VT == MVT::i64) {
16347 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
16351 if (VT != MVT::i32)
16360 unsigned Opc = RHS.getOpcode();
16367 auto Cond = RHS.getOperand(0);
16372 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
16390SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
16391 DAGCombinerInfo &DCI) const {
16393 if (N->getValueType(0) != MVT::i32)
16399 SelectionDAG &DAG = DCI.DAG;
16404 unsigned LHSOpc = LHS.getOpcode();
16405 unsigned Opc = N->getOpcode();
16409 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
16415 DAGCombinerInfo &DCI) const {
16419 SelectionDAG &DAG = DCI.DAG;
16420 EVT VT = N->getValueType(0);
16432 if (A == LHS.getOperand(1)) {
16433 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16434 if (FusedOp != 0) {
16436 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
16444 if (A == RHS.getOperand(1)) {
16445 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16446 if (FusedOp != 0) {
16448 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
16457 DAGCombinerInfo &DCI) const {
16461 SelectionDAG &DAG = DCI.DAG;
16463 EVT VT = N->getValueType(0);
16476 if (A == LHS.getOperand(1)) {
16477 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
16478 if (FusedOp != 0) {
16482 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
16491 if (A == RHS.getOperand(1)) {
16492 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
16493 if (FusedOp != 0) {
16495 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
16504 DAGCombinerInfo &DCI) const {
16505 SelectionDAG &DAG = DCI.DAG;
16507 EVT VT = N->getValueType(0);
16516 SDNodeFlags Flags = N->getFlags();
16517 SDNodeFlags RHSFlags = RHS->getFlags();
16523 bool IsNegative = false;
16524 if (CLHS->isExactlyValue(1.0) ||
16525 (IsNegative = CLHS->isExactlyValue(-1.0))) {
16531 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
16541 DAGCombinerInfo &DCI) const {
16542 SelectionDAG &DAG = DCI.DAG;
16543 EVT VT = N->getValueType(0);
16547 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
16548 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
16563 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
16568 const ConstantFPSDNode *FalseNode =
16578 if (ScalarVT == MVT::f32 &&
16584 if (TrueNodeExpVal == INT_MIN)
16587 if (FalseNodeExpVal == INT_MIN)
16607 DAGCombinerInfo &DCI) const {
16608 SelectionDAG &DAG = DCI.DAG;
16609 EVT VT = N->getValueType(0);
16612 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
16630 (N->getFlags().hasAllowContract() &&
16631 FMA->getFlags().hasAllowContract())) {
16665 if (Vec1 == Vec2 || Vec3 == Vec4)
16671 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
16672 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
16680 DAGCombinerInfo &DCI) const {
16681 SelectionDAG &DAG = DCI.DAG;
16686 EVT VT = LHS.getValueType();
16715 return LHS.getOperand(0);
16723 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
16730 const APInt &CT = LHS.getConstantOperandAPInt(1);
16731 const APInt &CF = LHS.getConstantOperandAPInt(2);
16739 return LHS.getOperand(0);
16771 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
16776 {Op0Hi, Op1Hi, CarryInHi});
16786 DCI.CombineTo(LHS.getNode(), Result);
16790 if (VT != MVT::f32 && VT != MVT::f64 &&
16791 (!Subtarget->has16BitInsts() || VT != MVT::f16))
16806 const unsigned IsInfMask =
16808 const unsigned IsFiniteMask =
16813 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
16822SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
16823 DAGCombinerInfo &DCI) const {
16824 SelectionDAG &DAG = DCI.DAG;
16826 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
16845 unsigned ShiftOffset = 8 * Offset;
16847 ShiftOffset -= C->getZExtValue();
16849 ShiftOffset += C->getZExtValue();
16851 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
16852 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
16853 MVT::f32, Shifted);
16864 DCI.AddToWorklist(N);
16871 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
16877 DAGCombinerInfo &DCI) const {
16882 const MachineFunction &MF = DCI.DAG.getMachineFunction();
16886 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
16887 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
16890 APFloat One(F.getSemantics(), "1.0");
16892 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
16898 DAGCombinerInfo &DCI) const {
16919 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
16920 bool isInteger = LHS.getValueType().isInteger();
16923 if (!isFloatingPoint && !isInteger)
16928 if (!isEquality && !isNonEquality)
16945 if (isFloatingPoint) {
16947 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
16958 if (!(isEquality && TrueVal == ConstVal) &&
16959 !(isNonEquality && FalseVal == ConstVal))
16966 SelectLHS, SelectRHS);
16971 switch (N->getOpcode()) {
16987 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
16997 switch (N->getOpcode()) {
16999 return performAddCombine(N, DCI);
17001 return performPtrAddCombine(N, DCI);
17003 return performSubCombine(N, DCI);
17006 return performAddCarrySubCarryCombine(N, DCI);
17008 return performFAddCombine(N, DCI);
17010 return performFSubCombine(N, DCI);
17012 return performFDivCombine(N, DCI);
17014 return performFMulCombine(N, DCI);
17016 return performSetCCCombine(N, DCI);
17018 if (auto Res = performSelectCombine(N, DCI))
17033 case AMDGPUISD::FMIN_LEGACY:
17034 case AMDGPUISD::FMAX_LEGACY:
17035 return performMinMaxCombine(N, DCI);
17037 return performFMACombine(N, DCI);
17039 return performAndCombine(N, DCI);
17041 return performOrCombine(N, DCI);
17044 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
17045 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
17051 return performXorCombine(N, DCI);
17053 return performZeroExtendCombine(N, DCI);
17055 return performSignExtendInRegCombine(N, DCI);
17056 case AMDGPUISD::FP_CLASS:
17057 return performClassCombine(N, DCI);
17059 return performFCanonicalizeCombine(N, DCI);
17060 case AMDGPUISD::RCP:
17061 return performRcpCombine(N, DCI);
17063 case AMDGPUISD::FRACT:
17064 case AMDGPUISD::RSQ:
17065 case AMDGPUISD::RCP_LEGACY:
17066 case AMDGPUISD::RCP_IFLAG:
17067 case AMDGPUISD::RSQ_CLAMP: {
17076 return performUCharToFloatCombine(N, DCI);
17078 return performFCopySignCombine(N, DCI);
17079 case AMDGPUISD::CVT_F32_UBYTE0:
17080 case AMDGPUISD::CVT_F32_UBYTE1:
17081 case AMDGPUISD::CVT_F32_UBYTE2:
17082 case AMDGPUISD::CVT_F32_UBYTE3:
17083 return performCvtF32UByteNCombine(N, DCI);
17084 case AMDGPUISD::FMED3:
17085 return performFMed3Combine(N, DCI);
17086 case AMDGPUISD::CVT_PKRTZ_F16_F32:
17087 return performCvtPkRTZCombine(N, DCI);
17088 case AMDGPUISD::CLAMP:
17089 return performClampCombine(N, DCI);
17092 EVT VT = N->getValueType(0);
17095 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
17098 EVT EltVT = Src.getValueType();
17099 if (EltVT != MVT::i16)
17109 return performExtractVectorEltCombine(N, DCI);
17111 return performInsertVectorEltCombine(N, DCI);
17113 return performFPRoundCombine(N, DCI);
17122 return performMemSDNodeCombine(MemNode, DCI);
17153 unsigned Opcode = Node->getMachineOpcode();
17156 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17157 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17160 SDNode *Users[5] = {nullptr};
17162 unsigned DmaskIdx =
17163 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17164 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17165 unsigned NewDmask = 0;
17166 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17167 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17168 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17169 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17170 unsigned TFCLane = 0;
17171 bool HasChain = Node->getNumValues() > 1;
17173 if (OldDmask == 0) {
17181 TFCLane = OldBitsSet;
17185 for (SDUse &Use : Node->uses()) {
17188 if (Use.getResNo() != 0)
17191 SDNode *User = Use.getUser();
17194 if (!User->isMachineOpcode() ||
17195 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17207 if (UsesTFC && Lane == TFCLane) {
17212 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17214 Dmask &= ~(1 << Comp);
17222 NewDmask |= 1 << Comp;
17227 bool NoChannels = !NewDmask;
17234 if (OldBitsSet == 1)
17240 if (NewDmask == OldDmask)
17249 unsigned NewChannels = BitsSet + UsesTFC;
17253 assert(NewOpcode != -1 &&
17254 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17255 "failed to find equivalent MIMG op");
17263 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17265 MVT ResultVT = NewChannels == 1
17268 : NewChannels == 5 ? 8
17270 SDVTList NewVTList =
17273 MachineSDNode *NewNode =
17282 if (NewChannels == 1) {
17292 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17297 if (i || !NoChannels)
17302 if (NewUser != User) {
17312 Idx = AMDGPU::sub1;
17315 Idx = AMDGPU::sub2;
17318 Idx = AMDGPU::sub3;
17321 Idx = AMDGPU::sub4;
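A standalone sketch of the dmask shrink performed by adjustWritemask above (illustrative names, plain C++): keep only the dmask bits whose result lanes still have readers, and record where each surviving lane ends up in the repacked result.
#include <cstdint>
#include <map>
static unsigned shrinkDmask(unsigned OldDmask, const bool LaneIsUsed[4],
                            std::map<unsigned, unsigned> &RepackedLane) {
  unsigned NewDmask = 0, NewLane = 0, OldLane = 0;
  for (unsigned Comp = 0; Comp < 4; ++Comp) {
    if (!(OldDmask & (1u << Comp)))
      continue;                       // component was never requested
    if (LaneIsUsed[OldLane]) {        // this lane of the old result is read
      NewDmask |= 1u << Comp;         // keep its component in the new dmask
      RepackedLane[OldLane] = NewLane++;
    }
    ++OldLane;
  }
  return NewDmask;
}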
17332 Op = Op.getOperand(0);
17353 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17357 Node->getOperand(0), SL, VReg, SrcVal,
17363 return ToResultReg.getNode();
17368 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17370 Ops.push_back(Node->getOperand(i));
17376 Node->getOperand(i).getValueType(),
17377 Node->getOperand(i)),
17389 unsigned Opcode = Node->getMachineOpcode();
17391 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17392 !TII->isGather4(Opcode) &&
17394 return adjustWritemask(Node, DAG);
17397 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17403 case AMDGPU::V_DIV_SCALE_F32_e64:
17404 case AMDGPU::V_DIV_SCALE_F64_e64: {
17414 (Src0 == Src1 || Src0 == Src2))
17470 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17471 unsigned InitIdx = 0;
17473 if (TII->isImage(MI)) {
17481 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17482 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17483 unsigned D16Val = D16 ? D16->getImm() : 0;
17485 if (!TFEVal && !LWEVal)
17496 assert(MO_Dmask && "Expected dmask operand in instruction");
17498 unsigned dmask = MO_Dmask->getImm();
17503 bool Packed = !Subtarget->hasUnpackedD16VMem();
17505 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17512 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
17513 if (DstSize < InitIdx)
17517 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
17525 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17526 unsigned NewDst = 0;
17531 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17532 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17535 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17536 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17556 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17568 if (TII->isVOP3(MI.getOpcode())) {
17570 TII->legalizeOperandsVOP3(MRI, MI);
17572 if (TII->isMAI(MI)) {
17577 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17578 AMDGPU::OpName::scale_src0);
17579 if (Src0Idx != -1) {
17580 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17581 AMDGPU::OpName::scale_src1);
17582 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17583 TII->usesConstantBus(MRI, MI, Src1Idx))
17584 TII->legalizeOpWithMove(MI, Src1Idx);
17591 if (TII->isImage(MI))
17592 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17666std::pair<unsigned, const TargetRegisterClass *>
17673 if (Constraint.size() == 1) {
17677 if (VT == MVT::Other)
17680 switch (Constraint[0]) {
17687 RC = &AMDGPU::SReg_32RegClass;
17690 RC = &AMDGPU::SGPR_64RegClass;
17695 return std::pair(0U, nullptr);
17702 return std::pair(0U, nullptr);
17704 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17705 : &AMDGPU::VGPR_32_Lo256RegClass;
17708 RC = Subtarget->has1024AddressableVGPRs()
17709 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17712 return std::pair(0U, nullptr);
17717 if (!Subtarget->hasMAIInsts())
17721 return std::pair(0U, nullptr);
17723 RC = &AMDGPU::AGPR_32RegClass;
17728 return std::pair(0U, nullptr);
17733 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17737 RC = &AMDGPU::AV_32RegClass;
17740 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17742 return std::pair(0U, nullptr);
17751 return std::pair(0U, RC);
17754 if (Kind != '\0') {
17756 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17757 } else if (Kind == 's') {
17758 RC = &AMDGPU::SGPR_32RegClass;
17759 } else if (Kind == 'a') {
17760 RC = &AMDGPU::AGPR_32RegClass;
17766 return std::pair(0U, nullptr);
17772 return std::pair(0U, nullptr);
17776 RC = TRI->getVGPRClassForBitWidth(Width);
17778 RC = TRI->getSGPRClassForBitWidth(Width);
17780 RC = TRI->getAGPRClassForBitWidth(Width);
17782 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17787 return std::pair(0U, nullptr);
17789 return std::pair(Reg, RC);
17795 return std::pair(0U, nullptr);
17796 if (Idx < RC->getNumRegs())
17798 return std::pair(0U, nullptr);
17804 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17810 if (Constraint.size() == 1) {
17811 switch (Constraint[0]) {
17821 } else if (Constraint == "DA" || Constraint == "DB") {
17829 if (Constraint.size() == 1) {
17830 switch (Constraint[0]) {
17838 } else if (Constraint.size() == 2) {
17839 if (Constraint == "VA")
17857 std::vector<SDValue> &Ops,
17872 unsigned Size = Op.getScalarValueSizeInBits();
17876 if (Size == 16 && !Subtarget->has16BitInsts())
17880 Val = C->getSExtValue();
17884 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17888 if (Size != 16 || Op.getNumOperands() != 2)
17890 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17893 Val = C->getSExtValue();
17897 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17907 if (Constraint.size() == 1) {
17908 switch (Constraint[0]) {
17923 } else if (Constraint.size() == 2) {
17924 if (Constraint == "DA") {
17925 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17926 int64_t LoBits = static_cast<int32_t>(Val);
17930 if (Constraint == "DB") {
17938 unsigned MaxSize) const {
17939 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17940 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17942 MVT VT = Op.getSimpleValueType();
17967 switch (UnalignedClassID) {
17968 case AMDGPU::VReg_64RegClassID:
17969 return AMDGPU::VReg_64_Align2RegClassID;
17970 case AMDGPU::VReg_96RegClassID:
17971 return AMDGPU::VReg_96_Align2RegClassID;
17972 case AMDGPU::VReg_128RegClassID:
17973 return AMDGPU::VReg_128_Align2RegClassID;
17974 case AMDGPU::VReg_160RegClassID:
17975 return AMDGPU::VReg_160_Align2RegClassID;
17976 case AMDGPU::VReg_192RegClassID:
17977 return AMDGPU::VReg_192_Align2RegClassID;
17978 case AMDGPU::VReg_224RegClassID:
17979 return AMDGPU::VReg_224_Align2RegClassID;
17980 case AMDGPU::VReg_256RegClassID:
17981 return AMDGPU::VReg_256_Align2RegClassID;
17982 case AMDGPU::VReg_288RegClassID:
17983 return AMDGPU::VReg_288_Align2RegClassID;
17984 case AMDGPU::VReg_320RegClassID:
17985 return AMDGPU::VReg_320_Align2RegClassID;
17986 case AMDGPU::VReg_352RegClassID:
17987 return AMDGPU::VReg_352_Align2RegClassID;
17988 case AMDGPU::VReg_384RegClassID:
17989 return AMDGPU::VReg_384_Align2RegClassID;
17990 case AMDGPU::VReg_512RegClassID:
17991 return AMDGPU::VReg_512_Align2RegClassID;
17992 case AMDGPU::VReg_1024RegClassID:
17993 return AMDGPU::VReg_1024_Align2RegClassID;
17994 case AMDGPU::AReg_64RegClassID:
17995 return AMDGPU::AReg_64_Align2RegClassID;
17996 case AMDGPU::AReg_96RegClassID:
17997 return AMDGPU::AReg_96_Align2RegClassID;
17998 case AMDGPU::AReg_128RegClassID:
17999 return AMDGPU::AReg_128_Align2RegClassID;
18000 case AMDGPU::AReg_160RegClassID:
18001 return AMDGPU::AReg_160_Align2RegClassID;
18002 case AMDGPU::AReg_192RegClassID:
18003 return AMDGPU::AReg_192_Align2RegClassID;
18004 case AMDGPU::AReg_256RegClassID:
18005 return AMDGPU::AReg_256_Align2RegClassID;
18006 case AMDGPU::AReg_512RegClassID:
18007 return AMDGPU::AReg_512_Align2RegClassID;
18008 case AMDGPU::AReg_1024RegClassID:
18009 return AMDGPU::AReg_1024_Align2RegClassID;
18025 if (Info->isEntryFunction()) {
18032 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
18034 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
18035 : TRI->getAlignedHighSGPRForRC(MF, 2,
18036 &AMDGPU::SGPR_64RegClass);
18037 Info->setSGPRForEXECCopy(SReg);
18039 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
18040 Info->getStackPtrOffsetReg()));
18041 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
18042 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
18046 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
18047 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
18049 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
18050 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
18052 Info->limitOccupancy(MF);
18054 if (ST.isWave32() && !MF.empty()) {
18055 for (auto &MBB : MF) {
18056 for (auto &MI : MBB) {
18057 TII->fixImplicitOperands(MI);
18067 if (ST.needsAlignedVGPRs()) {
18068 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
18074 if (NewClassID != -1)
18075 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
18084 const APInt &DemandedElts,
18086 unsigned Depth) const {
18088 unsigned Opc = Op.getOpcode();
18091 unsigned IID = Op.getConstantOperandVal(0);
18093 case Intrinsic::amdgcn_mbcnt_lo:
18094 case Intrinsic::amdgcn_mbcnt_hi: {
18100 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
18110 Op, Known, DemandedElts, DAG, Depth);
18126 unsigned MaxValue =
18133 unsigned BFEWidth, bool SExt, unsigned Depth) {
18137 unsigned Src1Cst = 0;
18138 if (Src1.isImm()) {
18139 Src1Cst = Src1.getImm();
18140 } else if (Src1.isReg()) {
18144 Src1Cst = Cst->Value.getZExtValue();
18155 if (Width >= BFEWidth)
18164 Known = Known.sext(BFEWidth);
18166 Known = Known.zext(BFEWidth);
18172 unsigned Depth) const {
18175 switch (MI->getOpcode()) {
18176 case AMDGPU::S_BFE_I32:
18179 case AMDGPU::S_BFE_U32:
18182 case AMDGPU::S_BFE_I64:
18185 case AMDGPU::S_BFE_U64:
18188 case AMDGPU::G_INTRINSIC:
18189 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18192 case Intrinsic::amdgcn_workitem_id_x:
18195 case Intrinsic::amdgcn_workitem_id_y:
18198 case Intrinsic::amdgcn_workitem_id_z:
18201 case Intrinsic::amdgcn_mbcnt_lo:
18202 case Intrinsic::amdgcn_mbcnt_hi: {
18214 case Intrinsic::amdgcn_groupstaticsize: {
18225 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18228 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18231 case AMDGPU::G_AMDGPU_SMED3:
18232 case AMDGPU::G_AMDGPU_UMED3: {
18233 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18260 unsigned Depth) const {
18267 AttributeList Attrs =
18269 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18296 if (Header->getAlignment() != PrefAlign)
18297 return Header->getAlignment();
18299 unsigned LoopSize = 0;
18304 LoopSize += MBB->getAlignment().value() / 2;
18307 LoopSize += TII->getInstSizeInBytes(MI);
18308 if (LoopSize > 192)
18313 if (LoopSize <= 64)
18316 if (LoopSize <= 128)
18317 return CacheLineAlign;
18323 auto I = Exit->getFirstNonDebugInstr();
18324 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18325 return CacheLineAlign;
18334 if (PreTerm == Pre->begin() ||
18335 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18339 auto ExitHead = Exit->getFirstNonDebugInstr();
18340 if (ExitHead == Exit->end() ||
18341 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18346 return CacheLineAlign;
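The thresholds above bucket loops by code size. A hedged standalone sketch of the decision; the outcomes of the branches whose bodies are elided in the listing are assumptions from context:
enum class LoopAlignChoice { KeepDefault, CacheLine, CacheLineWithPrefetch };
static LoopAlignChoice classifyLoopBySize(unsigned LoopSizeInBytes) {
  if (LoopSizeInBytes > 192)
    return LoopAlignChoice::KeepDefault;         // too big to pay for padding
  if (LoopSizeInBytes <= 64)
    return LoopAlignChoice::KeepDefault;         // already fits comfortably
  if (LoopSizeInBytes <= 128)
    return LoopAlignChoice::CacheLine;           // align the loop header
  return LoopAlignChoice::CacheLineWithPrefetch; // also place S_INST_PREFETCH
}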
18354 N = N->getOperand(0).getNode();
18364 switch (N->getOpcode()) {
18372 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18373 return !TRI->isSGPRReg(MRI, Reg);
18379 return !TRI->isSGPRReg(MRI, Reg);
18383 unsigned AS = L->getAddressSpace();
18393 case AMDGPUISD::ATOMIC_CMP_SWAP:
18394 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
18395 case AMDGPUISD::BUFFER_ATOMIC_ADD:
18396 case AMDGPUISD::BUFFER_ATOMIC_SUB:
18397 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
18398 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
18399 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
18400 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
18401 case AMDGPUISD::BUFFER_ATOMIC_AND:
18402 case AMDGPUISD::BUFFER_ATOMIC_OR:
18403 case AMDGPUISD::BUFFER_ATOMIC_XOR:
18404 case AMDGPUISD::BUFFER_ATOMIC_INC:
18405 case AMDGPUISD::BUFFER_ATOMIC_DEC:
18406 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
18407 case AMDGPUISD::BUFFER_ATOMIC_FADD:
18408 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
18409 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
18415 return A->readMem() && A->writeMem();
18436 switch (Ty.getScalarSizeInBits()) {
18448 const APInt &DemandedElts,
18451 unsigned Depth) const {
18452 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
18456 if (Info->getMode().DX10Clamp)
18468 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18488 << "Hardware instruction generated for atomic "
18490 << " operation at memory scope " << MemScope;
18495 Type *EltTy = VT->getElementType();
18496 return VT->getNumElements() == 2 &&
18516 unsigned BW = IT->getBitWidth();
18517 return BW == 32 || BW == 64;
18531 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18532 return BW == 32 || BW == 64;
18535 if (Ty->isFloatTy() || Ty->isDoubleTy())
18539 return VT->getNumElements() == 2 &&
18540 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18550 bool HasSystemScope) {
18557 if (HasSystemScope) {
18566 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18579 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18605 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18618 bool HasSystemScope =
18650 if (!IT || IT->getBitWidth() != 32)
18656 if (Subtarget->hasEmulatedSystemScopeAtomics())
18672 if (!HasSystemScope &&
18673 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18685 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18693 ConstVal && ConstVal->isNullValue())
18731 if (Ty->isFloatTy()) {
18736 if (Ty->isDoubleTy()) {
18757 if (Ty->isFloatTy() &&
18758 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18771 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18775 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18779 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18784 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18789 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18793 if (Ty->isFloatTy()) {
18796 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18799 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18804 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18812 if (Subtarget->hasFlatAtomicFaddF32Inst())
18821 if (Subtarget->hasLDSFPAtomicAddF32()) {
18822 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18824 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18852 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18854 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18858 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18860 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18913 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18914 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18915 : &AMDGPU::SReg_32RegClass;
18916 if (!TRI->isSGPRClass(RC) && !isDivergent)
18917 return TRI->getEquivalentSGPRClass(RC);
18918 if (TRI->isSGPRClass(RC) && isDivergent) {
18919 if (Subtarget->hasGFX90AInsts())
18920 return TRI->getEquivalentAVClass(RC);
18921 return TRI->getEquivalentVGPRClass(RC);
18934 unsigned WaveSize) {
18939 if (!IT || IT->getBitWidth() != WaveSize)
18944 if (!Visited.insert(V).second)
18946 bool Result = false;
18947 for (const auto *U : V->users()) {
18949 if (V == U->getOperand(1)) {
18954 case Intrinsic::amdgcn_if_break:
18955 case Intrinsic::amdgcn_if:
18956 case Intrinsic::amdgcn_else:
18961 if (V == U->getOperand(0)) {
18966 case Intrinsic::amdgcn_end_cf:
18967 case Intrinsic::amdgcn_loop:
18973 Result = hasCFUser(U, Visited, WaveSize);
18982 const Value *V) const {
18984 if (CI->isInlineAsm()) {
18993 for (auto &TC : TargetConstraints) {
19007 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
19035 return MRI.hasOneNonDBGUse(N0);
19042 if (I.getMetadata("amdgpu.noclobber"))
19044 if (I.getMetadata("amdgpu.last.use"))
19108 Alignment = RMW->getAlign();
19121 bool FullFlatEmulation =
19123 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
19124 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
19125 RMW->getType()->isDoubleTy()));
19128 bool ReturnValueIsUsed = !AI->use_empty();
19137 if (FullFlatEmulation) {
19148 std::prev(BB->end())->eraseFromParent();
19149 Builder.SetInsertPoint(BB);
19151 Value *LoadedShared = nullptr;
19152 if (FullFlatEmulation) {
19153 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
19154 {Addr}, nullptr, "is.shared");
19155 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
19156 Builder.SetInsertPoint(SharedBB);
19157 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19163 LoadedShared = Clone;
19165 Builder.CreateBr(PhiBB);
19166 Builder.SetInsertPoint(CheckPrivateBB);
19169 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19170 {Addr}, nullptr, "is.private");
19171 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19173 Builder.SetInsertPoint(PrivateBB);
19175 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19178 Value *LoadedPrivate;
19180 LoadedPrivate = Builder.CreateAlignedLoad(
19181 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19184 LoadedPrivate, RMW->getValOperand());
19186 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19188 auto [ResultLoad, Equal] =
19194 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19197 Builder.CreateBr(PhiBB);
19199 Builder.SetInsertPoint(GlobalBB);
19203 if (FullFlatEmulation) {
19204 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19213 if (!FullFlatEmulation) {
19218 MDNode *RangeNotPrivate =
19221 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19225 Builder.CreateBr(PhiBB);
19227 Builder.SetInsertPoint(PhiBB);
19229 if (ReturnValueIsUsed) {
19232 if (FullFlatEmulation)
19239 Builder.CreateBr(ExitBB);
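The shape of the expansion built above, written as straight-line pseudo-C++ so the control flow is easy to see. The helper names are stand-ins for the is.shared / is.private checks and for the address-space-retargeted atomics; the real code emits IR blocks (SharedBB, PrivateBB, GlobalBB, PhiBB), not C++:
static float expandFlatFAddShape(float *Addr, float Val,
                                 bool (*IsShared)(const void *),
                                 bool (*IsPrivate)(const void *),
                                 float (*AtomicFAdd)(float *, float)) {
  if (IsShared(Addr))               // SharedBB: atomic retargeted at LDS
    return AtomicFAdd(Addr, Val);
  if (IsPrivate(Addr)) {            // PrivateBB: private memory is per-lane,
    float Old = *Addr;              // so a plain load/add/store is enough
    *Addr = Old + Val;
    return Old;
  }
  return AtomicFAdd(Addr, Val);     // GlobalBB: atomic retargeted at global
}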
19243 unsigned PtrOpIdx) {
19244 Value *PtrOp = I->getOperand(PtrOpIdx);
19251 I->setOperand(PtrOpIdx, ASCast);
19263 ConstVal && ConstVal->isNullValue()) {
19293 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19301 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19316 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static void getCoopAtomicOperandsInfo(const CallBase &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
AMDGPUArgumentUsageInfo & getArgUsageInfo()
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
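A small sketch of the LLT constructors and queries listed above; the address space and bit widths are illustrative only.
LLT S32 = LLT::scalar(32);              // plain 32-bit scalar
LLT P1  = LLT::pointer(1, 64);          // 64-bit pointer in address space 1
TypeSize Bits = S32.getSizeInBits();    // 32 bits
LLT S16 = S32.changeElementSize(16);    // scalar case: simply a 16-bit scalar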
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
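A short, hypothetical example of pairing setAtomic with getOrInsertSyncScopeID, in the spirit of the atomic-expansion hooks listed further down; LI is an assumed LoadInst* and Ctx its LLVMContext, and the "agent" scope name is only an example.
SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
LI->setAtomic(AtomicOrdering::Monotonic, AgentSSID);   // plain load becomes a monotonic atomic load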
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
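A hedged example of the createRange/setMetadata pair; the bounds and the i32 load LI are hypothetical, and Ctx is the surrounding LLVMContext.
MDBuilder MDB(Ctx);
MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // loaded value lies in [0, 1024)
LI->setMetadata(LLVMContext::MD_range, Range);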
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
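A short sketch of these MVT helpers; the concrete types are arbitrary.
MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);  // v4i32
MVT I64   = MVT::getIntegerVT(64);          // i64
MVT Elt   = V4I32.getScalarType();          // i32
TypeSize StoreBytes = V4I32.getStoreSize(); // 16 bytes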
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
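A hedged sketch of loading a stack-passed value through a fixed frame object; the 4-byte size, the Offset variable, and the 32-bit private stack pointer type are assumptions, not taken from this file.
MachineFrameInfo &MFI = MF.getFrameInfo();
int FI = MFI.CreateFixedObject(4, Offset, /*IsImmutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);           // 32-bit stack pointer assumed
SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, FIN,
                          MachinePointerInfo::getFixedStack(MF, FI));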
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
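A hedged sketch of building a simple MMO; FI and the 4-byte size/alignment are hypothetical, and MF comes from the surrounding code.
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getFixedStack(MF, FI),
    MachineMemOperand::MOLoad, LLT::scalar(32), Align(4));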
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
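A minimal sketch of consuming a live-in physical register through the DAG; the specific SGPR and register class are illustrative only, and Chain/DL are assumed from the caller.
Register VReg = MF.addLiveIn(AMDGPU::SGPR0, &AMDGPU::SReg_32RegClass);
SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);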
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
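A sketch of the MachineInstrBuilder chaining style; BuildMI, the insertion point, TII, DL and the virtual registers are assumed from context, and the opcodes are only illustrative.
BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(0);                                  // DstReg = 0
BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::S_ADD_U32), SumReg)
    .addReg(LhsReg)
    .addReg(RhsReg);                             // SumReg = LhsReg + RhsReg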
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with another to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG) that are not handled by the SelectionDAG patterns.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store using a target-specific method.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load using a target-specific method.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known to never be any NaN; if SNaN is true, whether Op is known to never be a signaling NaN.
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion using a target-specific method.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
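A minimal sketch of the split-and-recombine pattern that helpers such as splitBinaryVectorOp follow, ignoring node flags; Op and DL are assumed from the caller.
auto [LoVT, HiVT] = DAG.GetSplitDestVTs(Op.getValueType());
auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
SDValue Lo = DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1);
SDValue Hi = DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, Op.getValueType(), Lo, Hi);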
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
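A hedged example combining getSetCC and getSelect; VT, X, A, B, DL and the TLI reference are assumed from the caller, and the compare-to-zero condition is arbitrary.
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Zero   = DAG.getConstant(0, DL, VT);
SDValue IsZero = DAG.getSetCC(DL, CCVT, X, Zero, ISD::SETEQ);
SDValue Res    = DAG.getSelect(DL, VT, IsZero, A, B);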
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
MachineFunctionAnalysisManager * getMFAM()
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
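A brief sketch of two equivalent known-bits queries; Ptr is an assumed SDValue, and proving 4-byte alignment is just an example.
KnownBits Known = DAG.computeKnownBits(Ptr);
bool Aligned4 = Known.countMinTrailingZeros() >= 2;                     // low 2 bits known zero
bool Same = DAG.MaskedValueIsZero(Ptr, APInt(Known.getBitWidth(), 3));  // same question asked via a mask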
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
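A small sketch of the StringSwitch idiom, e.g. for mapping register names as getRegisterByName-style hooks tend to do; the specific names here are illustrative only.
Register Reg = StringSwitch<Register>(RegName)
                   .Case("m0", AMDGPU::M0)
                   .Case("exec", AMDGPU::EXEC)
                   .Default(Register());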
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
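A hypothetical configuration snippet (not taken from this file) showing setTruncStoreAction together with the Promote/AddPromotedToType pairing, as it would appear inside a TargetLowering constructor.
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);   // expand this truncating store
setOperationAction(ISD::CTPOP, MVT::i16, Promote);     // i16 ctpop is promoted...
AddPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);     // ...and carried out on i32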
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
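As a brief illustration of how these address-space enumerators are typically consumed, here is a minimal sketch; the include path and the helper name are assumptions, not part of the listing above.
#include "llvm/Support/AMDGPUAddrSpace.h"

// Hypothetical helper: only FLAT and PRIVATE pointers can address scratch
// (private) memory, so treat everything else as non-aliasing with it.
static bool mayAliasPrivate(unsigned AS) {
  return AS == llvm::AMDGPUAS::FLAT_ADDRESS ||
         AS == llvm::AMDGPUAS::PRIVATE_ADDRESS;
}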
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating-point types.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
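For context, the inline-literal predicates above are typically used to decide whether an immediate needs an extra literal dword. A minimal sketch follows; the include path (relative to llvm/lib/Target/AMDGPU) and the wrapper name are assumptions.
#include "Utils/AMDGPUBaseInfo.h"  // assumed include path within lib/Target/AMDGPU

// Integer inline constants cover -16..64; the FP variants additionally accept
// +/-0.5, +/-1.0, +/-2.0, +/-4.0 and 1/(2*pi) when HasInv2Pi is set.
static bool fitsAsInlineConstant64(int64_t Imm, bool HasInv2Pi) {
  return llvm::AMDGPU::isInlinableLiteral64(Imm, HasInv2Pi);
}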
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith().
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum; the same as FMINNUM_IEEE and FMAXNUM_IEEE except when either operand is a signaling NaN.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
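A minimal sketch of how the condition-code helpers above compose (assuming the usual ISDOpcodes.h declarations); the wrapper is hypothetical.
#include "llvm/CodeGen/ISDOpcodes.h"

// Swapping the operands of (X < Y) yields (Y > X); signedness is preserved.
static llvm::ISD::CondCode swapCompare(llvm::ISD::CondCode CC) {
  llvm::ISD::CondCode Swapped = llvm::ISD::getSetCCSwappedOperands(CC);
  (void)llvm::ISD::isSignedIntSetCC(Swapped); // e.g. SETLT becomes SETGT, still signed
  return Swapped;
}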
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
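A minimal sketch using the IR-level matchers listed here (m_Shl, m_Value, m_One) to recognise a shift-by-one; the helper name is hypothetical.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Matches V == (X << 1) and, on success, binds X to the shifted operand.
static bool isShlByOne(llvm::Value *V, llvm::Value *&X) {
  using namespace llvm::PatternMatch;
  return match(V, m_Shl(m_Value(X), m_One()));
}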
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
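A minimal sketch of the BuildMI() builder interface, using the related overload that inserts before an iterator; the opcode and helper name are assumptions supplied by the caller.
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugLoc.h"

// Hypothetical helper: emit "<Opcode> DstReg, 0" before InsertPt and return
// the builder so callers can attach more operands or flags.
static llvm::MachineInstrBuilder
emitMoveImmZero(llvm::MachineBasicBlock &MBB,
                llvm::MachineBasicBlock::iterator InsertPt,
                const llvm::DebugLoc &DL, const llvm::TargetInstrInfo &TII,
                unsigned Opcode, llvm::Register DstReg) {
  return llvm::BuildMI(MBB, InsertPt, DL, TII.get(Opcode), DstReg).addImm(0);
}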
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
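A minimal sketch exercising the 64/32-bit splitting and power-of-two helpers listed above; the function is only for illustration.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void splitAndCheck(uint64_t V) {
  uint32_t Lo = llvm::Lo_32(V); // low 32 bits
  uint32_t Hi = llvm::Hi_32(V); // high 32 bits
  assert(((uint64_t(Hi) << 32) | Lo) == V);
  if (llvm::isPowerOf2_32(Lo))
    assert(llvm::Log2_32(Lo) == unsigned(llvm::countr_zero(Lo)));
}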
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
constexpr bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
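A minimal sketch contrasting dyn_cast<> (listed earlier) with cast<> and isa<> on an IR value; the helper name is hypothetical.
#include "llvm/IR/Constants.h"

// Returns the sign-extended value if V is a ConstantInt, otherwise 0.
static int64_t constOrZero(const llvm::Value *V) {
  if (const auto *CI = llvm::dyn_cast<llvm::ConstantInt>(V))
    return CI->getSExtValue();
  return 0; // isa<ConstantInt>(V) is false here; cast<> would assert.
}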
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
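A minimal sketch of the range wrappers above (all_of, is_contained) over a SmallVector; the predicate and helper name are illustrative only.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// True when every element is positive and Needle occurs somewhere in Vals.
static bool allPositiveAndContains(const llvm::SmallVectorImpl<int> &Vals,
                                   int Needle) {
  return llvm::all_of(Vals, [](int V) { return V > 0; }) &&
         llvm::is_contained(Vals, Needle);
}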
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
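A minimal sketch of the alignment helpers (alignTo, commonAlignment) together with maskTrailingOnes; the wrapper functions are assumptions for illustration.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

// Round a byte size up to the next multiple of A.
static uint64_t paddedSize(uint64_t Size, llvm::Align A) {
  return llvm::alignTo(Size, A);
}
// Alignment that still holds after stepping Offset bytes past an A-aligned base.
static llvm::Align alignAfterOffset(llvm::Align A, uint64_t Offset) {
  return llvm::commonAlignment(A, Offset);
}
// maskTrailingOnes<uint32_t>(5) == 0x1F: the 5 low bits set.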
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
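A minimal sketch running the EVT queries listed above on v4f32; the function exists only to show expected results.
#include "llvm/CodeGen/ValueTypes.h"

static void inspectV4F32(llvm::LLVMContext &Ctx) {
  llvm::EVT VT = llvm::EVT::getVectorVT(Ctx, llvm::MVT::f32, 4); // v4f32
  (void)VT.isVector();               // true
  (void)VT.getVectorNumElements();   // 4
  (void)VT.getScalarType();          // f32
  (void)VT.getSizeInBits();          // 128 bits
  (void)VT.changeTypeToInteger();    // v4i32
}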
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
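A minimal sketch combining the KnownBits operations above; the widths and helper name are assumptions.
#include "llvm/Support/KnownBits.h"

// Widen both operands to Bits, then compute what is provably known about
// their sum; countMinLeadingZeros() then bounds the result's magnitude.
static llvm::KnownBits widenAndAdd(const llvm::KnownBits &L,
                                   const llvm::KnownBits &R, unsigned Bits) {
  llvm::KnownBits Sum = llvm::KnownBits::add(L.zext(Bits), R.zext(Bits));
  (void)Sum.countMinLeadingZeros();
  return Sum;
}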
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
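A minimal sketch building the MachinePointerInfo variants above for a frame-index access 8 bytes into the slot; the offset and helper name are illustrative.
#include "llvm/CodeGen/MachineMemOperand.h"

static llvm::MachinePointerInfo stackSlotPlus8(llvm::MachineFunction &MF,
                                               int FI) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return PtrInfo.getWithOffset(8); // same base slot, +8 byte offset
}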
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const