#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
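// SITargetLowering constructor: the setOperationAction calls below declare,
// per value type, whether each SelectionDAG node is Legal, Custom-lowered,
// Expanded, or Promoted on this subtarget.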
153 if (Subtarget->has16BitInsts()) {
154 if (Subtarget->useRealTrue16Insts()) {
196 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
197 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
198 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
199 MVT::i1, MVT::v32i32},
203 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
204 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
205 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
206 MVT::i1, MVT::v32i32},
213 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
214 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
215 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
216 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
217 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
282 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
283 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
284 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
287 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
288 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
289 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
293 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
294 MVT::v3i16, MVT::v4i16, MVT::Other},
      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
315 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
316 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
317 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
318 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
319 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
320 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
321 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
322 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
425 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
426 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
429 if (Subtarget->hasPkMovB32()) {
450 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
451 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
460 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
487 if (Subtarget->hasSMemRealTime() ||
492 if (Subtarget->has16BitInsts()) {
499 if (Subtarget->hasMadMacF32Insts())
502 if (!Subtarget->hasBFI())
506 if (!Subtarget->hasBCNT(32))
509 if (!Subtarget->hasBCNT(64))
512 if (Subtarget->hasFFBH())
515 if (Subtarget->hasFFBL())
526 if (Subtarget->hasBFE())
530 if (Subtarget->hasIntClamp())
533 if (Subtarget->hasAddNoCarry())
      {ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
      {MVT::f32, MVT::f64}, Custom);
      {MVT::f32, MVT::f64}, Legal);
547 if (Subtarget->haveRoundOpsF64())
570 if (Subtarget->has16BitInsts()) {
619 ISD::FSIN, ISD::FROUND},
623 if (Subtarget->hasBF16TransInsts())
642 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
643 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
644 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
777 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
778 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
779 MVT::v32f16, MVT::v32bf16},
783 {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
789 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
793 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
797 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
798 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
806 if (Subtarget->hasVOP3PInsts()) {
      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
820 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
821 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
822 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
        {ISD::FMAXNUM, ISD::FMINNUM, ISD::FMINIMUMNUM, ISD::FMAXIMUMNUM},
        {MVT::v2f16, MVT::v4f16}, Custom);
  if (Subtarget->hasBF16PackedInsts()) {
    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
853 if (Subtarget->hasPackedFP32Ops()) {
857 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
864 if (Subtarget->has16BitInsts()) {
877 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
878 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
880 MVT::v32f16, MVT::v32bf16},
885 if (Subtarget->hasVectorMulU64())
887 else if (Subtarget->hasScalarSMulU64())
890 if (Subtarget->hasMad64_32())
893 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
896 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
        {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
901 if (Subtarget->hasMinimum3Maximum3F32())
904 if (Subtarget->hasMinimum3Maximum3PKF16()) {
908 if (!Subtarget->hasMinimum3Maximum3F16())
913 if (Subtarget->hasVOP3PInsts()) {
916 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
920 if (Subtarget->hasIntMinMax64())
925 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
926 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
931 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
932 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
933 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
934 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
938 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
939 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
940 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
941 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
956 if (Subtarget->hasBF16ConversionInsts()) {
961 if (Subtarget->hasBF16PackedInsts()) {
967 if (Subtarget->hasBF16TransInsts()) {
971 if (Subtarget->hasCvtPkF16F32Inst()) {
973 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1023 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1032 ISD::ATOMIC_CMP_SWAP,
1033 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
1035 ISD::ATOMIC_LOAD_ADD,
1036 ISD::ATOMIC_LOAD_SUB,
1037 ISD::ATOMIC_LOAD_AND,
1038 ISD::ATOMIC_LOAD_OR,
1039 ISD::ATOMIC_LOAD_XOR,
1040 ISD::ATOMIC_LOAD_NAND,
1041 ISD::ATOMIC_LOAD_MIN,
1042 ISD::ATOMIC_LOAD_MAX,
1043 ISD::ATOMIC_LOAD_UMIN,
1044 ISD::ATOMIC_LOAD_UMAX,
1045 ISD::ATOMIC_LOAD_FADD,
1046 ISD::ATOMIC_LOAD_FMIN,
1047 ISD::ATOMIC_LOAD_FMAX,
1048 ISD::ATOMIC_LOAD_UINC_WRAP,
1049 ISD::ATOMIC_LOAD_UDEC_WRAP,
1062 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
                                           EVT DestVT, EVT SrcVT) const {
      ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
         (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
        (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
                                           LLT DestTy, LLT SrcTy) const {
1088 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1089 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1091 SrcTy.getScalarSizeInBits() == 16 &&
  if (Subtarget->has16BitInsts()) {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
    return VT.isInteger() ? MVT::i32 : MVT::f32;
    return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
    if (Size == 16 && Subtarget->has16BitInsts())
      return (NumElts + 1) / 2;
    return NumElts * ((Size + 31) / 32);
    unsigned &NumIntermediates, MVT &RegisterVT) const {
    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1215 unsigned MaxNumLanes) {
1216 assert(MaxNumLanes != 0);
1220 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1231 unsigned MaxNumLanes) {
1237 assert(ST->getNumContainedTypes() == 2 &&
1238 ST->getContainedType(1)->isIntegerTy(32));
1252 return MVT::amdgpuBufferFatPointer;
1254 DL.getPointerSizeInBits(AS) == 192)
1255 return MVT::amdgpuBufferStridedPointer;
1264 DL.getPointerSizeInBits(AS) == 160) ||
1266 DL.getPointerSizeInBits(AS) == 192))
1273 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1274 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1275 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1277 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1278 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1279 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1280 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1281 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1283 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1284 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1285 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1286 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1287 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1289 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1290 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1291 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1292 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1293 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
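// getTgtMemIntrinsic: describe the memory behaviour (memVT, pointer value,
// load/store/volatile flags) of AMDGPU intrinsics so instruction selection
// can attach the right MachineMemOperand.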
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))
    if (RsrcIntr->IsImage) {
      Info.ptrVal = RsrcArg;
    bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
    if (RsrcIntr->IsImage) {
      unsigned MaxNumLanes = 4;
                                   std::numeric_limits<unsigned>::max());
    if (RsrcIntr->IsImage) {
    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;
1435 case Intrinsic::amdgcn_raw_buffer_load_lds:
1436 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1437 case Intrinsic::amdgcn_struct_buffer_load_lds:
1438 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1444 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1445 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1446 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1447 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1450 std::numeric_limits<unsigned>::max());
1460 case Intrinsic::amdgcn_ds_ordered_add:
1461 case Intrinsic::amdgcn_ds_ordered_swap: {
1474 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1475 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;
1483 case Intrinsic::amdgcn_ds_append:
1484 case Intrinsic::amdgcn_ds_consume: {
1497 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1498 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1499 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1504 Info.memVT = MVT::i64;
1510 case Intrinsic::amdgcn_global_atomic_csub: {
1519 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1520 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1521 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1524 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1527 ->getElementType(0));
1535 case Intrinsic::amdgcn_global_atomic_fmin_num:
1536 case Intrinsic::amdgcn_global_atomic_fmax_num:
1537 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1538 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1539 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1540 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1550 case Intrinsic::amdgcn_flat_load_monitor_b32:
1551 case Intrinsic::amdgcn_flat_load_monitor_b64:
1552 case Intrinsic::amdgcn_flat_load_monitor_b128:
1553 case Intrinsic::amdgcn_global_load_monitor_b32:
1554 case Intrinsic::amdgcn_global_load_monitor_b64:
1555 case Intrinsic::amdgcn_global_load_monitor_b128:
1556 case Intrinsic::amdgcn_cluster_load_b32:
1557 case Intrinsic::amdgcn_cluster_load_b64:
1558 case Intrinsic::amdgcn_cluster_load_b128:
1559 case Intrinsic::amdgcn_ds_load_tr6_b96:
1560 case Intrinsic::amdgcn_ds_load_tr4_b64:
1561 case Intrinsic::amdgcn_ds_load_tr8_b64:
1562 case Intrinsic::amdgcn_ds_load_tr16_b128:
1563 case Intrinsic::amdgcn_global_load_tr6_b96:
1564 case Intrinsic::amdgcn_global_load_tr4_b64:
1565 case Intrinsic::amdgcn_global_load_tr_b64:
1566 case Intrinsic::amdgcn_global_load_tr_b128:
1567 case Intrinsic::amdgcn_ds_read_tr4_b64:
1568 case Intrinsic::amdgcn_ds_read_tr6_b96:
1569 case Intrinsic::amdgcn_ds_read_tr8_b64:
1570 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1578 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1579 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1580 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1588 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1589 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1590 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1598 case Intrinsic::amdgcn_ds_gws_init:
1599 case Intrinsic::amdgcn_ds_gws_barrier:
1600 case Intrinsic::amdgcn_ds_gws_sema_v:
1601 case Intrinsic::amdgcn_ds_gws_sema_br:
1602 case Intrinsic::amdgcn_ds_gws_sema_p:
1603 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1613 Info.memVT = MVT::i32;
    Info.align = Align(4);
1617 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1623 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1624 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1625 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1626 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1627 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1628 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1629 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1630 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1637 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1638 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1639 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1640 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1647 case Intrinsic::amdgcn_load_to_lds:
1648 case Intrinsic::amdgcn_global_load_lds: {
1659 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1660 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1661 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1662 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1672 Info.memVT = MVT::i32;
    Info.align = Align(4);
1679 case Intrinsic::amdgcn_s_prefetch_data:
1680 case Intrinsic::amdgcn_flat_prefetch:
1681 case Intrinsic::amdgcn_global_prefetch: {
1696 case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                          Type *&AccessTy) const {
  switch (II->getIntrinsicID()) {
1715 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1716 case Intrinsic::amdgcn_cluster_load_b128:
1717 case Intrinsic::amdgcn_cluster_load_b64:
1718 case Intrinsic::amdgcn_cluster_load_b32:
1719 case Intrinsic::amdgcn_ds_append:
1720 case Intrinsic::amdgcn_ds_consume:
1721 case Intrinsic::amdgcn_ds_load_tr8_b64:
1722 case Intrinsic::amdgcn_ds_load_tr16_b128:
1723 case Intrinsic::amdgcn_ds_load_tr4_b64:
1724 case Intrinsic::amdgcn_ds_load_tr6_b96:
1725 case Intrinsic::amdgcn_ds_read_tr4_b64:
1726 case Intrinsic::amdgcn_ds_read_tr6_b96:
1727 case Intrinsic::amdgcn_ds_read_tr8_b64:
1728 case Intrinsic::amdgcn_ds_read_tr16_b64:
1729 case Intrinsic::amdgcn_ds_ordered_add:
1730 case Intrinsic::amdgcn_ds_ordered_swap:
1731 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1732 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1733 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1734 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1735 case Intrinsic::amdgcn_flat_load_monitor_b128:
1736 case Intrinsic::amdgcn_flat_load_monitor_b32:
1737 case Intrinsic::amdgcn_flat_load_monitor_b64:
1738 case Intrinsic::amdgcn_global_atomic_csub:
1739 case Intrinsic::amdgcn_global_atomic_fmax_num:
1740 case Intrinsic::amdgcn_global_atomic_fmin_num:
1741 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1742 case Intrinsic::amdgcn_global_load_monitor_b128:
1743 case Intrinsic::amdgcn_global_load_monitor_b32:
1744 case Intrinsic::amdgcn_global_load_monitor_b64:
1745 case Intrinsic::amdgcn_global_load_tr_b64:
1746 case Intrinsic::amdgcn_global_load_tr_b128:
1747 case Intrinsic::amdgcn_global_load_tr4_b64:
1748 case Intrinsic::amdgcn_global_load_tr6_b96:
1749 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1751 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);
1755 case Intrinsic::amdgcn_load_to_lds:
1756 case Intrinsic::amdgcn_global_load_lds:
1757 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1758 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1759 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1760 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1761 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1762 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1763 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1764 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);
  AccessTy = II->getType();
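// Addressing-mode legality checks: flat, global, scratch and MUBUF accesses
// each accept a different range of immediate offsets, so AM.BaseOffs is
// validated against the subtarget's instruction encodings.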
                                                  unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {
  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
1794 if (Subtarget->hasFlatGlobalInsts())
1797 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1810 return isLegalMUBUFAddressingMode(AM);
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1836 if (AM.HasBaseReg) {
1868 return isLegalMUBUFAddressingMode(AM);
1870 if (!Subtarget->hasScalarSubwordLoads()) {
    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1923 return Subtarget->enableFlatScratch()
1925 : isLegalMUBUFAddressingMode(AM);
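// allowsMisalignedMemoryAccessesImpl: per-address-space rules for unaligned
// DS/LDS, scratch and buffer accesses; *IsFast also reports the access width
// that is still fast at the given alignment.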
    unsigned Size, unsigned AddrSpace, Align Alignment,
  if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
        Alignment < RequiredAlignment)
      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
      RequiredAlignment = Align(4);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32
      if (!Subtarget->hasDS96AndDS128())
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32
      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
      RequiredAlignment = Align(8);
      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32
      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();
    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;
      *IsFast = AlignedBy4;
    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();
  if (!Subtarget->hasRelaxedBufferOOBMode() &&
  return Size >= 32 && Alignment >= Align(4);
    unsigned *IsFast) const {
                                          Alignment, Flags, IsFast);
    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&
  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
    unsigned DestAS) const {
2193 Subtarget->hasGloballyAddressableScratch()) {
                                                  unsigned Index) const {
  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
  auto [InputPtrReg, RC, ArgTy] =
      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
                                          const SDLoc &SL) const {
                                          const SDLoc &SL) const {
2292 std::optional<uint32_t> KnownSize =
2294 if (KnownSize.has_value())
2320 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(
  int64_t OffsetDiff = Offset - AlignDownOffset;
  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
    ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
                                          const SDLoc &SL) const {
    return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
      ExtType, SL, VA.getLocVT(), Chain, FIN,
2448 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2449 if (ConvertedVal == ArgValue)
2450 return ConvertedVal;
SDValue SITargetLowering::lowerWorkGroupId(
  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
    return ClusterIdXYZ;
  using namespace AMDGPU::Hwreg;
      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
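// getPreloadedValue: map a preloaded-value kind (workgroup IDs, cluster
// workgroup IDs and their maxima, etc.) to the ArgDescriptor and register
// class it was passed in, folding cases with statically known cluster
// dimensions to constants.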
SDValue SITargetLowering::getPreloadedValue(
  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;
  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =
  auto LoadConstant = [&](unsigned N) {
  if (Subtarget->hasArchitectedSGPRs() &&
    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Reg = &ClusterWorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[0] - 1);
    Reg = &ClusterWorkGroupMaxIDX;
    RC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[1] - 1);
    Reg = &ClusterWorkGroupMaxIDY;
    RC = &AMDGPU::SReg_32RegClass;
    return LoadConstant(ClusterDims.getDims()[2] - 1);
    Reg = &ClusterWorkGroupMaxIDZ;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &ClusterWorkGroupMaxFlatID;
    RC = &AMDGPU::SReg_32RegClass;
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
           "vector type argument should have been split");
    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2652 "unexpected vector split in ps argument type");
2666 Info->markPSInputAllocated(PSInputNum);
2668 Info->markPSInputEnabled(PSInputNum);
2684 if (Info.hasWorkItemIDX()) {
2690 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2694 if (Info.hasWorkItemIDY()) {
2695 assert(Info.hasWorkItemIDX());
2696 if (Subtarget->hasPackedTID()) {
2697 Info.setWorkItemIDY(
2700 unsigned Reg = AMDGPU::VGPR1;
2708 if (Info.hasWorkItemIDZ()) {
2709 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2710 if (Subtarget->hasPackedTID()) {
2711 Info.setWorkItemIDZ(
2714 unsigned Reg = AMDGPU::VGPR2;
  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];
                               unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];
2801 const unsigned Mask = 0x3ff;
2804 if (Info.hasWorkItemIDX()) {
2806 Info.setWorkItemIDX(Arg);
2809 if (Info.hasWorkItemIDY()) {
2811 Info.setWorkItemIDY(Arg);
2814 if (Info.hasWorkItemIDZ())
2826 const unsigned Mask = 0x3ff;
  auto &ArgInfo = Info.getArgInfo();
  if (Info.hasImplicitArgPtr())
  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())
    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
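// Kernel-argument preloading: walk the explicit kernel arguments marked
// inreg and assign them user SGPRs, inserting padding SGPRs so the register
// assignment matches the kernarg segment layout.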
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())
    unsigned ArgIdx = Arg.getArgNo();
    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;
      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
        InPreloadSequence = false;
          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {
      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
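// allocateSystemSGPRs: reserve the system SGPRs (workgroup IDs, workgroup
// info, private segment wave byte offset) requested by the function, working
// around the 16-user-SGPR init bug where necessary.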
                                          bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;
      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
  assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);
    HasStackObjects = true;
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
      Info.setScratchRSrcReg(ReservedBufferReg);
    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);
      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;
    Entry->addLiveIn(*I);
    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3266 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3267 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3269 if (!Subtarget->enableFlatScratch())
3274 !Subtarget->hasArchitectedSGPRs())
3275 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3276 !Info->hasWorkGroupIDZ());
3279 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3297 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3298 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3301 Info->markPSInputAllocated(0);
3302 Info->markPSInputEnabled(0);
3304 if (Subtarget->isAmdPalOS()) {
3313 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3314 if ((PsInputBits & 0x7F) == 0 ||
3315 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
    Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
3340 if (!Subtarget->enableFlatScratch())
3352 Info->setNumWaveDispatchSGPRs(
3354 Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
3357 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3362 if (IsWholeWaveFunc) {
3364 {MVT::i1, MVT::Other}, Chain);
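// Lower each formal argument: kernel arguments come from the kernarg segment
// (or from preloaded SGPRs), shader inputs from live-in VGPRs/SGPRs, and
// non-entry functions read stack-passed values from frame indexes.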
  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
    if (IsEntryFunc && VA.isMemLoc()) {
      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
        int64_t OffsetDiff = Offset - AlignDownOffset;
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
            Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                            TRI->getRegSizeInBits(*RC)));
            for (auto Reg : PreloadRegs) {
                                         PreloadRegs.size()),
          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);
              "hidden argument in kernel signature was not preloaded",
          lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                   Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;
      Val = convertABITypeToValueType(DAG, Val, VA, DL);
3573 Info->setBytesInStackArgArea(StackArgSize);
  return Chains.empty() ? Chain
                                          const Type *RetTy) const {
  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;
  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {
    SDValue Arg = OutVals[RealRVLocIdx];
                        ReadFirstLane, Arg);
  if (!Info->isEntryFunction()) {
      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))
3787 auto &ArgUsageInfo =
3789 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3817 const auto [OutgoingArg, ArgRC, ArgTy] =
3822 const auto [IncomingArg, IncomingArgRC, Ty] =
3824 assert(IncomingArgRC == ArgRC);
    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
      InputReg = getImplicitArgPtr(DAG, DL);
      std::optional<uint32_t> Id =
      if (Id.has_value()) {
    if (OutgoingArg->isRegister()) {
      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      unsigned SpecialArgOffset =
  auto [OutgoingArg, ArgRC, Ty] =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
    if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
                     : IncomingArgY ? *IncomingArgY
  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CallerPreserved)
  bool CCMatch = CallerCC == CalleeCC;
    if (Arg.hasByValAttr())
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
    if (!CCVA.isRegLoc())
    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {
  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {
    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
           "Haven't popped all the special args");
        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
        ChainCallSpecialArgs.push_back(Arg.Node);
    PushNodeOrTargetConstant(RequestedExecArg);
    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
                           "no additional args allowed if flags == 0");
      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");
      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
  bool IsSibCall = false;
        "unsupported call to variadic function ");
        "unsupported required tail call to function ");
                             Outs, OutVals, Ins, DAG);
         "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)
  auto *TRI = Subtarget->getRegisterInfo();
4238 if (!IsSibCall || IsChainCallConv) {
4239 if (!Subtarget->enableFlatScratch()) {
      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
  const unsigned NumSpecialInputs = RegsToPass.size();
  MVT PtrVT = MVT::i32;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
      int32_t Offset = LocMemOffset;
      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                          ? Flags.getNonZeroByValAlign()
      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,
            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
  if (!MemOpChains.empty())
    TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
  if (IsTailCall && !IsSibCall) {
  std::vector<SDValue> Ops({Chain});
    Ops.push_back(Callee);
    Ops.push_back(Callee);
    if (IsChainCallConv)
  for (auto &[Reg, Val] : RegsToPass)
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
                        MVT::Glue, GlueOps),
    Ops.push_back(InGlue);
  if (Info->isWholeWaveFunction())
  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);
4494 uint64_t CalleePopBytes = NumBytes;
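// Dynamic stack allocation: the stack grows upwards on AMDGPU and the stack
// pointer is swizzled, so sizes and alignments are scaled by the wavefront
// size before the SP is bumped.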
  EVT VT = Op.getValueType();
         "Stack grows upwards for AMDGPU");
  Chain = BaseAddr.getValue(1);
  if (Alignment > StackAlign) {
      << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;
  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
  if (Op.getValueType() != MVT::i32)
  assert(Op.getValueType() == MVT::i32);
4604 Op.getOperand(0), IntrinID, GetRoundBothImm);
4638 SDValue RoundModeTimesNumBits =
4658 TableEntry, EnumOffset);
4674 static_cast<uint32_t>(ConstMode->getZExtValue()),
4686 if (UseReducedTable) {
4692 SDValue RoundModeTimesNumBits =
4712 SDValue RoundModeTimesNumBits =
4721 NewMode = TruncTable;
4730 ReadFirstLaneID, NewMode);
4743 IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
  if (Subtarget->hasSafeSmemPrefetch())
  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
    return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
  if (Op.getValueType() != MVT::i64)
    Op.getOperand(0), IntrinID, ModeHwRegImm);
    Op.getOperand(0), IntrinID, TrapHwRegImm);
  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
  if (Op.getOperand(1).getValueType() != MVT::i64)
4838 ReadFirstLaneID, NewModeReg);
4840 ReadFirstLaneID, NewTrapReg);
4842 unsigned ModeHwReg =
4845 unsigned TrapHwReg =
4853 IntrinID, ModeHwRegImm, NewModeReg);
4856 IntrinID, TrapHwRegImm, NewTrapReg);
      .Case("m0", AMDGPU::M0)
      .Case("exec", AMDGPU::EXEC)
      .Case("exec_lo", AMDGPU::EXEC_LO)
      .Case("exec_hi", AMDGPU::EXEC_HI)
      .Case("flat_scratch", AMDGPU::FLAT_SCR)
      .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
      .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4876 if (!Subtarget->hasFlatScrRegister() &&
4877 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4879 "\" for subtarget."));
4884 case AMDGPU::EXEC_LO:
4885 case AMDGPU::EXEC_HI:
4886 case AMDGPU::FLAT_SCR_LO:
4887 case AMDGPU::FLAT_SCR_HI:
4892 case AMDGPU::FLAT_SCR:
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
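// Waterfall loop helpers: when an indirect VGPR index is divergent, a loop
// reads one lane's index with V_READFIRSTLANE, executes the access for every
// lane that shares that index, and masks those lanes out of EXEC until all
// lanes have been serviced.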
static std::pair<MachineBasicBlock *, MachineBasicBlock *>
  auto Next = std::next(I);
  MBB.addSuccessor(LoopBB);
  return std::pair(LoopBB, RemainderBB);
  auto I = MI.getIterator();
  auto E = std::next(I);
    Src->setIsKill(false);
    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
    Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
      MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
  MRI.setSimpleHint(NewExec, CondReg);
  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;
      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {
  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
                              InitResultReg, DstReg, PhiReg, TmpExec,
                              Offset, UseGPRIdxMode, SGPRIdxReg);
  LoopBB->removeSuccessor(RemainderBB);
  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>
  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
    return std::pair(AMDGPU::sub0, Offset);
    Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
                          UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MI.eraseFromParent();
  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);
    MI.eraseFromParent();
  Register PhiReg = MRI.createVirtualRegister(VecRC);
                          UseGPRIdxMode, SGPRIdxReg);
  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
  MI.eraseFromParent();
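// Expand 64-bit scalar add/sub pseudos: use S_ADD_U64/S_SUB_U64 when the
// subtarget has them, otherwise split into a 32-bit add/sub plus a carry op
// over REG_SEQUENCE halves.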
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5464 case AMDGPU::S_MIN_U32:
5465 return std::numeric_limits<uint32_t>::max();
5466 case AMDGPU::S_MIN_I32:
5467 return std::numeric_limits<int32_t>::max();
5468 case AMDGPU::S_MAX_U32:
5469 return std::numeric_limits<uint32_t>::min();
5470 case AMDGPU::S_MAX_I32:
5471 return std::numeric_limits<int32_t>::min();
5472 case AMDGPU::S_ADD_I32:
5473 case AMDGPU::S_SUB_I32:
5474 case AMDGPU::S_OR_B32:
5475 case AMDGPU::S_XOR_B32:
5476 return std::numeric_limits<uint32_t>::min();
5477 case AMDGPU::S_AND_B32:
5478 return std::numeric_limits<uint32_t>::max();
5481 "Unexpected opcode in getIdentityValueFor32BitWaveReduction");
5487 case AMDGPU::V_CMP_LT_U64_e64:
5488 return std::numeric_limits<uint64_t>::max();
5489 case AMDGPU::V_CMP_LT_I64_e64:
5490 return std::numeric_limits<int64_t>::max();
5491 case AMDGPU::V_CMP_GT_U64_e64:
5492 return std::numeric_limits<uint64_t>::min();
5493 case AMDGPU::V_CMP_GT_I64_e64:
5494 return std::numeric_limits<int64_t>::min();
5495 case AMDGPU::S_ADD_U64_PSEUDO:
5496 case AMDGPU::S_SUB_U64_PSEUDO:
5497 case AMDGPU::S_OR_B64:
5498 case AMDGPU::S_XOR_B64:
5499 return std::numeric_limits<uint64_t>::min();
5500 case AMDGPU::S_AND_B64:
5501 return std::numeric_limits<uint64_t>::max();
5504 "Unexpected opcode in getIdentityValueFor64BitWaveReduction");
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32;
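// lowerWaveReduce: with a uniform (SGPR) source the reduction folds to a
// closed form (min/max/and/or of identical lanes is the value itself, and
// add/sub/xor scale with the active-lane count); a divergent source falls
// through to the per-lane ComputeLoop emitted below.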
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5532 case AMDGPU::S_MIN_U32:
5533 case AMDGPU::S_MIN_I32:
5534 case AMDGPU::S_MAX_U32:
5535 case AMDGPU::S_MAX_I32:
5536 case AMDGPU::S_AND_B32:
5537 case AMDGPU::S_OR_B32: {
5543 case AMDGPU::V_CMP_LT_U64_e64:
5544 case AMDGPU::V_CMP_LT_I64_e64:
5545 case AMDGPU::V_CMP_GT_U64_e64:
5546 case AMDGPU::V_CMP_GT_I64_e64:
5547 case AMDGPU::S_AND_B64:
5548 case AMDGPU::S_OR_B64: {
5554 case AMDGPU::S_XOR_B32:
5555 case AMDGPU::S_XOR_B64:
5556 case AMDGPU::S_ADD_I32:
5557 case AMDGPU::S_ADD_U64_PSEUDO:
5558 case AMDGPU::S_SUB_I32:
5559 case AMDGPU::S_SUB_U64_PSEUDO: {
      Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5564 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5566 bool IsWave32 = ST.isWave32();
5567 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5568 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5569 unsigned BitCountOpc =
5570 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5574 auto NewAccumulator =
5579 case AMDGPU::S_XOR_B32:
5580 case AMDGPU::S_XOR_B64: {
5586 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          .addReg(NewAccumulator->getOperand(0).getReg())
      if (Opc == AMDGPU::S_XOR_B32) {
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
          MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
        BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
    case AMDGPU::S_SUB_I32: {
      Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
          .addReg(NewAccumulator->getOperand(0).getReg());
    case AMDGPU::S_ADD_I32: {
          .addReg(NewAccumulator->getOperand(0).getReg());
5645 case AMDGPU::S_ADD_U64_PSEUDO:
5646 case AMDGPU::S_SUB_U64_PSEUDO: {
      Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
          TRI->getSubRegisterClass(Src1RC, AMDGPU::sub0);
          MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub0, Src1SubRC);
          MI, MRI, MI.getOperand(1), Src1RC, AMDGPU::sub1, Src1SubRC);
      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
            .addReg(NewAccumulator->getOperand(0).getReg())
      Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                               : NewAccumulator->getOperand(0).getReg();
      Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
      BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), DstReg)
  Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
  Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
  Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
  Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
  Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), IdentityValReg)
    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO), IdentityValReg)
  I = ComputeLoop->begin();
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
  I = ComputeLoop->end();
  unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    Register LaneValReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub0, SrcSubRC);
        MI, MRI, MI.getOperand(1), SrcRC, AMDGPU::sub1, SrcSubRC);
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(TargetOpcode::REG_SEQUENCE), LaneValReg)
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_XOR_B64: {
              .addReg(LaneValue->getOperand(0).getReg())
    case AMDGPU::V_CMP_GT_I64_e64:
    case AMDGPU::V_CMP_GT_U64_e64:
    case AMDGPU::V_CMP_LT_I64_e64:
    case AMDGPU::V_CMP_LT_U64_e64: {
      Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
          MRI.createVirtualRegister(WaveMaskRegClass);
          TRI->getSubRegisterClass(VregClass, AMDGPU::sub0);
      Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
                          VregClass, AMDGPU::sub0, VSubRegClass);
                          VregClass, AMDGPU::sub1, VSubRegClass);
      BuildMI(*ComputeLoop, I, DL, TII->get(TargetOpcode::REG_SEQUENCE),
          .addReg(LaneValue->getOperand(0).getReg())
          .addReg(AccumulatorVReg);
      unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
      BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
      NewAccumulator = BuildMI(*ComputeLoop, I, DL,
                               TII->get(AMDGPU::S_CSELECT_B64), DstReg)
                           .addReg(LaneValue->getOperand(0).getReg())
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());
  unsigned BITSETOpc =
      IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
  BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
  ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
  unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
      .addReg(NewActiveBitsReg)
  BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
  MI.eraseFromParent();
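// EmitInstrWithCustomInserter: expand the remaining pseudo instructions
// (wave reductions, 64-bit carry arithmetic, M0 initialisation, indirect
// register access, and query pseudos) after instruction selection.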
  switch (MI.getOpcode()) {
5916 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
5918 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
5920 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
5922 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
5924 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
5926 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
5928 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
5930 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
5932 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
5934 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
5936 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
5938 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
5940 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
5942 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
5944 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
5946 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
5948 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
5950 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
5952 case AMDGPU::S_UADDO_PSEUDO:
5953 case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       : AMDGPU::S_SUB_U32;
5969 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5972 MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
    // ...
  }
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    // ...
    if (ST.hasAddSubU64Insts()) {
      // ...
          TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
      // ...
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
      // ...
    }
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      // ...
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
      // ...
    }
    const auto *CarryRC = TRI->getWaveMaskRegClass();
    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
    // ... : &AMDGPU::VReg_64RegClass;
    // ... : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
    // ... = TII->buildExtractSubRegOrImm(MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    // ... = TII->buildExtractSubRegOrImm(MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    // ... = TII->buildExtractSubRegOrImm(MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    // ... = TII->buildExtractSubRegOrImm(MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    // ...
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    // ...
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
    // ...
  }
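  // A minimal standalone sketch (not from this file; names are illustrative
  // only) of the decomposition the LoHalf/HiHalf pair above models: a 64-bit
  // add done as a 32-bit add that produces a carry (V_ADD_CO_U32) followed by
  // an add-with-carry on the high half (V_ADDC_U32):
  //
  //   static std::pair<uint32_t, uint32_t>
  //   add64ViaHalves(uint32_t Lo0, uint32_t Hi0, uint32_t Lo1, uint32_t Hi1) {
  //     uint32_t Lo = Lo0 + Lo1;
  //     uint32_t Carry = Lo < Lo0;        // carry-out of the low half
  //     uint32_t Hi = Hi0 + Hi1 + Carry;  // consumed by the high half
  //     return {Lo, Hi};
  //   }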
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
    // ...
    Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
    // ...
    Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    // ...
    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
    // ...
    BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    // ...
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
        // ...
      }
      // ...
          TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
      // ... MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
      // ... MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
      Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      // ...
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
      // ...
    }
    // ...
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
    // ...
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    // ...
    MI.eraseFromParent();
    // ...
  }
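  // Note (added commentary, hedged): the S_ADD_CO/S_SUB_CO path above first
  // copies any VGPR operands into SGPRs with V_READFIRSTLANE_B32, reduces the
  // incoming carry mask to a single SCC-visible condition (for wave64 either
  // with a 64-bit scalar compare or, lacking one, by OR-ing the two halves),
  // and only then issues S_ADDC_U32 / S_SUBB_U32, which consume SCC as the
  // carry-in; the carry-out is rematerialized with S_CSELECT_B32/B64.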
  case AMDGPU::SI_INIT_M0: {
    // ...
        TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
    // ...
        TII->get(AMDGPU::S_CMP_EQ_U32))
    // ...
  }
  case AMDGPU::GET_GROUPSTATICSIZE: {
    // ...
        .add(MI.getOperand(0))
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::GET_SHADERCYCLESHILO: {
    // ...
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    // ...
    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ...
        .add(MI.getOperand(0))
    // ...
    MI.eraseFromParent();
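    // Added commentary (hedged): the three S_GETREG reads above (HI, LO, HI
    // again) guard against a torn 64-bit read of the free-running cycle
    // counter. Roughly, if the two HI samples match, the LO sample is
    // consistent with them and is used as-is; if they differ, the counter
    // rolled over between reads and the code falls back to the second HI
    // sample with a conservative low half.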
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
    // ...
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
    // ...
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
    // ...
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // ...
    Register SrcCond = MI.getOperand(3).getReg();
    // ...
    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getWaveMaskRegClass();
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
    // ... : &AMDGPU::VReg_64RegClass;
    // ... : &AMDGPU::VReg_64RegClass;
    const TargetRegisterClass *Src0SubRC =
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
    const TargetRegisterClass *Src1SubRC =
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
    // ... MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
    // ... MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
    // ... MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
    // ... MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::SI_BR_UNDEF: {
    // ...
        .add(MI.getOperand(0));
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    // ...
  }
  case AMDGPU::SI_CALL_ISEL: {
    // ...
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    // ...
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    // ...
    unsigned Opc = MI.getOpcode();
    // ...
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      // ...
      NeedClampOperand = true;
    }
    // ...
    if (TII->isVOP3(*I)) {
      // ...
    }
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
      // ...
    TII->legalizeOperands(*I);
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    // ...
    TII->legalizeOperands(MI);
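    // Added commentary (hedged): when pseudoToMCOpcode() reports that the
    // 32-bit VOP2 encoding does not exist on this subtarget, the add/sub with
    // carry-out is rebuilt in its VOP3 form, and the VOP3 encoding requires an
    // explicit clamp operand; that is what NeedClampOperand tracks above.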
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
    // ...
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
    // ...
  case AMDGPU::S_SETREG_B32: {
    // ...
    const unsigned SetMask = WidthMask << Offset;
    // ...
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
    // ...
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    // ...
      SetRoundOp = AMDGPU::S_ROUND_MODE;
    // ...
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    // ...
    if (SetRoundOp || SetDenormOp) {
      // ...
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
        // ...
        MI.eraseFromParent();
        // ...
      }
    }
    // ...
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
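    // Added commentary (hedged): when the write targets only the FP
    // rounding/denormal bits of the MODE register and the value is a known
    // immediate, the generic S_SETREG_B32 is replaced with the cheaper
    // S_ROUND_MODE / S_DENORM_MODE instructions; otherwise the instruction is
    // retagged as S_SETREG_B32_mode so later passes model it as a mode-only
    // write rather than an arbitrary side effect.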
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    // ...
    MI.setDesc(TII->get(AMDGPU::COPY));
    // ...
  case AMDGPU::ENDPGM_TRAP: {
    // ...
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    // ...
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    // ...
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
    // ...
  }
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    // ...
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    // ...
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  // ...
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  // ...
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
  // ...
  return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  // ...
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  // ...
  switch (Ty.getScalarSizeInBits()) {
  // ...
  }
  // ...
  if (Ty.getScalarSizeInBits() == 16)
    // ...
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
    // ...
  EVT VT = N->getValueType(0);
  // ...
  return Subtarget->hasMadMacF32Insts() &&
  // ...
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
    // ...
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  // ...
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  // ...
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  // ...
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
  // ...
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  // ...
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
  // ...
      : std::pair(Op0, Op0);
  // ...
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
  // ...
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
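  // Added commentary with a hedged SelectionDAG-style sketch: the split
  // helpers above all legalize an operation on a wide vector type by cutting
  // the operands into a low and a high half, applying the same opcode to each
  // half, and concatenating the results. Schematically (helper names and the
  // exact split points are simplified and not from this file):
  //
  //   // auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  //   // auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  //   // SDValue LoRes = DAG.getNode(Opc, SL, LoVT, Lo0, Lo1, Op->getFlags());
  //   // SDValue HiRes = DAG.getNode(Opc, SL, HiVT, Hi0, Hi1, Op->getFlags());
  //   // return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoRes, HiRes);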
  switch (Op.getOpcode()) {
  // ...
  case ISD::BRCOND:
    return LowerBRCOND(Op, DAG);
  case ISD::RETURNADDR:
    return LowerRETURNADDR(Op, DAG);
  case ISD::LOAD: {
    // ...
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    // ...
  }
  case ISD::FSQRT: {
    EVT VT = Op.getValueType();
    if (VT == MVT::f32)
      return lowerFSQRTF32(Op, DAG);
    if (VT == MVT::f64)
      return lowerFSQRTF64(Op, DAG);
    // ...
  }
  case ISD::FSIN:
  case ISD::FCOS:
    return LowerTrig(Op, DAG);
  case ISD::SELECT:
    return LowerSELECT(Op, DAG);
  case ISD::FDIV:
    return LowerFDIV(Op, DAG);
  case ISD::FFREXP:
    return LowerFFREXP(Op, DAG);
  case ISD::ATOMIC_CMP_SWAP:
    return LowerATOMIC_CMP_SWAP(Op, DAG);
  case ISD::STORE:
    return LowerSTORE(Op, DAG);
  case ISD::GlobalAddress: {
    // ...
    return LowerGlobalAddress(MFI, Op, DAG);
  }
  case ISD::INTRINSIC_WO_CHAIN:
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::INTRINSIC_W_CHAIN:
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
  case ISD::INTRINSIC_VOID:
    return LowerINTRINSIC_VOID(Op, DAG);
  case ISD::ADDRSPACECAST:
    return lowerADDRSPACECAST(Op, DAG);
  case ISD::INSERT_SUBVECTOR:
    return lowerINSERT_SUBVECTOR(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:
    return lowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT:
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::VECTOR_SHUFFLE:
    return lowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:
    return lowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::BUILD_VECTOR:
    return lowerBUILD_VECTOR(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);
  case ISD::TRAP:
    return lowerTRAP(Op, DAG);
  case ISD::DEBUGTRAP:
    return lowerDEBUGTRAP(Op, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return lowerFMINNUM_FMAXNUM(Op, DAG);
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
  case ISD::FLDEXP:
    return lowerFLDEXP(Op, DAG);
  // ...
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    // ...
  case ISD::FCOPYSIGN:
    return lowerFCOPYSIGN(Op, DAG);
  case ISD::MUL:
    return lowerMUL(Op, DAG);
  case ISD::SMULO:
  case ISD::UMULO:
    return lowerXMULO(Op, DAG);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return lowerXMUL_LOHI(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC:
    // ...
  case ISD::STACKSAVE:
    // ...
  case ISD::SET_ROUNDING:
    // ...
  case ISD::FP_EXTEND:
    // ...
  case ISD::GET_FPENV:
    // ...
  case ISD::SET_FPENV:
    // ...
  EVT FittingLoadVT = LoadVT;
  // ...
    return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
  // ...
  return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
}

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              // ...
                                              bool IsIntrinsic) const {
  // ...
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  // ...
  EVT EquivLoadVT = LoadVT;
  // ...
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
  // ...
      M->getMemoryVT(), M->getMemOperand());
  // ...
}

// ...
  EVT LoadVT = M->getValueType(0);
  // ...
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
  // ...
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
                                      // ...
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  // ...
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
  // ...
      M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  // ...
  unsigned CondCode = N->getConstantOperandVal(3);
  // ...
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
    // ...
  }
  // ...
  EVT VT = N->getValueType(0);
  // ...
  unsigned CondCode = N->getConstantOperandVal(3);
  // ...
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
    Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
    Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
  }
  // ...
  EVT VT = N->getValueType(0);
  // ...
  Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
  Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
  // ...
    Exec = AMDGPU::EXEC_LO;
  // ...
    Exec = AMDGPU::EXEC;
  // ...
  EVT VT = N->getValueType(0);
  // ...
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
      // ...
  // ...
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
    // ...
  case Intrinsic::amdgcn_writelane:
    // ...
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
    // ...
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
    // ...
  // ...
    std::reverse(Operands.begin(), Operands.end());
  // ...
    if (SDNode *GL = N->getGluedNode()) {
      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
      GL = GL->getOperand(0).getNode();
      // ...
    }
  // ...
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16)
      Src2 = N->getOperand(3);
  }

  if (ValSize == SplitSize) {
    // ...
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
      // ...
    }
    if (IID == Intrinsic::amdgcn_writelane) {
      // ...
    }
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    // ...
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  }

  if (ValSize % SplitSize != 0)
    // ...
  // ...
  EVT VT = N->getValueType(0);
  // ...
  unsigned NumOperands = N->getNumOperands();
  // ...
  SDNode *GL = N->getGluedNode();
  // ...
  for (unsigned i = 0; i != NE; ++i) {
    for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
         // ...
      SDValue Operand = N->getOperand(j);
      // ...
      Operands[j] = Operand;
      // ...
    }
    // ...
    Operands[NumOperands - 1] =
        DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
                    // ...
  }
  // ...
  if (SplitSize == 32) {
    // ...
    return unrollLaneOp(LaneOp.getNode());
  }
  // ...
  unsigned SubVecNumElt =
      // ...
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    // ...
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
        // ...
    if (IID == Intrinsic::amdgcn_writelane)
      // ...
    // ...
        IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16
            ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
            : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
    EltIdx += SubVecNumElt;
  }
  // ...
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    // ...
  if (IID == Intrinsic::amdgcn_writelane)
    // ...
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
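  // Added commentary (hedged): lane intrinsics operate on 32-bit lane values
  // (or 64-bit pieces when DPP on 64-bit ALU operands is available, which is
  // what SplitSize tracks). Wider values are bitcast to a vector of i32,
  // the intrinsic is applied piecewise to SplitSize-wide subvectors in the
  // loop above, and the pieces are reassembled and bitcast back to the
  // original type.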
  switch (N->getOpcode()) {
  // ...
    unsigned IID = N->getConstantOperandVal(0);
    // ...
    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
      // ...
    case Intrinsic::amdgcn_cvt_pkrtz: {
      // ...
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
      // ...
    }
    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {
      // ...
      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
        // ...
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
        // ...
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
        // ...
      EVT VT = N->getValueType(0);
      // ...
      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
      // ...
    }
    case Intrinsic::amdgcn_s_buffer_load: {
      // ...
      if (!Subtarget->hasScalarSubwordLoads())
        // ...
      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
      // ...
      if (!Offset->isDivergent()) {
        // ...
      }
      // ...
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
      // ...
    }
    case Intrinsic::amdgcn_dead: {
      for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
        // ...
    }
  // ...
    for (unsigned I = 0; I < Res.getNumOperands(); I++) {
      Results.push_back(Res.getOperand(I));
      // ...
    }
    // ...
    Results.push_back(Res.getValue(1));
  // ...
    EVT VT = N->getValueType(0);
    // ...
    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      // ...
      SelectVT = MVT::i32;
    }
    // ...
    if (NewVT != SelectVT)
      // ...
  // ...
    if (N->getValueType(0) != MVT::v2f16)
      // ...
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
  // ...
    if (N->getValueType(0) != MVT::v2f16)
      // ...
    SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
  // ...
    if (N->getValueType(0) != MVT::f16)
      // ...
  // ...
    if (U.get() != Value)
      // ...
    if (U.getUser()->getOpcode() == Opcode)
      // ...
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  // ...
  case Intrinsic::amdgcn_if:
    // ...
  case Intrinsic::amdgcn_else:
    // ...
  case Intrinsic::amdgcn_loop:
    // ...
  case Intrinsic::amdgcn_end_cf:
    // ...
}

// ...
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
    // ...
  SDNode *Intr = BRCOND.getOperand(1).getNode();
  // ...
    Intr = LHS.getNode();
  // ...
  assert(BR && "brcond missing unconditional branch user");
  // ...
  unsigned CFNode = isCFIntrinsic(Intr);
  // ...
  Ops.push_back(Target);
  // ...
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
  MVT VT = Op.getSimpleValueType();
  // ...
  if (Op.getConstantOperandVal(0) != 0)
    // ...
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // ...
  if (Info->isEntryFunction())
    // ...
  return Op.getValueType().bitsLE(VT)
  // ...
  EVT DstVT = Op.getValueType();
  // ...
  unsigned Opc = Op.getOpcode();
  // ...
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  // ...
  assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
  // ...
  return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  // ...
  if (DstVT == MVT::f16) {
    // ...
    if (!Subtarget->has16BitInsts()) {
      // ...
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    }
    if (Op->getFlags().hasApproximateFuncs()) {
      // ...
      return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
    }
    // ...
  }
  assert(/* ... */ "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
  // ...
  EVT VT = Op.getValueType();
  // ...
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  // ...
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      // ...

SDValue
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
                                               // ...
  EVT VT = Op.getValueType();
  // ...
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  // ...
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
      // ...

// ...
  EVT VT = Op.getValueType();
  // ...
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
  // ...
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
  // ...
  EVT VT = Op.getValueType();
  // ...
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
    // ...
  // ...
      {Op.getOperand(0), Op.getOperand(1), TruncExp});
  // ...
  return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
  switch (Op->getOpcode()) {
  // ...
                                             DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
  // ...
      : Op->getOperand(0).getValueType();
  // ...
  if (DCI.isBeforeLegalizeOps() ||
      // ...
  auto &DAG = DCI.DAG;
  // ...
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
  // ...
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  // ...
  if (MagVT == SignVT)
    // ...
  SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
  // ...
  SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
  // ...
  EVT VT = Op.getValueType();
  // ...
  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
  // ...
  if (Op->isDivergent())
    // ...
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  // ...
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
  // ...
  const APInt &C = RHSC->getAPIntValue();
  // ...
  if (C.isPowerOf2()) {
    // ...
    bool UseArithShift = isSigned && !C.isMinSignedValue();
    // ...
  }
  // ...
  if (Op->isDivergent()) {
    // ...
  }
  if (Subtarget->hasSMulHi()) {
    // ...
  }
  // ...
  if (!Subtarget->isTrapHandlerEnabled() ||
      // ...
    return lowerTrapEndpgm(Op, DAG);
  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);

SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             // ...
                                             ImplicitParameter Param) const {
  // ...
}

// ...
      loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  // ...
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // ...
  if (UserSGPR == AMDGPU::NoRegister) {
    // ...
  }
  // ...
  if (Subtarget->hasPrivEnabledTrap2NopBug())
    // ...
  if (!Subtarget->isTrapHandlerEnabled() ||
      // ...
      "debugtrap handler not supported",
      // ...
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                             // ...
  if (Subtarget->hasApertureRegs()) {
    // ...
        ? AMDGPU::SRC_SHARED_BASE
        : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    // ...
  }
  // ...
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  // ...
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // ...
  if (UserSGPR == AMDGPU::NoRegister) {
    // ...
  }
  // ...
  return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  // ...
  const AMDGPUTargetMachine &TM =
      // ...
  unsigned DestAS, SrcAS;
  // ...
  bool IsNonNull = false;
  // ...
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
  // ...
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
  // ...
      Subtarget->hasGloballyAddressableScratch()) {
    // ...
        AMDGPU::S_MOV_B32, SL, MVT::i32,
        DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
    // ...
    unsigned NullVal = TM.getNullPointerValue(DestAS);
    // ...
      Subtarget->hasGloballyAddressableScratch()) {
    // ...
    if (Subtarget->isWave64())
      // ...
        57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
    // ...
    CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
    // ...
        AMDGPU::S_MOV_B64, SL, MVT::i64,
        DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
    // ...
    CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
    // ...
    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
    // ...
    CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
    // ...
    unsigned NullVal = TM.getNullPointerValue(SrcAS);
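    // Added commentary with a hedged sketch: casting a 32-bit LDS/private
    // pointer to a 64-bit flat pointer pairs the 32-bit offset with the
    // segment's aperture base in the high half, roughly
    //
    //   // uint64_t Flat = ((uint64_t)ApertureHi << 32) | (uint32_t)Ptr;
    //
    // and, unless the cast is known non-null, the result is selected against
    // the target's flat null when the source equals the segment's null value.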
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
        // ...
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
  }
  // ...
      Src.getValueType() == MVT::i64)
    // ...
  EVT InsVT = Ins.getValueType();
  // ...
  assert(InsNumElts % 2 == 0 && "expect legal vector types");
  // ...
  EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                 // ...
                                 MVT::i32, InsNumElts / 2);
  // ...
  Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
  Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
  // ...
  for (unsigned I = 0; I != InsNumElts / 2; ++I) {
    // ...
    if (InsNumElts == 2) {
      // ...
    }
  }
  return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
  // ...
  for (unsigned I = 0; I != InsNumElts; ++I) {
    // ...
  }
  // ...
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    // ...
    SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
    SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
    // ...
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
    // ...
        DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
    // ...
    InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
    // ...
        : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
    // ...
  }
  // ...
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  // ...
  return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
  // ...
  EVT ResultVT = Op.getValueType();
  // ...
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
    // ...
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    // ...
    if (VecSize == 128) {
      // ...
    } else if (VecSize == 256) {
      // ...
      for (unsigned P = 0; P < 4; ++P) {
        // ...
      }
      // ...
          Parts[0], Parts[1]));
      // ...
          Parts[2], Parts[3]));
      // ...
    } else {
      // ...
      for (unsigned P = 0; P < 8; ++P) {
        // ...
      }
      // ...
          Parts[0], Parts[1], Parts[2], Parts[3]));
      // ...
          Parts[4], Parts[5], Parts[6], Parts[7]));
    }
  }
  // ...
  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  // ...
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    // ...
    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
  }
    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
  // ...
    return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
           !(Mask[Elt + 1] & 1);
  // ...
  EVT ResultVT = Op.getValueType();
  // ...
  const int NewSrcNumElts = 2;
  // ...
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
  // ...
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
  // ...
    if (ShouldUseConsecutiveExtract &&
        // ...
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      // ...
      if (Idx0 >= SrcNumElts) {
        // ...
      }
      if (Idx1 >= SrcNumElts) {
        // ...
      }
      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
      // ...
      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;
      // ...
      if (SubVec0 != SubVec1) {
        NewMaskIdx1 += NewSrcNumElts;
        // ...
      }
      // ...
          {NewMaskIdx0, NewMaskIdx1});
      // ...
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
      // ...
  // ...
  EVT ResultVT = Op.getValueType();
  // ...
  EVT VT = Op.getValueType();
  // ...
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
    // ...
      return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
    // ...
    return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
    // ...
    return DAG.getNode(ISD::BITCAST, SL, VT, Or);
  }
  // ...
  for (unsigned P = 0; P < NumParts; ++P) {
    // ...
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
    // ...
  }
  // ...
  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
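  // Added commentary with a hedged sketch: building a two-element 16-bit
  // vector without packed-math instructions packs both halves into one 32-bit
  // register and bitcasts the result, roughly
  //
  //   // uint32_t Packed = (uint32_t)(uint16_t)Lo | ((uint32_t)(uint16_t)Hi << 16);
  //
  // which is why the paths above end in zero-extend / shift / OR followed by a
  // BITCAST back to the v2i16/v2f16/v2bf16 result type.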
  if (!Subtarget->isAmdHsaOS())
    // ...
  EVT PtrVT = Op.getValueType();
  // ...
  const GlobalValue *GV = GSD->getGlobal();
  // ...
  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
  // ...
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
      // ...
    }
  }
  // ...
  MachinePointerInfo PtrInfo =
      // ...
  SDValue Param = lowerKernargMemParameter(
      // ...
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
  // ...
      "intrinsic not supported on subtarget", DL.getDebugLoc()));
  // ...
  unsigned NumElts = Elts.size();
  // ...
  if (NumElts <= 12) {
    // ...
  }
  // ...
  for (unsigned i = 0; i < Elts.size(); ++i) {
    // ...
  }
  // ...
  for (unsigned i = Elts.size(); i < NumElts; ++i)
    // ...
  EVT SrcVT = Src.getValueType();
  // ...
                               bool Unpacked, bool IsD16, int DMaskPop,
                               int NumVDataDwords, bool IsAtomicPacked16Bit,
                               // ...
  EVT ReqRetVT = ResultTypes[0];
  // ...
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          // ...
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
  // ...
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    // ...
  }
  // ...
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    // ... NumDataDwords - MaskPopDwords);
  // ...
  EVT LegalReqRetVT = ReqRetVT;
  // ...
  if (!Data.getValueType().isInteger())
    // ... Data.getValueType().changeTypeToInteger(), Data);
  // ...
  if (Result->getNumValues() == 1)
    // ...
                         SDValue *LWE, bool &IsTexFail) {
  // ...
                            unsigned DimIdx, unsigned EndIdx,
                            unsigned NumGradients) {
  // ...
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    // ...
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // ...
    }
  }
  // ...
      !Op.getNode()->hasAnyUseOfValue(0))
    // ...
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      // ...
  ResultTypes.erase(&ResultTypes[0]);
  // ...
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;
  // ...
  const unsigned ArgOffset = WithChain ? 2 : 1;
  // ...
  unsigned DMaskLanes = 0;
  // ...
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);
    // ...
    IsAtomicPacked16Bit =
        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
    // ...
      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      // ...
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    // ...
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    // ...
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    // ...
    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
      // ...
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
        // ...
      VData = handleD16VData(VData, DAG, true);
      // ...
      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      // ...
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
        // ...
      if (/* ... */ (!LoadVT.isVector() && DMaskLanes > 1))
        // ...
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }
  // ...
  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  // ...
  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  // ...
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  // ...
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      // ...
          {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
      // ...
             "Bias needs to be converted to 16 bit in A16 mode");
      // ...
    }
  // ...
  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // ...
    LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                         "require 16 bit args for both gradients and addresses");
    // ...
  }
  // ...
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      // ...
    }
  // ...
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // ...
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        // ...
    IntrOpcode = G16MappingInfo->G16;
    // ...
  }
  // ...
  for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
    // ...
  // ...
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  if (UsePartialNSA) {
    // ...
        ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
    // ...
  }
  // ...
  uint64_t UnormConst =
      Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
  // ...
  Unorm = UnormConst ? True : False;
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    // ...
    NumVDataDwords += 1;
    AdjustRetType = true;
  // ...
  if (AdjustRetType) {
    // ...
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // ...
    }
    // ...
                                 MVT::i32, NumVDataDwords)
               // ...
    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // ...
      ResultTypes.erase(&ResultTypes[1]);
    }
  }
  // ...
    Ops.push_back(VData);
  if (UsePartialNSA) {
    // ...
    Ops.push_back(VAddr);
  } else {
    // ...
    Ops.push_back(VAddr);
  }
  // ...
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    // ...
  Ops.push_back(Rsrc);
  // ...
    Ops.push_back(Samp);
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  // ...
  Ops.push_back(IsA16 &&
                // ...
                ST->hasFeature(AMDGPU::FeatureR128A16)
                // ...
  Ops.push_back(IsA16 ? True : False);
  // ...
  if (!Subtarget->hasGFX90AInsts())
    // ...
        "TFE is not supported on this GPU", DL.getDebugLoc()));
  // ...
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    // ...
  Ops.push_back(DimInfo->DA ? True : False);
  // ...
  Ops.push_back(IsD16 ? True : False);
  // ...
  Ops.push_back(Op.getOperand(0));
  // ...
  int NumVAddrDwords =
      // ...
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    // ...
        UseNSA ? AMDGPU::MIMGEncGfx11NSA
               : AMDGPU::MIMGEncGfx11Default,
        NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    // ...
        UseNSA ? AMDGPU::MIMGEncGfx10NSA
               : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);
  }
  // ...
    if (Subtarget->hasGFX90AInsts()) {
      // ...
          NumVDataDwords, NumVAddrDwords);
      // ...
          "requested image instruction is not supported on this GPU",
          // ...
    }
    // ...
    for (EVT VT : OrigResultTypes) {
      if (VT == MVT::Other)
        RetValues[Idx++] = Op.getOperand(0);
      // ...
    }
    // ...
        NumVDataDwords, NumVAddrDwords);
    // ...
        NumVDataDwords, NumVAddrDwords);
  // ...
  MachineMemOperand *MemRef = MemOp->getMemOperand();
  // ...
      Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
      NumVDataDwords, IsAtomicPacked16Bit, DL);
                        MachinePointerInfo(),
      // ...
  if (!Offset->isDivergent()) {
    // ...
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      // ...
    }
    // ...
        !Subtarget->hasScalarDwordx3Loads()) {
      // ...
    }
  }
  // ...
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    // ...
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }
  // ...
  unsigned NumLoads = 1;
  // ...
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    // ...
  }
  // ...
  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
  // ...
      NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  // ...
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    // ...
  }
  // ...
  if (NumElts == 8 || NumElts == 16)
    // ...
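  // Added commentary (hedged): the s_buffer_load path above splits results of
  // 8 or 16 elements into NumElts / 4 back-to-back dwordx4 loads, bumping the
  // immediate offset (InstOffset) for each load and concatenating the pieces
  // back into the requested wide vector afterwards.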
  // ...
  if (!Subtarget->hasArchitectedSGPRs())
    // ...

// ...
                                      unsigned Width) const {
  // ...
  using namespace AMDGPU::Hwreg;
  // ...
      AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // ...
  EVT VT = Op.getValueType();
  // ...
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  // ...
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT,
                             // ...
  }
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
      // ...
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          // ...
    }
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     // ...
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  }
  case Intrinsic::amdgcn_implicitarg_ptr: {
    // ...
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             // ...
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    // ...
    return getPreloadedValue(DAG, *MFI, VT,
                             // ...
  }
  case Intrinsic::amdgcn_dispatch_id: {
    // ...
  }
  case Intrinsic::amdgcn_rcp:
    // ...
  case Intrinsic::amdgcn_rsq:
    // ...
  case Intrinsic::amdgcn_rsq_legacy:
    // ...
  case Intrinsic::amdgcn_rcp_legacy:
    // ...
  case Intrinsic::amdgcn_rsq_clamp: {
    // ...
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       // ...
  }
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    // ...
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    // ...
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                                    // ...
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  // ...
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  // ...
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      // ...
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  // ...
  case Intrinsic::amdgcn_workgroup_id_x:
    return lowerWorkGroupId(DAG, *MFI, VT,
                            // ...
  case Intrinsic::amdgcn_workgroup_id_y:
    return lowerWorkGroupId(DAG, *MFI, VT,
                            // ...
  case Intrinsic::amdgcn_workgroup_id_z:
    return lowerWorkGroupId(DAG, *MFI, VT,
                            // ...
  case Intrinsic::amdgcn_cluster_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
                                   // ...
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
                                   // ...
  case Intrinsic::amdgcn_cluster_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
                                   // ...
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return Subtarget->hasClusters()
               // ...
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
                     // ...
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
    // ...
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
                             // ...
  }
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
    // ...
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    // ...
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  }
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    // ...
  case Intrinsic::amdgcn_cos:
    // ...
  case Intrinsic::amdgcn_mul_u24:
    // ...
  case Intrinsic::amdgcn_mul_i24:
    // ...
  case Intrinsic::amdgcn_log_clamp: {
    // ...
  }
  case Intrinsic::amdgcn_fract:
    // ...
  case Intrinsic::amdgcn_class:
    // ...
  case Intrinsic::amdgcn_div_fmas:
    // ... Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    // ...
    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
    // ...
                       Denominator, Numerator);
  }
  case Intrinsic::amdgcn_icmp: {
    // ...
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
        // ...
  }
  case Intrinsic::amdgcn_fcmp: {
    // ...
  }
  case Intrinsic::amdgcn_ballot:
    // ...
  case Intrinsic::amdgcn_fmed3:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    // ... Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    // ...
  case Intrinsic::amdgcn_sffbh:
    // ...
  case Intrinsic::amdgcn_sbfe:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    // ...
    EVT VT = Op.getValueType();
    // ...
    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      // ...
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      // ...
    // ...
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
    // ...
        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
  }
  case Intrinsic::amdgcn_fmad_ftz:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
    // ... Op->getOperand(1), Op->getOperand(2)),
    // ...
  case Intrinsic::amdgcn_groupstaticsize: {
    // ...
    const GlobalValue *GV =
        // ...
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    // ...
        DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
    // ...
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      // ...
        Subtarget->hasGloballyAddressableScratch()) {
      // ...
          AMDGPU::S_MOV_B32, DL, MVT::i32,
          DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
      // ...
    }
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
    // ...
  }
  case Intrinsic::amdgcn_perm:
    // ... Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
    // ...
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
      // ...
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  }
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i64)
      // ...
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
                        Op.getOperand(6)});
  }
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
                         // ...
    if (Op.getOperand(6).getValueType() == IndexKeyTy)
      // ...
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKey, Op.getOperand(7),
                        Op.getOperand(8)});
  }
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
      // ...
                       {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
                        IndexKeyi32, Op.getOperand(7)});
  }
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
    // ...
  case Intrinsic::amdgcn_dead: {
    // ...
    for (const EVT ValTy : Op.getNode()->values())
      // ...
  }
  // ...
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        // ...
      return lowerImage(Op, ImageDimIntr, DAG, false);
    // ...
    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                 unsigned NewOpcode) const {
  // ...
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
  // ...
                                 M->getMemOperand());
}

// ...
                                                    unsigned NewOpcode) const {
  // ...
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
  // ...
                                 M->getMemOperand());
}

// ...
  unsigned IntrID = Op.getConstantOperandVal(1);
  // ...
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // ...
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);
    // ...
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
    // ...
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      // ...
      if (CountDw < 1 || CountDw > 4) {
        // ...
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
            DL.getDebugLoc()));
        // ...
      }
    // ...
    if (IndexOperand) {
      // ...
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
      // ...
    }
    if (WaveDone && !WaveRelease) {
      // ...
          Fn, "ds_ordered_count: wave_done requires wave_release",
          DL.getDebugLoc()));
      // ...
    }
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
        // ...
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
    // ...
      Offset1 |= (CountDw - 1) << 6;
    // ...
      Offset1 |= ShaderType << 2;
    // ...
    unsigned Offset = Offset0 | (Offset1 << 8);
    // ...
        M->getVTList(), Ops, M->getMemoryVT(),
        M->getMemOperand());
  }
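  // The immediate offset assembled above packs several fields; as an
  // illustration of the layout the code implies (derived from the shifts and
  // masks used here, not quoted from a hardware spec):
  //
  //   // offset0 = ordered_count_index << 2;
  //   // offset1 = wave_release | (wave_done << 1) | (shader_type << 2)
  //   //         | (instruction << 4) | ((count_dw - 1) << 6);
  //   // offset  = offset0 | (offset1 << 8);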
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
    // ...
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
  }
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
    // ...
        Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
        // ...
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    // ...
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    // ...
        Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
        // ...
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    // ...
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         // ...
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    // ...
    EVT VT = Op.getValueType();
    // ...
        Op->getVTList(), Ops, VT,
        M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
    // ...
    EVT VT = Op.getValueType();
    // ...
        Op->getVTList(), Ops, VT,
        M->getMemOperand());
  }
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    // ...
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue InstanceMask = M->getOperand(4);
    SDValue RayOrigin = M->getOperand(5);
    SDValue RayDir = M->getOperand(6);
    // ...
    SDValue TDescr = M->getOperand(8);
    // ...
    if (!Subtarget->hasBVHDualAndBVH8Insts()) {
      // ...
    }
    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
    // ...
        IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
               : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
    // ...
    Ops.push_back(NodePtr);
    // ...
        {DAG.getBitcast(MVT::i32, RayExtent),
         DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
    Ops.push_back(RayOrigin);
    Ops.push_back(RayDir);
    Ops.push_back(Offsets);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());
    // ...
    MachineMemOperand *MemRef = M->getMemOperand();
    // ...
  }
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    // ...
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(7);
    // ...
    if (!Subtarget->hasGFX10_AEncoding()) {
      // ...
    }
    // ...
    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&
                         // ...
    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    // ...
          IsGFX12Plus ? AMDGPU::MIMGEncGfx12
          : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                      : AMDGPU::MIMGEncGfx10NSA,
          NumVDataDwords, NumVAddrDwords);
    // ...
          IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                  : AMDGPU::MIMGEncGfx10Default,
          NumVDataDwords, NumVAddrDwords);
    // ...
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      // ...
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          // ...
      }
      // ...
        Ops.push_back(Lanes[2]);
      // ...
    };
    // ...
    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      // ...
      Ops.push_back(RayOrigin);
      // ...
        for (unsigned I = 0; I < 3; ++I) {
          // ...
              {DirLanes[I], InvDirLanes[I]})));
        }
      // ...
        Ops.push_back(RayDir);
        Ops.push_back(RayInvDir);
      // ...
    } else {
      Ops.push_back(NodePtr);
      // ...
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
      // ...
      if (NumVAddrDwords > 12) {
        // ...
        Ops.append(16 - Ops.size(), Undef);
        // ...
      }
      // ...
      Ops.push_back(MergedOps);
    }
    // ...
    Ops.push_back(TDescr);
    // ...
    Ops.push_back(M->getChain());
    // ...
    MachineMemOperand *MemRef = M->getMemOperand();
    // ...
  }
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
      BarID = (BarID >> 4) & 0x3F;
    Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
    Ops.push_back(Chain);
    Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
    if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
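// Helper that widens memory-intrinsic results with an odd dword count (and an
// optional TFE status dword) to a legal width, then trims the value back down
// to the requested type.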
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                              WidenedMemVT, WidenedMMO);
                                          bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
    if ((NumElements % 2) == 1) {
      unsigned I = Elts.size() / 2;
    if (NumElements == 3) {
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
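// Lowering for side-effecting (void) intrinsics: each case below rewrites the
// intrinsic into target memory nodes or machine nodes, threading the chain
// through, or defers to the generic image-dimension lowering.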
  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
          "intrinsic not supported on subtarget", DL.getDebugLoc()));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                   M->getMemoryVT(), M->getMemOperand());
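  // Plain and format buffer stores: D16 data is repacked first, the rsrc
  // pointer is converted into a resource vector, and i8/i16 element types are
  // diverted to handleByteShortBufferStores.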
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);
    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
                                   M->getMemoryVT(), M->getMemOperand());
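  // Buffer-to-LDS loads: pick the BUFFER_LOAD_*_LDS_{OFFSET,OFFEN,IDXEN,BOTHEN}
  // pseudo from the transfer size and from which of vindex/voffset are
  // present; the dwordx3/x4 variants additionally require hasLDSLoadB96_B128().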
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
    if (HasVIndex && HasVOffset)
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_global_load_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
    unsigned Size = Op->getConstantOperandVal(4);
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      if (!Subtarget->hasLDSLoadB96_B128())
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
    if (LHS->isDivergent())
        RHS.getOperand(0).getValueType() == MVT::i32) {
      VOffset = RHS.getOperand(0);
    Ops.push_back(Addr);
    Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    unsigned Aux = Op.getConstantOperandVal(6);
    MachineMemOperand *LoadMMO = M->getMemOperand();
    LoadPtrI.Offset = Op->getConstantOperandVal(5);
    MachinePointerInfo StorePtrI = LoadPtrI;
  case Intrinsic::amdgcn_end_cf:
                                      Op->getOperand(2), Chain),
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_barrier_join: {
      Opc = AMDGPU::S_BARRIER_JOIN_IMM;
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_BARRIER_JOIN_M0;
    Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, true);
  return PtrVT == MVT::i64;
std::pair<SDValue, SDValue>
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
  auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
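// setBufferOffsets splits a combined byte offset into an SGPR soffset plus an
// immediate that fits the MUBUF offset field, via TII->splitMUBUFOffset().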
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;
  SDValue NumRecords = Op->getOperand(3);
  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
        DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
    auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
                             NumRecords, Flags);
  SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
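// Sub-dword buffer accesses: i8/i16 loads are widened to 32-bit results
// (optionally carrying a TFE status dword) and i8/i16/f16/bf16 store data is
// extended before the target buffer node is built.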
                                              bool IsTFE) const {
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
                                 M->getMemOperand());
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
         "unexpected vector extload");
         "unexpected fp extload");
  DCI.AddToWorklist(Cvt.getNode());
    DCI.AddToWorklist(Cvt.getNode());
    Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
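// Custom LOAD lowering: i1/i8-sized loads are widened, and wide vector loads
// are split or scalarized depending on the address space, alignment, and the
// subtarget's private-element-size / dwordx3 support.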
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned AS = Load->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
      Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
      Alignment >= Align(4) && NumElements < 32) {
        (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4)
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    auto Flags = Load->getMemOperand()->getFlags();
                                Load->getAlign(), Flags, &Fast) &&
                                      MemVT, *Load->getMemOperand())) {
    EVT VT = Op.getValueType();
    return DAG.getNode(ISD::BITCAST, DL, VT, Res);
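// Fast-math FDIV path: when approximate-function / reciprocal flags allow it
// (or for f16/bf16), the divide is lowered to a v_rcp based approximation
// rather than the full-precision div_scale/div_fixup sequence.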
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
  if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
    if (CLHS->isExactlyValue(1.0)) {
    if (CLHS->isExactlyValue(-1.0)) {
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
  return DAG.getNode(Opcode, SL, VT, A, B, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
  return DAG.getNode(Opcode, SL, VTList,
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
  EVT VT = Op.getValueType();
  if (VT == MVT::bf16) {
  unsigned FMADOpCode =
  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
    Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
  SDNodeFlags Flags = Op->getFlags();
  const APFloat K0Val(0x1p+96f);
  const APFloat K1Val(0x1p-32f);
  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;
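  // f32 FDIV: the full-precision sequence scales numerator and denominator
  // with div_scale, iterates with FMAs around v_rcp, and temporarily forces
  // f32 denormals on via S_DENORM_MODE / S_SETREG when the current mode does
  // not already preserve them.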
  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);
  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
      DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
  const bool HasDynamicDenormals =
  if (!PreservesDenormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
    if (HasDynamicDenormals) {
      SavedDenormMode = SDValue(GetReg, 0);
    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
                             ApproxRcp, One, NegDivScale0, Flags);
                             ApproxRcp, Fma0, Flags);
                             NumeratorScaled, Mul, Flags);
                             NumeratorScaled, Fma3, Flags);
  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
                             {Fma4, Fma1, Fma3, Scale}, Flags);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
  if (!Subtarget->hasUsableDivScaleConditionOutput()) {
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
  EVT VT = Op.getValueType();
  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);
  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);
  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
  if (Subtarget->hasFractBug()) {
  EVT VT = Store->getMemoryVT();
  if (VT == MVT::i1) {
                               Store->getBasePtr(), MVT::i1,
                               Store->getMemOperand());
         Store->getValue().getValueType().getScalarType() == MVT::i32);
  unsigned AS = Store->getAddressSpace();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
                                        VT, *Store->getMemOperand()))
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->enableFlatScratch()))
    auto Flags = Store->getMemOperand()->getFlags();
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();
      DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
      DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
  SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
  SDNodeFlags Flags = Op->getFlags();
  SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
  EVT VT = Op.getValueType();
  if (Subtarget->hasTrigReducedRange()) {
  switch (Op.getOpcode()) {
  EVT VT = Op.getValueType();
                                 Op->getVTList(), Ops, VT,
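// Recognize a uint_to_fp fed by a single byte and rewrite it to the
// CVT_F32_UBYTE0..3 form so the byte extract and convert collapse into one
// instruction (a sketch of the intent; the full pattern checks live below).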
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  EVT SrcVT = Src.getValueType();
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());
    if (ScalarVT != MVT::f32) {
                                               DAGCombinerInfo &DCI) const {
  if (SignOp.getOpcode() == ISD::FP_EXTEND ||
  SelectionDAG &DAG = DCI.DAG;
    for (unsigned I = 0; I != NumElts; ++I) {
    if (NewElts.size() == 1)
    for (unsigned I = 0; I != NumElts; ++I) {
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  AM.BaseOffs = Offset.getSExtValue();
  EVT VT = N->getValueType(0);
  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
  switch (N->getOpcode()) {
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  NewOps[PtrIdx] = NewPtr;
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
SDValue SITargetLowering::splitBinaryBitConstantOp(
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);
  if (V.getNumOperands() != 2)
  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
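// AND combine: besides splitting 64-bit constant masks and folding setcc
// patterns, this recognizes byte-select masks on both operands and rewrites
// the pair into a single V_PERM_B32 when their used lanes do not overlap.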
                                           DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64 && CRHS) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
  if (CRHS && VT == MVT::i32) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
    if (X != LHS.getOperand(1))
    const ConstantFPSDNode *C1 =
        (RHS.getOperand(0) == LHS.getOperand(0) &&
         LHS.getOperand(0) == LHS.getOperand(1))) {
      unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                          : Mask->getZExtValue() & OrdMask;
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
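// calculateSrcByte / calculateByteProvider walk the DAG to determine, for each
// byte of the result, which byte of which leaf value provides it (or whether
// it is known zero); 0x0c in the permute masks denotes a constant-zero byte.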
static const std::optional<ByteProvider<SDValue>>
                 unsigned Depth = 0) {
    return std::nullopt;
  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;
  if (Op.getValueType().isVector())
  switch (Op->getOpcode()) {
    NarrowVT = VTSign->getVT();
      return std::nullopt;
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
                       unsigned StartingIndex = 0) {
    return std::nullopt;
  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;
  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
      return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
    if (!RHS || RHS->isConstantZero())
    return std::nullopt;
      return std::nullopt;
      return std::nullopt;
    uint32_t BitMask = BitMaskOp->getZExtValue();
    uint32_t IndexMask = 0xFF << (Index * 8);
    if ((IndexMask & BitMask) != IndexMask) {
      if (IndexMask & BitMask)
        return std::nullopt;
      return std::nullopt;
    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;
    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
      return std::nullopt;
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
      return std::nullopt;
    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    return BytesProvided - ByteShift > Index
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;
    return Index < ByteShift
                       Depth + 1, StartingIndex);
      return std::nullopt;
      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth)
                 ? std::optional<ByteProvider<SDValue>>(
      return std::nullopt;
    if (NarrowByteWidth >= Index) {
    return std::nullopt;
      return std::nullopt;
    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth) {
                 ? std::optional<ByteProvider<SDValue>>(
    if (NarrowByteWidth > Index) {
    return std::nullopt;
    return std::nullopt;
                       Depth + 1, StartingIndex);
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
                            StartingIndex, Index);
      return std::nullopt;
      return std::nullopt;
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;
    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
    return std::nullopt;
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
  auto MemVT = L->getMemoryVT();
  return L->getMemoryVT().getSizeInBits() == 16;
  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;
  assert(Low8 < 8 && Hi8 < 8);
  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);
  return IsConsecutive && Is16Aligned;
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
  if (!OtherOpIs16Bit)
                           unsigned DWordOffset) {
  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                : NumElements - NormalizedTrunc;
  auto ShiftVal = 32 * DWordOffset;
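// matchPERM: if all four result bytes come from at most two 32-bit sources,
// build a V_PERM_B32 select mask for them; the Low16/Hi16 checks below
// special-case masks that already correspond to whole aligned 16-bit halves.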
  [[maybe_unused]] EVT VT = N->getValueType(0);
  for (int i = 0; i < 4; i++) {
    std::optional<ByteProvider<SDValue>> P =
    if (!P || P->isConstantZero())
  if (PermNodes.size() != 4)
  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    int SrcByteAdjust = 4;
    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
        if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
            ((PermOp.SrcOffset / 4) != SecondSrc->second))
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  assert(Op.getValueSizeInBits() == 32);
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
  if (WellFormedLow && WellFormedHi)
  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
  assert(Op.getValueType().isByteSized() &&
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    if (Src != RHS.getOperand(0))
    if (!CLHS || !CRHS)
    static const uint32_t MaxMask = 0x3ff;
      Sel |= LHS.getConstantOperandVal(2);
      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    auto usesCombinedOperand = [](SDNode *OrUse) {
      if (OrUse->getOpcode() != ISD::BITCAST ||
          !OrUse->getValueType(0).isVector())
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())
        if (VUser->getOpcode() == VectorwiseOp)
    if (!any_of(N->users(), usesCombinedOperand))
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        LHSMask |= LHSUsedLanes & 0x04040404;
        uint32_t Sel = LHSMask | RHSMask;
    if (LHSMask == ~0u || RHSMask == ~0u) {
    return IdentitySrc;
  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
  if (SrcVT == MVT::i32) {
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
    return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
            N->getOperand(0), CRHS))
                                           DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
  unsigned Opc = LHS.getOpcode();
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
        DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
                                    LHS->getOperand(0), FNegLHS, FNegRHS);
    return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
                                                  DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)
  if (Src.getValueType() != MVT::i16)
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
       VTSign->getVT() == MVT::i8) ||
       VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
       VTSign->getVT() == MVT::i8) ||
       VTSign->getVT() == MVT::i16)) &&
                     Src.getOperand(6), Src.getOperand(7)};
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (N->getOperand(0).isUndef())
                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
    if (!F.isDenormal())
14369 case ISD::FP_EXTEND:
14370 case ISD::FP16_TO_FP:
14371 case ISD::FP_TO_FP16:
14372 case ISD::BF16_TO_FP:
14373 case ISD::FP_TO_BF16:
    if (Op.getValueType() == MVT::i32) {
      if (RHS->getZExtValue() == 0xffff0000) {
    return Op.getValueType().getScalarType() != MVT::f16;
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    if (Subtarget->supportsMinMaxDenormModes() ||
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
    if (Op.getValueType() == MVT::i16) {
      TruncSrc.getOpcode() == ISD::BITCAST &&
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    switch (IntrinsicID) {
14506 case Intrinsic::amdgcn_cvt_pkrtz:
14507 case Intrinsic::amdgcn_cubeid:
14508 case Intrinsic::amdgcn_frexp_mant:
14509 case Intrinsic::amdgcn_fdot2:
14510 case Intrinsic::amdgcn_rcp:
14511 case Intrinsic::amdgcn_rsq:
14512 case Intrinsic::amdgcn_rsq_clamp:
14513 case Intrinsic::amdgcn_rcp_legacy:
14514 case Intrinsic::amdgcn_rsq_legacy:
14515 case Intrinsic::amdgcn_trig_preop:
14516 case Intrinsic::amdgcn_tanh:
14517 case Intrinsic::amdgcn_log:
14518 case Intrinsic::amdgcn_exp2:
14519 case Intrinsic::amdgcn_sqrt:
                                       unsigned MaxDepth) const {
  unsigned Opcode = MI->getOpcode();
  if (Opcode == AMDGPU::G_FCANONICALIZE)
  std::optional<FPValueAndVReg> FCR;
    if (FCR->Value.isSignaling())
    if (!FCR->Value.isDenormal())
14561 case AMDGPU::G_FADD:
14562 case AMDGPU::G_FSUB:
14563 case AMDGPU::G_FMUL:
14564 case AMDGPU::G_FCEIL:
14565 case AMDGPU::G_FFLOOR:
14566 case AMDGPU::G_FRINT:
14567 case AMDGPU::G_FNEARBYINT:
14568 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
14569 case AMDGPU::G_INTRINSIC_TRUNC:
14570 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
14571 case AMDGPU::G_FMA:
14572 case AMDGPU::G_FMAD:
14573 case AMDGPU::G_FSQRT:
14574 case AMDGPU::G_FDIV:
14575 case AMDGPU::G_FREM:
14576 case AMDGPU::G_FPOW:
14577 case AMDGPU::G_FPEXT:
14578 case AMDGPU::G_FLOG:
14579 case AMDGPU::G_FLOG2:
14580 case AMDGPU::G_FLOG10:
14581 case AMDGPU::G_FPTRUNC:
14582 case AMDGPU::G_AMDGPU_RCP_IFLAG:
14583 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
14584 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
14585 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
14586 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
14588 case AMDGPU::G_FNEG:
14589 case AMDGPU::G_FABS:
14590 case AMDGPU::G_FCOPYSIGN:
14592 case AMDGPU::G_FMINNUM:
14593 case AMDGPU::G_FMAXNUM:
14594 case AMDGPU::G_FMINNUM_IEEE:
14595 case AMDGPU::G_FMAXNUM_IEEE:
14596 case AMDGPU::G_FMINIMUM:
14597 case AMDGPU::G_FMAXIMUM:
14598 case AMDGPU::G_FMINIMUMNUM:
14599 case AMDGPU::G_FMAXIMUMNUM: {
14600 if (Subtarget->supportsMinMaxDenormModes() ||
14607 case AMDGPU::G_BUILD_VECTOR:
14612 case AMDGPU::G_INTRINSIC:
14613 case AMDGPU::G_INTRINSIC_CONVERGENT:
14615 case Intrinsic::amdgcn_fmul_legacy:
14616 case Intrinsic::amdgcn_fmad_ftz:
14617 case Intrinsic::amdgcn_sqrt:
14618 case Intrinsic::amdgcn_fmed3:
14619 case Intrinsic::amdgcn_sin:
14620 case Intrinsic::amdgcn_cos:
14621 case Intrinsic::amdgcn_log:
14622 case Intrinsic::amdgcn_exp2:
14623 case Intrinsic::amdgcn_log_clamp:
14624 case Intrinsic::amdgcn_rcp:
14625 case Intrinsic::amdgcn_rcp_legacy:
14626 case Intrinsic::amdgcn_rsq:
14627 case Intrinsic::amdgcn_rsq_clamp:
14628 case Intrinsic::amdgcn_rsq_legacy:
14629 case Intrinsic::amdgcn_div_scale:
14630 case Intrinsic::amdgcn_div_fmas:
14631 case Intrinsic::amdgcn_div_fixup:
14632 case Intrinsic::amdgcn_fract:
14633 case Intrinsic::amdgcn_cvt_pkrtz:
14634 case Intrinsic::amdgcn_cubeid:
14635 case Intrinsic::amdgcn_cubema:
14636 case Intrinsic::amdgcn_cubesc:
14637 case Intrinsic::amdgcn_cubetc:
14638 case Intrinsic::amdgcn_frexp_mant:
14639 case Intrinsic::amdgcn_fdot2:
14640 case Intrinsic::amdgcn_trig_preop:
14641 case Intrinsic::amdgcn_tanh:
  if (C.isDenormal()) {
  if (C.isSignaling()) {
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
    EVT EltVT = Lo.getValueType();
    for (unsigned I = 0; I != 2; ++I) {
          getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
      } else if (Op.isUndef()) {
14769 case ISD::FMAXNUM_IEEE:
14770 case ISD::FMAXIMUMNUM:
14772 case ISD::FMAXIMUM:
14779 case ISD::FMINNUM_IEEE:
14780 case ISD::FMINIMUMNUM:
14782 case ISD::FMINIMUM:
  if (!MinK || !MaxK)
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->getMode().DX10Clamp) {
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
    if (SDValue Med3 = performIntMed3ImmCombine(
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
       (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
      (VT == MVT::f32 || VT == MVT::f64 ||
       (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
       (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
       (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
  const SDNodeFlags Flags = N->getFlags();
  if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) &&
      !Subtarget->hasIEEEMinimumMaximumInsts() && Flags.hasNoNaNs()) {
        Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
  return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
         (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SelectionDAG &DAG = DCI.DAG;
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->getMode().DX10Clamp) {
                                            DAGCombinerInfo &DCI) const {
    return DCI.DAG.getUNDEF(N->getValueType(0));
                                          bool IsDivergentIdx,
  unsigned VecSize = EltSize * NumElem;
  if (VecSize <= 64 && EltSize < 32)
  if (IsDivergentIdx)
  unsigned NumInsts = NumElem + ((EltSize + 31) / 32) * NumElem;
  if (Subtarget->useVGPRIndexMode())
    return NumInsts <= 16;
  if (Subtarget->hasMovrel())
    return NumInsts <= 15;
  SDValue Idx = N->getOperand(N->getNumOperands() - 1);
SITargetLowering::performExtractVectorEltCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  EVT ResVT = N->getValueType(0);
  if (!C || C->getZExtValue() != 0x1f)
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
  if (!DCI.isBeforeLegalize())
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    DCI.AddToWorklist(Cast.getNode());
    DCI.AddToWorklist(Elt.getNode());
    DCI.AddToWorklist(Srl.getNode());
    DCI.AddToWorklist(Trunc.getNode());
    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (Src.getOpcode() == ISD::FP_EXTEND &&
      Src.getOperand(0).getValueType() == MVT::f16) {
    return Src.getOperand(0);
    APFloat Val = CFP->getValueAPF();
    bool LosesInfo = true;
                                           DAGCombinerInfo &DCI) const {
  assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
         "combine only useful on gfx8");
  SDValue TruncSrc = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (VT != MVT::f16)
  SelectionDAG &DAG = DCI.DAG;
  return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N1) const {
  if (((VT == MVT::f32 &&
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
  EVT VT = N->getValueType(0);
  if (VT != MVT::i32 && VT != MVT::i64)
  unsigned Opc = N->getOpcode();
  if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
                                                 DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!N->isDivergent() && Subtarget->hasSMulHi())
  if (NumBits <= 32 || NumBits > 64)
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *User : LHS->users()) {
      if (!User->isAnyAdd())
  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
  if (VT != MVT::i64) {
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
    if (!MulLHSUnsigned32) {
    if (!MulRHSUnsigned32) {
  if (VT != MVT::i64)
SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned Opcode = N->getOpcode();
  if (Opcode == ISD::PTRADD)
      DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
static std::optional<ByteProvider<SDValue>>
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
  for (int BPI = 0; BPI < 2; BPI++) {
      BPP = {Src1, Src0};
    unsigned ZeroMask = 0x0c0c0c0c;
    unsigned FMask = 0xFF << (8 * (3 - Step));
    unsigned FirstMask =
        (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    unsigned SecondMask =
        (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
    int FirstGroup = -1;
    for (int I = 0; I < 2; I++) {
      auto MatchesFirst = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.first.Src &&
               (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
    if (FirstGroup != -1) {
      auto MatchesSecond = [&BPP](DotSrc &IterElt) {
        return IterElt.SrcOp == *BPP.second.Src &&
               (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
      if (Match != Srcs.end()) {
        Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
        Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
  unsigned ZeroMask = 0x0c0c0c0c;
  unsigned FMask = 0xFF << (8 * (3 - Step));
       ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
       ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
  if (Srcs.size() == 1) {
    auto *Elt = Srcs.begin();
    if (Elt->PermMask == 0x3020100)
  auto *FirstElt = Srcs.begin();
  auto *SecondElt = std::next(FirstElt);
    auto FirstMask = FirstElt->PermMask;
    auto SecondMask = SecondElt->PermMask;
    unsigned FirstCs = FirstMask & 0x0c0c0c0c;
    unsigned FirstPlusFour = FirstMask | 0x04040404;
    FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
    FirstElt = std::next(SecondElt);
    if (FirstElt == Srcs.end())
    SecondElt = std::next(FirstElt);
    if (SecondElt == Srcs.end()) {
                      DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
  return Perms.size() == 2
  for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
    EntryMask = EntryMask >> ((4 - ChainLength) * 8);
    auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
    EntryMask += ZeroMask;
  auto Opcode = Op.getOpcode();
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));
  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
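// ADD combine: try mad_u64_u32 / mad_i64_i32 formation first, then attempt to
// fold a chain of byte-wise multiplies and adds into v_dot4 (udot4 / sdot4),
// tracking a consistent signedness across the whole chain.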
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (Subtarget->hasMad64_32()) {
    if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (SDValue V = reassociateScalarOps(N, DAG)) {
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
      (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
    std::optional<bool> IsSigned;
    int ChainLength = 0;
    for (int I = 0; I < 4; I++) {
      auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
      auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
          TempNode->getOperand(MulIdx), *Src0, *Src1,
          TempNode->getOperand(MulIdx)->getOperand(0),
          TempNode->getOperand(MulIdx)->getOperand(1), DAG);
        IsSigned = *IterIsSigned;
      if (*IterIsSigned != *IsSigned)
      auto AddIdx = 1 - MulIdx;
      if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
        Src2s.push_back(TempNode->getOperand(AddIdx));
            TempNode->getOperand(AddIdx), *Src0, *Src1,
            TempNode->getOperand(AddIdx)->getOperand(0),
            TempNode->getOperand(AddIdx)->getOperand(1), DAG);
        if (*IterIsSigned != *IsSigned)
        ChainLength = I + 2;
      TempNode = TempNode->getOperand(AddIdx);
      ChainLength = I + 1;
      if (TempNode->getNumOperands() < 2)
      LHS = TempNode->getOperand(0);
      RHS = TempNode->getOperand(1);
    if (ChainLength < 2)
    if (ChainLength < 4) {
    bool UseOriginalSrc = false;
    if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
        Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
        Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
        Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
      SmallVector<unsigned, 4> SrcBytes;
      auto Src0Mask = Src0s.begin()->PermMask;
      SrcBytes.push_back(Src0Mask & 0xFF000000);
      bool UniqueEntries = true;
      for (auto I = 1; I < 4; I++) {
        auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
          UniqueEntries = false;
      if (UniqueEntries) {
        UseOriginalSrc = true;
        auto *FirstElt = Src0s.begin();
        auto *SecondElt = Src1s.begin();
                                          SecondElt->DWordOffset);
    if (!UseOriginalSrc) {
        DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
                              : Intrinsic::amdgcn_udot4,
  if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
  unsigned Opc = LHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    SDNodeFlags ShlFlags = N1->getFlags();
    SDNodeFlags NewShlFlags =
    DCI.AddToWorklist(Inner.getNode());
  if (Subtarget->hasMad64_32()) {
    if (SDValue Folded = tryFoldToMad64_32(N, DCI))
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
  if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
      Y->isDivergent() != Z->isDivergent()) {
    if (Y->isDivergent())
    SDNodeFlags ReassocFlags =
    DCI.AddToWorklist(UniformInner.getNode());
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (VT == MVT::i64) {
    if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
  if (VT != MVT::i32)
  unsigned Opc = RHS.getOpcode();
    auto Cond = RHS.getOperand(0);
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
  if (N->getValueType(0) != MVT::i32)
  SelectionDAG &DAG = DCI.DAG;
  unsigned LHSOpc = LHS.getOpcode();
  unsigned Opc = N->getOpcode();
    return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    if (A == LHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
    if (A == RHS.getOperand(1)) {
      unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
      if (FusedOp != 0) {
        return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if ((VT != MVT::f16 && VT != MVT::bf16) || !Subtarget->has16BitInsts())
  SDNodeFlags Flags = N->getFlags();
  SDNodeFlags RHSFlags = RHS->getFlags();
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      if (RHS.getOpcode() == ISD::FSQRT) {
        return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
      (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
  if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
    const ConstantFPSDNode *FalseNode =
    if (ScalarVT == MVT::f32 &&
    if (TrueNodeExpVal == INT_MIN)
    if (FalseNodeExpVal == INT_MIN)
    return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
      (N->getFlags().hasAllowContract() &&
       FMA->getFlags().hasAllowContract())) {
    if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
    if (Vec1 == Vec2 || Vec3 == Vec4)
    if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LHS.getValueType();
    return LHS.getOperand(0);
      LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
    const APInt &CT = LHS.getConstantOperandAPInt(1);
    const APInt &CF = LHS.getConstantOperandAPInt(2);
      return LHS.getOperand(0);
        DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
        {Op0Hi, Op1Hi, CarryInHi});
    DCI.CombineTo(LHS.getNode(), Result);
  if (VT != MVT::f32 && VT != MVT::f64 &&
      (!Subtarget->has16BitInsts() || VT != MVT::f16))
      LHS.getOpcode() == ISD::FABS) {
    const unsigned IsInfMask =
    const unsigned IsFiniteMask =
SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  unsigned ShiftOffset = 8 * Offset;
      ShiftOffset -= C->getZExtValue();
      ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
    DCI.AddToWorklist(N);
    return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
                                            DAGCombinerInfo &DCI) const {
  const MachineFunction &MF = DCI.DAG.getMachineFunction();
      (F.isNaN() &&
       MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
  APFloat One(F.getSemantics(), "1.0");
    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
                                             DAGCombinerInfo &DCI) const {
  bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
  bool isInteger = LHS.getValueType().isInteger();
  if (!isFloatingPoint && !isInteger)
  if (!isEquality && !isNonEquality)
  if (isFloatingPoint) {
    if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
  if (!(isEquality && TrueVal == ConstVal) &&
      !(isNonEquality && FalseVal == ConstVal))
                     SelectLHS, SelectRHS);
  switch (N->getOpcode()) {
    if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
  switch (N->getOpcode()) {
    return performAddCombine(N, DCI);
    return performPtrAddCombine(N, DCI);
    return performSubCombine(N, DCI);
    return performAddCarrySubCarryCombine(N, DCI);
    return performFAddCombine(N, DCI);
    return performFSubCombine(N, DCI);
    return performFDivCombine(N, DCI);
    return performFMulCombine(N, DCI);
    return performSetCCCombine(N, DCI);
    if (auto Res = performSelectCombine(N, DCI))
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUMNUM:
  case ISD::FMINIMUMNUM:
    return performMinMaxCombine(N, DCI);
    return performFMACombine(N, DCI);
    return performAndCombine(N, DCI);
    return performOrCombine(N, DCI);
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    return performXorCombine(N, DCI);
    return performZeroExtendCombine(N, DCI);
    return performSignExtendInRegCombine(N, DCI);
    return performClassCombine(N, DCI);
    return performFCanonicalizeCombine(N, DCI);
    return performRcpCombine(N, DCI);
    return performUCharToFloatCombine(N, DCI);
    return performFCopySignCombine(N, DCI);
    return performCvtF32UByteNCombine(N, DCI);
    return performFMed3Combine(N, DCI);
    return performCvtPkRTZCombine(N, DCI);
    return performClampCombine(N, DCI);
    EVT VT = N->getValueType(0);
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
      EVT EltVT = Src.getValueType();
      if (EltVT != MVT::i16)
        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
    return performExtractVectorEltCombine(N, DCI);
    return performInsertVectorEltCombine(N, DCI);
    return performFPRoundCombine(N, DCI);
    return performMemSDNodeCombine(MemNode, DCI);
17013 unsigned Opcode =
Node->getMachineOpcode();
17016 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
17017 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
17020 SDNode *Users[5] = {nullptr};
17022 unsigned DmaskIdx =
17023 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
17024 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
17025 unsigned NewDmask = 0;
17026 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
17027 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
17028 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
17029 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
17030 unsigned TFCLane = 0;
17031 bool HasChain = Node->getNumValues() > 1;
17033 if (OldDmask == 0) {
17041 TFCLane = OldBitsSet;
17045 for (SDUse &Use : Node->uses()) {
17048 if (Use.getResNo() != 0)
17051 SDNode *User = Use.getUser();
17054 if (!User->isMachineOpcode() ||
17055 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
17067 if (UsesTFC && Lane == TFCLane) {
17072 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
17074 Dmask &= ~(1 << Comp);
17082 NewDmask |= 1 << Comp;
17087 bool NoChannels = !NewDmask;
17094 if (OldBitsSet == 1)
17100 if (NewDmask == OldDmask)
17109 unsigned NewChannels = BitsSet + UsesTFC;
17113 assert(NewOpcode != -1 &&
17114 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
17115 "failed to find equivalent MIMG op");
17123 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
17125 MVT ResultVT = NewChannels == 1
17128 : NewChannels == 5 ? 8
17130 SDVTList NewVTList =
17133 MachineSDNode *NewNode =
17142 if (NewChannels == 1) {
17152 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
17157 if (i || !NoChannels)
17162 if (NewUser != User) {
17172 Idx = AMDGPU::sub1;
17175 Idx = AMDGPU::sub2;
17178 Idx = AMDGPU::sub3;
17181 Idx = AMDGPU::sub4;
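adjustWritemask above walks the EXTRACT_SUBREG users of an image load, keeps only the dmask bits whose lanes are actually read, and then remaps each surviving lane onto a packed sub0..sub4 component. A self-contained sketch of that bookkeeping (plain C++20, not the in-tree helper):

#include <array>
#include <bit>

struct DmaskResult {
  unsigned NewDmask = 0;
  std::array<int, 4> LaneToComp{{-1, -1, -1, -1}}; // old result lane -> packed component
};

// OldDmask: dmask currently on the image instruction.
// UsedLanes: bit i set if result lane i has a user.
DmaskResult shrinkDmask(unsigned OldDmask, unsigned UsedLanes) {
  DmaskResult R;
  unsigned NextComp = 0;
  for (unsigned Lane = 0, Dmask = OldDmask; Dmask != 0 && Lane < 4; ++Lane) {
    unsigned Comp = std::countr_zero(Dmask); // component this result lane reads
    Dmask &= ~(1u << Comp);
    if (UsedLanes & (1u << Lane)) {
      R.NewDmask |= 1u << Comp;              // keep this channel enabled
      R.LaneToComp[Lane] = int(NextComp++);  // its new position in the result
    }
  }
  return R;
}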
17192 Op = Op.getOperand(0);
17213 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
17217 Node->getOperand(0), SL, VReg, SrcVal,
17223 return ToResultReg.getNode();
17228 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
17230 Ops.push_back(Node->getOperand(i));
17236 Node->getOperand(i).getValueType(),
17237 Node->getOperand(i)),
17249 unsigned Opcode = Node->getMachineOpcode();
17251 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
17252 !TII->isGather4(Opcode) &&
17254 return adjustWritemask(Node, DAG);
17257 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
17263 case AMDGPU::V_DIV_SCALE_F32_e64:
17264 case AMDGPU::V_DIV_SCALE_F64_e64: {
17274 (Src0 == Src1 || Src0 == Src2))
17330 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
17331 unsigned InitIdx = 0;
17333 if (TII->isImage(MI)) {
17341 unsigned TFEVal = TFE ? TFE->getImm() : 0;
17342 unsigned LWEVal = LWE ? LWE->getImm() : 0;
17343 unsigned D16Val = D16 ? D16->getImm() : 0;
17345 if (!TFEVal && !LWEVal)
17356 assert(MO_Dmask && "Expected dmask operand in instruction");
17358 unsigned dmask = MO_Dmask->getImm();
17363 bool Packed = !Subtarget->hasUnpackedD16VMem();
17365 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
17371 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17372 if (DstSize < InitIdx)
17375 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
17383 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
17384 unsigned NewDst = 0;
17389 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
17390 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
17393 for (; SizeLeft; SizeLeft--, CurrIdx++) {
17394 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
17414 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
17426 if (TII->isVOP3(MI.getOpcode())) {
17428 TII->legalizeOperandsVOP3(MRI, MI);
17430 if (TII->isMAI(MI)) {
17435 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17436 AMDGPU::OpName::scale_src0);
17437 if (Src0Idx != -1) {
17438 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
17439 AMDGPU::OpName::scale_src1);
17440 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
17441 TII->usesConstantBus(MRI, MI, Src1Idx))
17442 TII->legalizeOpWithMove(MI, Src1Idx);
17449 if (TII->isImage(MI))
17450 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
17524std::pair<unsigned, const TargetRegisterClass *>
17531 if (Constraint.size() == 1) {
17535 if (VT == MVT::Other)
17538 switch (Constraint[0]) {
17545 RC = &AMDGPU::SReg_32RegClass;
17548 RC = &AMDGPU::SGPR_64RegClass;
17553 return std::pair(0U, nullptr);
17560 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
17561 : &AMDGPU::VGPR_32_Lo256RegClass;
17564 RC = Subtarget->has1024AddressableVGPRs()
17565 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
17568 return std::pair(0U, nullptr);
17573 if (!Subtarget->hasMAIInsts())
17577 RC = &AMDGPU::AGPR_32RegClass;
17582 return std::pair(0U, nullptr);
17587 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
17591 RC = &AMDGPU::AV_32RegClass;
17594 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
17596 return std::pair(0U, nullptr);
17605 return std::pair(0U, RC);
17608 if (Kind != '\0') {
17610 RC = &AMDGPU::VGPR_32_Lo256RegClass;
17611 } else if (Kind == 's') {
17612 RC = &AMDGPU::SGPR_32RegClass;
17613 } else if (Kind == 'a') {
17614 RC = &AMDGPU::AGPR_32RegClass;
17620 return std::pair(0U, nullptr);
17626 return std::pair(0U, nullptr);
17630 RC = TRI->getVGPRClassForBitWidth(Width);
17632 RC = TRI->getSGPRClassForBitWidth(Width);
17634 RC = TRI->getAGPRClassForBitWidth(Width);
17636 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
17641 return std::pair(0U, nullptr);
17643 return std::pair(Reg, RC);
17649 return std::pair(0U, nullptr);
17650 if (Idx < RC->getNumRegs())
17652 return std::pair(0U, nullptr);
17658 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
17664 if (Constraint.size() == 1) {
17665 switch (Constraint[0]) {
17675 } else if (Constraint == "DA" || Constraint == "DB") {
17683 if (Constraint.size() == 1) {
17684 switch (Constraint[0]) {
17692 } else if (Constraint.size() == 2) {
17693 if (Constraint == "VA")
17711 std::vector<SDValue> &Ops,
17726 unsigned Size = Op.getScalarValueSizeInBits();
17730 if (Size == 16 && !Subtarget->has16BitInsts())
17734 Val = C->getSExtValue();
17738 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17742 if (Size != 16 || Op.getNumOperands() != 2)
17744 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
17747 Val = C->getSExtValue();
17751 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
17761 if (Constraint.size() == 1) {
17762 switch (Constraint[0]) {
17777 } else if (Constraint.size() == 2) {
17778 if (Constraint == "DA") {
17779 int64_t HiBits = static_cast<int32_t>(Val >> 32);
17780 int64_t LoBits = static_cast<int32_t>(Val);
17784 if (Constraint == "DB") {
17792 unsigned MaxSize) const {
17793 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
17794 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
17796 MVT VT = Op.getSimpleValueType();
17821 switch (UnalignedClassID) {
17822 case AMDGPU::VReg_64RegClassID:
17823 return AMDGPU::VReg_64_Align2RegClassID;
17824 case AMDGPU::VReg_96RegClassID:
17825 return AMDGPU::VReg_96_Align2RegClassID;
17826 case AMDGPU::VReg_128RegClassID:
17827 return AMDGPU::VReg_128_Align2RegClassID;
17828 case AMDGPU::VReg_160RegClassID:
17829 return AMDGPU::VReg_160_Align2RegClassID;
17830 case AMDGPU::VReg_192RegClassID:
17831 return AMDGPU::VReg_192_Align2RegClassID;
17832 case AMDGPU::VReg_224RegClassID:
17833 return AMDGPU::VReg_224_Align2RegClassID;
17834 case AMDGPU::VReg_256RegClassID:
17835 return AMDGPU::VReg_256_Align2RegClassID;
17836 case AMDGPU::VReg_288RegClassID:
17837 return AMDGPU::VReg_288_Align2RegClassID;
17838 case AMDGPU::VReg_320RegClassID:
17839 return AMDGPU::VReg_320_Align2RegClassID;
17840 case AMDGPU::VReg_352RegClassID:
17841 return AMDGPU::VReg_352_Align2RegClassID;
17842 case AMDGPU::VReg_384RegClassID:
17843 return AMDGPU::VReg_384_Align2RegClassID;
17844 case AMDGPU::VReg_512RegClassID:
17845 return AMDGPU::VReg_512_Align2RegClassID;
17846 case AMDGPU::VReg_1024RegClassID:
17847 return AMDGPU::VReg_1024_Align2RegClassID;
17848 case AMDGPU::AReg_64RegClassID:
17849 return AMDGPU::AReg_64_Align2RegClassID;
17850 case AMDGPU::AReg_96RegClassID:
17851 return AMDGPU::AReg_96_Align2RegClassID;
17852 case AMDGPU::AReg_128RegClassID:
17853 return AMDGPU::AReg_128_Align2RegClassID;
17854 case AMDGPU::AReg_160RegClassID:
17855 return AMDGPU::AReg_160_Align2RegClassID;
17856 case AMDGPU::AReg_192RegClassID:
17857 return AMDGPU::AReg_192_Align2RegClassID;
17858 case AMDGPU::AReg_256RegClassID:
17859 return AMDGPU::AReg_256_Align2RegClassID;
17860 case AMDGPU::AReg_512RegClassID:
17861 return AMDGPU::AReg_512_Align2RegClassID;
17862 case AMDGPU::AReg_1024RegClassID:
17863 return AMDGPU::AReg_1024_Align2RegClassID;
17879 if (Info->isEntryFunction()) {
17886 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
17888 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
17889 : TRI->getAlignedHighSGPRForRC(MF, 2,
17890 &AMDGPU::SGPR_64RegClass);
17891 Info->setSGPRForEXECCopy(SReg);
17893 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
17894 Info->getStackPtrOffsetReg()));
17895 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
17896 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
17900 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
17901 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
17903 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
17904 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
17906 Info->limitOccupancy(MF);
17908 if (ST.isWave32() && !MF.empty()) {
17909 for (auto &MBB : MF) {
17910 for (auto &MI : MBB) {
17911 TII->fixImplicitOperands(MI);
17921 if (ST.needsAlignedVGPRs()) {
17922 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
17928 if (NewClassID != -1)
17929 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
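The finalization loop above retargets virtual registers to even-aligned tuple classes on subtargets that require it. A minimal sketch of that remapping step, with placeholder class names standing in for the unaligned/.Align2 pairs enumerated by the getAlignedAGPRClassID switch earlier in the listing:

#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical name-based stand-in for the class-ID switch shown above.
std::string alignedClassFor(const std::string &UnalignedClass) {
  static const std::unordered_map<std::string, std::string> Map = {
      {"VReg_64", "VReg_64_Align2"},   {"VReg_128", "VReg_128_Align2"},
      {"AReg_64", "AReg_64_Align2"},   {"AReg_128", "AReg_128_Align2"},
  };
  auto It = Map.find(UnalignedClass);
  return It == Map.end() ? UnalignedClass : It->second; // no remap needed
}

// Mirrors the MRI.setRegClass loop: only runs when aligned VGPRs are required.
void realignVirtualRegs(std::vector<std::string> &VRegClasses,
                        bool NeedsAlignedVGPRs) {
  if (!NeedsAlignedVGPRs)
    return;
  for (std::string &RC : VRegClasses)
    RC = alignedClassFor(RC);
}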
17938 const APInt &DemandedElts,
17940 unsigned Depth) const {
17942 unsigned Opc = Op.getOpcode();
17945 unsigned IID = Op.getConstantOperandVal(0);
17947 case Intrinsic::amdgcn_mbcnt_lo:
17948 case Intrinsic::amdgcn_mbcnt_hi: {
17954 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
17964 Op, Known, DemandedElts, DAG, Depth);
17980 unsigned MaxValue =
17987 unsigned BFEWidth, bool SExt, unsigned Depth) {
17991 unsigned Src1Cst = 0;
17992 if (Src1.isImm()) {
17993 Src1Cst = Src1.getImm();
17994 } else if (Src1.isReg()) {
17998 Src1Cst = Cst->Value.getZExtValue();
18009 if (Width >= BFEWidth)
18018 Known = Known.sext(BFEWidth);
18020 Known = Known.zext(BFEWidth);
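knownBitsForSBFE above derives the known bits of an S_BFE result from the known bits of the source plus the (offset, width) pair packed into src1. A standalone sketch of that propagation using plain zero/one masks instead of llvm::KnownBits; how offset and width are decoded from the immediate is deliberately left out, since that detail is not in the listing:

#include <cstdint>

struct KnownMask {
  uint64_t Zero = 0, One = 0; // bit set => that bit is known 0 / known 1
};

KnownMask knownBitsForBFE(KnownMask Src, unsigned Offset, unsigned Width,
                          unsigned BFEWidth, bool SignExtend) {
  KnownMask Out;
  if (Width == 0 || Width >= BFEWidth)
    return Out; // degenerate or full-width extract: left unknown in this sketch
  uint64_t FieldMask = (1ull << Width) - 1;
  Out.Zero = (Src.Zero >> Offset) & FieldMask;
  Out.One = (Src.One >> Offset) & FieldMask;
  uint64_t AllBits = (BFEWidth == 64) ? ~0ull : ((1ull << BFEWidth) - 1);
  uint64_t HighBits = AllBits & ~FieldMask; // bits above the extracted field
  if (!SignExtend) {
    Out.Zero |= HighBits;                   // zero extension: high bits known 0
  } else {
    uint64_t SignBit = 1ull << (Width - 1);
    if (Out.Zero & SignBit)                 // sign bit known 0 -> zeros above
      Out.Zero |= HighBits;
    else if (Out.One & SignBit)             // sign bit known 1 -> ones above
      Out.One |= HighBits;
  }
  return Out;
}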
18026 unsigned Depth) const {
18029 switch (MI->getOpcode()) {
18030 case AMDGPU::S_BFE_I32:
18033 case AMDGPU::S_BFE_U32:
18036 case AMDGPU::S_BFE_I64:
18039 case AMDGPU::S_BFE_U64:
18042 case AMDGPU::G_INTRINSIC:
18043 case AMDGPU::G_INTRINSIC_CONVERGENT: {
18046 case Intrinsic::amdgcn_workitem_id_x:
18049 case Intrinsic::amdgcn_workitem_id_y:
18052 case Intrinsic::amdgcn_workitem_id_z:
18055 case Intrinsic::amdgcn_mbcnt_lo:
18056 case Intrinsic::amdgcn_mbcnt_hi: {
18068 case Intrinsic::amdgcn_groupstaticsize: {
18079 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
18082 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
18085 case AMDGPU::G_AMDGPU_SMED3:
18086 case AMDGPU::G_AMDGPU_UMED3: {
18087 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
18114 unsigned Depth) const {
18121 AttributeList Attrs =
18123 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
18150 if (Header->getAlignment() != PrefAlign)
18151 return Header->getAlignment();
18153 unsigned LoopSize = 0;
18158 LoopSize += MBB->getAlignment().value() / 2;
18161 LoopSize += TII->getInstSizeInBytes(MI);
18162 if (LoopSize > 192)
18167 if (LoopSize <= 64)
18170 if (LoopSize <= 128)
18171 return CacheLineAlign;
18177 auto I = Exit->getFirstNonDebugInstr();
18178 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
18179 return CacheLineAlign;
18188 if (PreTerm == Pre->begin() ||
18189 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
18193 auto ExitHead = Exit->getFirstNonDebugInstr();
18194 if (ExitHead == Exit->end() ||
18195 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
18200 return CacheLineAlign;
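The loop-alignment logic in the fragment above sums instruction sizes in the loop body and only pays for cache-line alignment (and S_INST_PREFETCH placement) when the body falls into a size range where that is likely to help. A simplified sketch built on the byte thresholds visible in the listing (64, 128 and 192); everything else here is illustrative:

#include <vector>

enum class LoopAlign { Default, CacheLine, CacheLineWithPrefetch };

LoopAlign pickLoopAlignment(const std::vector<unsigned> &InstSizesInBytes) {
  unsigned LoopSize = 0;
  for (unsigned Sz : InstSizesInBytes) {
    LoopSize += Sz;
    if (LoopSize > 192)                       // too large to be worth aligning
      return LoopAlign::Default;
  }
  if (LoopSize <= 64)                         // already small, keep default
    return LoopAlign::Default;
  if (LoopSize <= 128)                        // align to the cache line only
    return LoopAlign::CacheLine;
  return LoopAlign::CacheLineWithPrefetch;    // 129..192: also prefetch the body
}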
18208 N = N->getOperand(0).getNode();
18209 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
18218 switch (N->getOpcode()) {
18226 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
18227 return !TRI->isSGPRReg(MRI, Reg);
18233 return !TRI->isSGPRReg(MRI, Reg);
18237 unsigned AS = L->getAddressSpace();
18241 case ISD::CALLSEQ_END:
18270 return A->readMem() && A->writeMem();
18291 switch (Ty.getScalarSizeInBits()) {
18303 const APInt &DemandedElts,
18306 unsigned Depth) const {
18311 if (Info->getMode().DX10Clamp)
18323 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
18343 << "Hardware instruction generated for atomic "
18345 << " operation at memory scope " << MemScope;
18350 Type *EltTy = VT->getElementType();
18351 return VT->getNumElements() == 2 &&
18371 unsigned BW = IT->getBitWidth();
18372 return BW == 32 || BW == 64;
18386 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
18387 return BW == 32 || BW == 64;
18390 if (Ty->isFloatTy() || Ty->isDoubleTy())
18394 return VT->getNumElements() == 2 &&
18395 VT->getElementType()->getPrimitiveSizeInBits() == 16;
18405 bool HasSystemScope) {
18412 if (HasSystemScope) {
18421 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
18434 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
18460 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
18473 bool HasSystemScope =
18499 if (Subtarget->hasEmulatedSystemScopeAtomics())
18515 if (!HasSystemScope &&
18516 Subtarget->supportsAgentScopeFineGrainedRemoteMemoryAtomics())
18528 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
18536 ConstVal && ConstVal->isNullValue())
18574 if (Ty->isFloatTy()) {
18579 if (Ty->isDoubleTy()) {
18600 if (Ty->isFloatTy() &&
18601 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
18614 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18618 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
18622 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
18627 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
18632 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
18636 if (Ty->isFloatTy()) {
18639 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18642 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18647 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
18655 if (Subtarget->hasFlatAtomicFaddF32Inst())
18664 if (Subtarget->hasLDSFPAtomicAddF32()) {
18665 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
18667 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
18695 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
18697 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
18701 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
18703 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
18756 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
18757 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
18758 : &AMDGPU::SReg_32RegClass;
18759 if (!TRI->isSGPRClass(RC) && !isDivergent)
18760 return TRI->getEquivalentSGPRClass(RC);
18761 if (TRI->isSGPRClass(RC) && isDivergent)
18762 return TRI->getEquivalentVGPRClass(RC);
18774 unsigned WaveSize) {
18779 if (!IT || IT->getBitWidth() != WaveSize)
18784 if (!Visited.insert(V).second)
18786 bool Result = false;
18787 for (const auto *U : V->users()) {
18789 if (V == U->getOperand(1)) {
18794 case Intrinsic::amdgcn_if_break:
18795 case Intrinsic::amdgcn_if:
18796 case Intrinsic::amdgcn_else:
18801 if (V == U->getOperand(0)) {
18806 case Intrinsic::amdgcn_end_cf:
18807 case Intrinsic::amdgcn_loop:
18813 Result = hasCFUser(U, Visited, WaveSize);
18822 const Value *V) const {
18824 if (CI->isInlineAsm()) {
18833 for (auto &TC : TargetConstraints) {
18847 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
18875 return MRI.hasOneNonDBGUse(N0);
18882 if (I.getMetadata("amdgpu.noclobber"))
18884 if (I.getMetadata("amdgpu.last.use"))
18948 Alignment = RMW->getAlign();
18961 bool FullFlatEmulation =
18963 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
18964 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
18965 RMW->getType()->isDoubleTy()));
18968 bool ReturnValueIsUsed = !AI->use_empty();
18977 if (FullFlatEmulation) {
18988 std::prev(BB->end())->eraseFromParent();
18989 Builder.SetInsertPoint(BB);
18991 Value *LoadedShared = nullptr;
18992 if (FullFlatEmulation) {
18993 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
18994 {Addr}, nullptr, "is.shared");
18995 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
18996 Builder.SetInsertPoint(SharedBB);
18997 Value *CastToLocal = Builder.CreateAddrSpaceCast(
19003 LoadedShared = Clone;
19005 Builder.CreateBr(PhiBB);
19006 Builder.SetInsertPoint(CheckPrivateBB);
19009 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
19010 {Addr}, nullptr, "is.private");
19011 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
19013 Builder.SetInsertPoint(PrivateBB);
19015 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
19018 Value *LoadedPrivate;
19020 LoadedPrivate = Builder.CreateAlignedLoad(
19021 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
19024 LoadedPrivate, RMW->getValOperand());
19026 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
19028 auto [ResultLoad, Equal] =
19034 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
19037 Builder.CreateBr(PhiBB);
19039 Builder.SetInsertPoint(GlobalBB);
19043 if (FullFlatEmulation) {
19044 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
19053 if (!FullFlatEmulation) {
19058 MDNode *RangeNotPrivate =
19061 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
19065 Builder.CreateBr(PhiBB);
19067 Builder.SetInsertPoint(PhiBB);
19069 if (ReturnValueIsUsed) {
19072 if (FullFlatEmulation)
19079 Builder.CreateBr(ExitBB);
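The IRBuilder code above expands a flat atomicrmw into an address-space dispatch: an amdgcn.is.shared test guarding an LDS clone of the atomic, an amdgcn.is.private test guarding a plain scratch load/op/store sequence, a global/flat path otherwise, and a PHI joining the results. A schematic source-level view of that control flow for fadd (all callables are placeholders for the generated blocks, not real APIs):

#include <functional>

float expandFlatAtomicFAdd(
    float *Addr, float Val,
    const std::function<bool(float *)> &IsShared,     // amdgcn.is.shared
    const std::function<bool(float *)> &IsPrivate,    // amdgcn.is.private
    const std::function<float(float *, float)> &LDSAtomic,
    const std::function<float(float *, float)> &GlobalAtomic) {
  if (IsShared(Addr))               // SharedBB: atomic cloned onto the LDS pointer
    return LDSAtomic(Addr, Val);
  if (IsPrivate(Addr)) {            // PrivateBB: scratch is thread-private, so a
    float Loaded = *Addr;           // plain load, add and store are sufficient
    *Addr = Loaded + Val;
    return Loaded;                  // old value feeds the join PHI
  }
  return GlobalAtomic(Addr, Val);   // GlobalBB: global/flat atomic path
}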
19083 unsigned PtrOpIdx) {
19084 Value *PtrOp = I->getOperand(PtrOpIdx);
19091 I->setOperand(PtrOpIdx, ASCast);
19103 ConstVal && ConstVal->isNullValue()) {
19133 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
19141 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
19156 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
unsigned const MachineRegisterInfo * MRI
static unsigned getIntrinsicID(const SDNode *N)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
static bool isUndef(const MachineInstr &MI)
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
static constexpr MCPhysReg SPReg
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static void getCoopAtomicOperandsInfo(const CallInst &CI, bool IsLoad, TargetLoweringBase::IntrinsicInfo &Info)
static uint64_t getIdentityValueFor64BitWaveReduction(unsigned Opc)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static MachineBasicBlock * Expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static uint32_t getIdentityValueFor32BitWaveReduction(unsigned Opc)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
bool empty() const
empty - Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Min
*p = old <signed v ? old : v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowerinInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
bool hasMinimum3Maximum3F32() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
const SIInstrInfo * getInstrInfo() const override
bool hasMin3Max3PKF16() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasMinimum3Maximum3PKF16() const
bool hasGloballyAddressableScratch() const
bool hasMinimum3Maximum3F16() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasEmulatedSystemScopeAtomics() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
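The live-in/virtual-register pairing above is what argument lowering typically does when a value arrives in a physical register; a hedged sketch, with PhysReg and RC as assumed inputs:
  static Register copyLiveIn(MachineFunction &MF, MCRegister PhysReg,
                             const TargetRegisterClass *RC) {
    // Record the physical register as live into the function and get the
    // virtual register that the rest of the code should use.
    Register VReg = MF.addLiveIn(PhysReg, RC);
    MF.getRegInfo().clearKillFlags(VReg);
    return VReg;
  }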
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
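These builder methods chain off BuildMI; the fragment below is only a shape sketch, with MBB, I, DL, TII, Opcode and the registers all assumed to come from the surrounding emitter.
  BuildMI(MBB, I, DL, TII->get(Opcode), DstReg)
      .addReg(SrcReg)        // register operand
      .addImm(0)             // immediate operand
      .cloneMemRefs(OrigMI); // reuse the original instruction's memory operands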
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
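A sketch of allocating a memory operand with these flags, assuming a MachineFunction MF and the usual LLT/Align helpers in scope; the 32-bit, 4-byte-aligned dereferenceable load is illustrative only.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(), // unknown location
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));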
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
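A short sketch of operand inspection and rewriting; MI and NewReg are assumed to come from the caller.
  MachineOperand &MO = MI.getOperand(0);
  if (MO.isReg() && MO.getSubReg() == 0)
    MO.setReg(NewReg);                                     // retarget the register operand
  MachineOperand ImmOp = MachineOperand::CreateImm(42);    // standalone immediate operand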
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
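The typical way these accessors are combined in a DAG combine is sketched below; Op is an assumed SDValue handed in by a combine hook, and the "operand 1 is a constant" assumption is purely illustrative.
  if (Op.getOpcode() == ISD::ADD && Op.hasOneUse()) {
    SDValue LHS = Op.getOperand(0);
    uint64_t RHSVal = Op.getNode()->getConstantOperandVal(1);
    EVT VT = Op.getValueType();
    (void)LHS; (void)RHSVal; (void)VT; // feed these into a rewritten node
  }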
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if the operand is known to never be any NaN; if SNaN is true, returns true only if it is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns whether it is reasonable to merge stores into a store of size MemVT.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
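For orientation, the override pattern for two of the hooks listed above looks roughly like this in a hypothetical backend (the class name and return values are assumptions, not this target's policy):
  class MyTargetLowering : public TargetLowering {
  public:
    bool enableAggressiveFMAFusion(EVT VT) const override {
      return VT.isFloatingPoint(); // illustrative: always fuse FP mul/add
    }
    Align getPrefLoopAlignment(MachineLoop *ML) const override {
      return Align(16); // illustrative fixed loop alignment
    }
  };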
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
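A sketch of computing the split types for a wide value, assuming a SelectionDAG &DAG and a vector-typed SDValue Op from the surrounding lowering code:
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(Op.getValueType());
  // LoVT/HiVT can then be used with EXTRACT_SUBVECTOR to form the two halves.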
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
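A hedged sketch of the load/store helpers, assuming DAG, DL, an incoming Chain, a pointer SDValue Ptr and a value Val; MachinePointerInfo() stands in for a real memory location here.
  SDValue Ld = DAG.getLoad(MVT::i32, DL, Chain, Ptr, MachinePointerInfo());
  // Result 1 of a load is its output chain; thread it into the store.
  SDValue St = DAG.getStore(Ld.getValue(1), DL, Val, Ptr,
                            MachinePointerInfo(), Align(4));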
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
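As a shape example of node construction (not code from this lowering), assume a SelectionDAG &DAG, an SDLoc DL and two i32 SDValues A and B:
  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue Sum = DAG.getNode(ISD::ADD, DL, MVT::i32, A, B);
  // select (setult Sum, 1), A, B -- built with the SelectCC helper
  SDValue Sel = DAG.getSelectCC(DL, Sum, One, A, B, ISD::SETULT);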
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
constexpr bool empty() const
empty - Check if the string is empty.
constexpr size_t size() const
size - Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
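The StringSwitch idiom for classifying short strings (for example, inline-asm constraint codes) is sketched below; the mapping itself is made up, and the llvm namespace is assumed to be in scope.
  #include "llvm/ADT/StringSwitch.h"

  static bool isVectorLikeConstraint(StringRef Constraint) {
    return StringSwitch<bool>(Constraint)
        .Case("v", true)
        .Case("a", true)
        .Default(false);
  }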
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
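These configuration calls are normally issued from a TargetLowering subclass constructor; a generic sketch for a hypothetical target, where the register class, operations and Subtarget are placeholders:
  // Inside MyTargetLowering::MyTargetLowering(...):
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);   // placeholder class
  setOperationAction(ISD::SDIV, MVT::i32, Expand);        // no native divide
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  computeRegisterProperties(Subtarget.getRegisterInfo()); // after all classes are added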
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isZero() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
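A small sketch of how these enumerators are typically tested (the helper name is hypothetical, and it assumes the AMDGPUAS declarations above are in scope):
  static bool isGroupSegment(unsigned AS) {
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
  }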
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ TC_RETURN_GFX_WholeWave
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isGFX11(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isGFX1250(const MCSubtargetInfo &STI)
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ SMULO
Same for multiplication.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
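For orientation, a minimal sketch of these condition-code helpers (the free functions are declared alongside ISD::CondCode; the concrete codes below are illustrative):
// Swap the operand order of a signed integer comparison.
ISD::CondCode CC = ISD::SETLT;                            // X < Y, signed
ISD::CondCode Swapped = ISD::getSetCCSwappedOperands(CC); // Y > X, i.e. ISD::SETGT
bool IsSigned = ISD::isSignedIntSetCC(CC);                // true for SETLT/SETLE/SETGT/SETGE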
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > Tys={})
Return the function type for an intrinsic.
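For illustration, a hedged sketch of these lookup helpers; the intrinsic ID is an arbitrary example and getDeclarationIfExists is only available in newer LLVM releases:
// Query an intrinsic that may or may not already be declared in the module M.
if (Function *F = Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_workitem_id_x)) {
  FunctionType *FT = Intrinsic::getType(M->getContext(), Intrinsic::amdgcn_workitem_id_x);
  (void)FT; // Matches F->getFunctionType() for this non-overloaded intrinsic.
}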
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
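A hedged GlobalISel matching sketch built from these helpers (Reg, MRI, and the bound names are placeholders; header llvm/CodeGen/GlobalISel/MIPatternMatch.h is assumed):
using namespace llvm::MIPatternMatch;
std::optional<FPValueAndVReg> FPValReg;
Register Src;
if (mi_match(Reg, MRI, m_Neg(m_Reg(Src)))) {
  // Reg is defined as G_SUB 0, Src.
} else if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
  // Reg is a G_FCONSTANT or a build-vector splat of one; value captured in FPValReg.
}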
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
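A short sketch of IR-level matching with these helpers from llvm/IR/PatternMatch.h (V and Base are hypothetical Value pointers):
using namespace llvm::PatternMatch;
Value *X = nullptr;
// Recognize V == Base + (X << 1) and bind X in the process.
if (match(V, m_Add(m_Specific(Base), m_Shl(m_Value(X), m_One())))) {
  // V computes Base + 2*X.
}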
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
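A hedged usage sketch (the container and predicate are purely illustrative):
SmallVector<int, 4> Vals = {1, 2, 3, 4};
bool AllPositive = llvm::all_of(Vals, [](int V) { return V > 0; }); // true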
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
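A minimal sketch of the builder interface; the opcode, registers, and insertion point are placeholders, and the MachineBasicBlock/iterator overload used below behaves like the MachineFunction form documented here:
// Materialize an immediate into DstReg before InsertPt.
BuildMI(MBB, InsertPt, DL, TII->get(AMDGPU::S_MOV_B32), DstReg)
    .addImm(0);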
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
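A small sketch of the casting utilities (Val is a hypothetical SDValue; see also isa<> and cast<> further below):
// Safe downcast: yields null if the node is not a ConstantSDNode.
if (auto *C = dyn_cast<ConstantSDNode>(Val.getNode())) {
  uint64_t Imm = C->getZExtValue();
  (void)Imm;
}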
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
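A few hedged examples of the bit utilities documented above (llvm/Support/MathExtras.h and llvm/ADT/bit.h; values chosen for illustration):
static_assert(llvm::isInt<16>(-32768), "fits a signed 16-bit immediate");
unsigned TZ = llvm::countr_zero(0x80u);            // 7
bool FieldMask = llvm::isShiftedMask_64(0x0ff0);   // contiguous run of ones -> true
uint64_t Base = llvm::alignDown(37, 16);           // 32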
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
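A hedged sketch of the usual debug-output idiom built on dbgs() (assumes llvm/Support/Debug.h, a DEBUG_TYPE definition, and a build with assertions or -debug support):
LLVM_DEBUG(dbgs() << "widening vector load\n");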
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
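A hedged sketch of the alignment helpers from llvm/Support/Alignment.h (byte counts are illustrative):
Align A(16);
Align Common = commonAlignment(A, /*Offset=*/8);   // Align(8): still valid 8 bytes past an A-aligned base
uint64_t Padded = alignTo(/*Size=*/10, Align(8));  // 16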
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
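For illustration only, a hedged sketch of building such descriptors during AMDGPU argument lowering (the register choice and stack offset are placeholders):
ArgDescriptor InReg = ArgDescriptor::createRegister(AMDGPU::VGPR0, /*Mask=*/~0u);
ArgDescriptor OnStack = ArgDescriptor::createStack(/*Offset=*/16);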
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
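A hedged sketch exercising a few of the EVT queries above (Ctx stands in for an LLVMContext; header llvm/CodeGen/ValueTypes.h):
EVT VT = EVT::getVectorVT(Ctx, MVT::f16, 4);   // v4f16
unsigned NumElts = VT.getVectorNumElements();  // 4
EVT IntVT = VT.changeTypeToInteger();          // v4i16
bool ByteSized = VT.isByteSized();             // 64 bits -> true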
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
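A hedged sketch of the KnownBits operations listed above (llvm/Support/KnownBits.h; widths and constants are illustrative):
KnownBits LHS = KnownBits::makeConstant(APInt(32, 0x10));
KnownBits RHS(32);                              // nothing known about RHS
KnownBits Sum = KnownBits::add(LHS, RHS);
unsigned LeadZ = Sum.countMinLeadingZeros();
KnownBits Wide = Sum.zext(64);                  // upper 32 bits become known zero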
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
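A hedged sketch of building pointer info for memory operands (MF and FI are placeholders for a MachineFunction and a frame index):
MachinePointerInfo FixedSlot = MachinePointerInfo::getFixedStack(MF, FI);
MachinePointerInfo StackRel  = MachinePointerInfo::getStack(MF, /*Offset=*/0);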
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const