#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
48 "amdgpu-disable-loop-alignment",
49 cl::desc(
"Do not align and prefetch loops"),
53 "amdgpu-use-divergent-register-indexing",
55 cl::desc(
"Use indirect register addressing for divergent indexes"),
  return Info->getMode().allFP32Denormals();

  return Info->getMode().allFP64FP16Denormals();

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
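
// The MVT type lists below appear to come from the SITargetLowering
// constructor, which registers operation actions (Custom, Expand, Legal) for
// scalar and vector value types; only the type-list arguments and a few loop
// headers survive in this excerpt.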
  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
   MVT::i1, MVT::v32i32},

  {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

  {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
   MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
   MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
  {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
   MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
   MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

  {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
   MVT::v3i16, MVT::v4i16, MVT::Other},

  {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

  {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
   MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
   MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
   MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
   MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
   MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
   MVT::v32i32, MVT::v32f32}) {
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {

  for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {

  for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {

  for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {

  for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
  {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

  {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
   MVT::v4i16, MVT::v4f16},

  {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

  {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
   MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
   MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
   MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  {MVT::f32, MVT::f64}, Legal);

  for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
                 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {

  {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
  {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);

  for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {

  {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
   MVT::v16f16, MVT::v16i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})

  {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
   MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},

  {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
   MVT::v2i16, MVT::v2f16},

  {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
   MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,

  {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
   MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
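
// The next fragments appear to be from isFPExtFoldable (gated on the
// subtarget's mad-mix / fma-mix support) and from the calling-convention
// queries that map illegal vector types onto 16- and 32-bit registers
// (getRegisterTypeForCallingConv and getNumRegistersForCallingConv).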
                                          EVT DestVT, EVT SrcVT) const {

                                          LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;

      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;

    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

  if (Size < 16 && Subtarget->has16BitInsts()) {

    RegisterVT = MVT::i16;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = ScalarVT;
    NumIntermediates = NumElts;
    return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
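
// getVectorTypeBreakdownForCallingConv above chooses an intermediate type and
// a 16- or 32-bit register type per piece. The fragments that follow appear
// to be from the memVT helpers and getTgtMemIntrinsic, which classify AMDGPU
// buffer, image, DS, and atomic intrinsics and fill in their memory-operand
// info (memVT, ptrVal, access flags).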
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

                                          unsigned IntrID) const {

    if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {

    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {

    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;
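
// The case list below appears to be from getAddrModeArguments, which reports
// the pointer operand of these intrinsics so addressing modes can be matched
// against it; the isLegal*AddressingMode helpers that follow encode the flat
// and MUBUF addressing constraints.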
                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 ||

  return AM.Scale == 0 &&

    return isLegalFlatAddressingMode(AM);

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return isLegalMUBUFAddressingMode(AM);

    return isLegalFlatAddressingMode(AM);
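
// The fragments below appear to be from allowsMisalignedMemoryAccessesImpl:
// each class of memory access reports the alignment it requires and, through
// *IsFast, a rough speed estimate; getOptimalMemOpType-style size/alignment
// checks follow.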
    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

      *IsFast = (Alignment >= RequiredAlignment) ? 64
                : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? 96
                : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

      *IsFast = (Alignment >= RequiredAlignment) ? 128
                : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {

                                            Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&
      Op.isDstAligned(Align(4)))

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
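
// Short helpers appear to follow: they cast the SDNode to MemSDNode for
// uniformity / no-clobber queries and defer address-space cast legality to
// the TargetMachine.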
  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {
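
// Kernel-argument lowering fragments appear to follow: computing a pointer
// into the kernarg segment, extending or rounding the loaded value to the
// argument type (convertArgType), and lowerKernargMemParameter /
// lowerStackParameter for entry-function and stack-passed arguments.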
  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                            const SDLoc &SL) const {

                                            const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();

      ExtType, SL, VA.getLocVT(), Chain, FIN,
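
// The loop below appears to handle pixel-shader inputs: unused inputs that
// were never allocated are skipped, and the PS-input allocated/enabled bits
// are recorded for the ones that are used.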
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

        if (Arg->Flags.isSplit()) {
          while (!Arg->Flags.isSplitEnd()) {

                    Arg->VT.getScalarSizeInBits() == 16) &&
                   "unexpected vector split in ps argument type");

        Skipped.set(Arg->getOrigArgIndex());

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);
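
// Special-input allocation fragments appear to follow: workitem IDs are
// received in VGPR0-VGPR2 for entry functions, and user SGPRs (dispatch ptr,
// queue ptr, kernarg segment ptr, dispatch id, flat scratch init, LDS kernel
// id, etc.) are added as live-ins and allocated in the CC state.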
  if (Info.hasWorkItemIDX()) {

    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

        Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {

    unsigned Reg = AMDGPU::VGPR1;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (Info.hasWorkItemIDZ()) {

    unsigned Reg = AMDGPU::VGPR2;
    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

                             unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasDispatchPtr())

  if (Info.hasQueuePtr() &&

  if (Info.hasImplicitArgPtr())

  if (Info.hasDispatchID())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

  if (Info.hasImplicitBufferPtr()) {

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasPrivateSegmentBuffer()) {

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

  if (Info.hasDispatchPtr()) {

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasQueuePtr() &&

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasKernargSegmentPtr()) {

  if (Info.hasDispatchID()) {

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasLDSKernelId()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
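
// System-SGPR allocation appears to follow (workgroup IDs, workgroup info,
// private segment wave byte offset), then the reservation of the scratch
// resource, stack pointer, and frame pointer registers, and the split-CSR
// support used for non-entry functions.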
                                           bool IsShader) const {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDZ()) {
    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {

        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)

              TII->get(TargetOpcode::COPY), *I)
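
// LowerFormalArguments fragments appear to follow: diagnosing unsupported
// shader configurations, seeding the PS input mask, and then walking the
// incoming argument list to load kernel arguments from the kernarg segment or
// from registers and the stack.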
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

  Info->allocateKnownAddressLDSGlobal(Fn);

    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
           !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
           !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {

      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.Flags.isByRef()) {

                Arg.Flags.getPointerAddrSpace())) {

                                     Arg.Flags.getPointerAddrSpace());

          DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(),
          &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (!Arg.Flags.isByVal())

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    if (Arg.Flags.isSRet()) {

  auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
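
// Return lowering appears to follow: the RVLocs loop copies return values
// into their assigned registers, callee-saved-via-copy registers are appended
// to the return operands for non-entry functions, and a final loop copies
// call results back out of their assigned locations.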
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
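
// passSpecialInputs-style fragments appear to follow: for each special input
// the callee needs (implicit argument pointer, workgroup IDs, packed workitem
// IDs), the corresponding incoming value in the caller is located and either
// placed in the callee's expected register or stored to its stack slot.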
  auto &ArgUsageInfo =

    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =

    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

        InputReg = getImplicitArgPtr(DAG, DL);

        std::optional<uint32_t> Id =

        if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY :
        *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
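
// Tail-call eligibility checks and LowerCall appear to follow: divergent
// callees and mismatched callee-saved masks reject tail calls, unsupported
// call forms are diagnosed, the scratch resource descriptor is forwarded in
// SGPR0_SGPR1_SGPR2_SGPR3, arguments are copied to registers or stored to the
// outgoing argument area, and the call (or tail-call return) node is built.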
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  bool IsSibCall = false;
  bool IsThisReturn = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

        "unsupported call to variadic function ");

          "unsupported required tail call to function ");

        "unsupported call to a shader function ");

          "unsupported calling convention for call from "
          "graphics shader of function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail");

    if (!TailCallOpt && IsTailCall)

    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

                           Outs[i].Flags.getNonZeroByValAlign(),

  if (!MemOpChains.empty())

    for (auto &RegToPass : RegsToPass) {

                               RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);

  for (auto &RegToPass : RegsToPass) {

                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
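
// Dynamic stack allocation appears to follow (sizes are scaled by the
// wavefront size since the stack pointer is tracked in per-wave terms), then
// getRegisterByName maps the named physical registers m0, exec, and
// flat_scratch.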
  EVT VT = Op.getValueType();

  SDValue Tmp2 = Op.getValue(1);
  SDValue Tmp3 = Op.getOperand(2);

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

      DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));

  Align StackAlign = TFL->getStackAlign();
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

                         << ST.getWavefrontSizeLog2(),

  if (isa<ConstantSDNode>(Size))

    .Case("m0", AMDGPU::M0)
    .Case("exec", AMDGPU::EXEC)
    .Case("exec_lo", AMDGPU::EXEC_LO)
    .Case("exec_hi", AMDGPU::EXEC_HI)
    .Case("flat_scratch", AMDGPU::FLAT_SCR)
    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:
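
// MachineIR expansion helpers appear to follow: rewriting kill pseudos to
// their terminator form, splitting a block so a loop can be inserted before
// the remainder, and a fragment that reads a hardware register with
// S_GETREG_B32.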
  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));

static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
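
// Indirect vector-indexing machinery follows: a waterfall loop appears to
// readfirstlane a possibly divergent index, compare it against each lane, and
// save/restore EXEC while M0 (or the GPR index mode) selects the element;
// emitIndirectSrc/emitIndirectDst then use V_MOVRELS / indirect MOVREL writes,
// or the GPR-index pseudos, to do the actual element access.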
                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {

      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term), Exec)

                               unsigned InitResultReg, unsigned PhiReg,
                               int Offset, bool UseGPRIdxMode,
                               Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

      return Idx->getReg();

    Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();

  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
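
// EmitInstrWithCustomInserter cases follow: the scalar and vector 64-bit
// add/sub pseudos are split into 32-bit halves with carry (S_ADD_U32 /
// S_ADDC_U32 or V_ADD_CO_U32 / V_ADDC_U32), and the S_ADD_CO / S_SUB_CO
// pseudos appear to readfirstlane their operands into SGPRs before using the
// carry-consuming scalar ops.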
  switch (MI.getOpcode()) {
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)

                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();

  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);

        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();

  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {

      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                           : &AMDGPU::VReg_64RegClass;

                           : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);

        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);

        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);

        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);

        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;

    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();

  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

    Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);

            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);

            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);

        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);