#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
55 cl::desc(
"Do not align and prefetch loops"),
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc(
"Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
    if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg))
      return AMDGPU::SGPR0 + Reg;
  }
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},

                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
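// Note: the brace-enclosed MVT lists above and below are the value-type
// arguments of setOperationAction() calls in the SITargetLowering
// constructor; each call records whether an ISD opcode is Legal, Custom,
// Promote, or Expand for the listed types. A minimal illustrative call (not
// taken from this file) looks like:
//   setOperationAction(ISD::SELECT, MVT::i1, Promote);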
  for (MVT VT :
       {MVT::v8i32,  MVT::v8f32,  MVT::v9i32,  MVT::v9f32,  MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64,  MVT::v2f64,  MVT::v4i16,
        MVT::v4f16,  MVT::v3i64,  MVT::v3f64,  MVT::v6i32,  MVT::v6f32,
        MVT::v4i64,  MVT::v4f64,  MVT::v8i64,  MVT::v8f64,  MVT::v8i16,
        MVT::v8f16,  MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
        MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
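// The loops above walk the 64-bit-element vector types (v2i64/v2f64 through
// v16i64/v16f64) so that the same set of legalization actions can be applied
// to every width in one place.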
                     {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

                     {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
                      MVT::v4i16, MVT::v4f16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

                     {MVT::f32, MVT::f64}, Legal);
    for (MVT VT :
         {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
          MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

    for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
                      MVT::v32i16, MVT::v32f16}) {

                     {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
                      MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                     {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

                     {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
                      MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
                      MVT::v32i16, MVT::v32f16},

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::v2i16, MVT::v2f16, MVT::i128},

                     {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
                      MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
                      MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
                                           EVT DestVT, EVT SrcVT) const {

                                           LLT DestTy, LLT SrcTy) const {
      return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

    return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

      return NumElts * ((Size + 31) / 32);
unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (Size == 16 && Subtarget->has16BitInsts()) {
      if (ScalarVT == MVT::bf16) {
        RegisterVT = MVT::i32;
        IntermediateVT = MVT::v2bf16;
      } else {
        RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
        IntermediateVT = RegisterVT;
      }
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;
    }

    if (Size == 32) {
      RegisterVT = ScalarVT.getSimpleVT();
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size != 16 && Size <= 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;
    }

    if (Size > 32) {
      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;
    }

  return TargetLowering::getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
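// Worked example (assuming a subtarget with 16-bit instructions): breaking
// down v5f16 for a non-kernel calling convention takes the 16-bit branch
// above, so RegisterVT = IntermediateVT = MVT::v2f16 and
// NumIntermediates = (5 + 1) / 2 = 3, i.e. the value travels in three v2f16
// registers with one unused upper lane.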
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {

    Info.ptrVal = RsrcArg;

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;
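// getTgtMemIntrinsic() above describes each AMDGPU memory intrinsic to the
// generic SelectionDAG machinery by filling in IntrinsicInfo (opcode, memVT,
// ptrVal, and MachineMemOperand flags). Getting these right is what lets a
// call such as llvm.amdgcn.ds.ordered.add alias-analyze and schedule like an
// ordinary atomic memory access instead of an opaque side effect.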
                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {

bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
                                                 unsigned AddrSpace,
                                                 uint64_t FlatVariant) const {

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));
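// Illustrative consequence of the checks above (GFX9+-style flat-offset
// support is an assumption here): an access such as
//   flat_load_dword v0, v[0:1] offset:64
// is only formed when the subtarget reports that flat instructions take an
// immediate offset and the offset fits the encoding; otherwise the base
// address must be materialized with no offset or scale at all.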
    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

               : isLegalMUBUFAddressingMode(AM);
    unsigned Size, unsigned AddrSpace, Align Alignment,

      Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

      *IsFast = (Alignment >= RequiredAlignment) ? 64
                : (Alignment < Align(4))         ? 32
                                                 : 1;

      *IsFast = (Alignment >= RequiredAlignment) ? 96
                : (Alignment < Align(4))         ? 32
                                                 : 1;

      RequiredAlignment = Align(8);

      *IsFast = (Alignment >= RequiredAlignment) ? 128
                : (Alignment < Align(4))         ? 32
                                                 : 1;

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

  return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {

                                              Alignment, Flags, IsFast);
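// The LDS cases above grade misaligned DS accesses rather than just accepting
// or rejecting them: for example, a 64-bit access that is at least 4-byte
// aligned can still be a single operation (ds_read2_b32/ds_write2_b32 over
// adjacent dwords), so it reports a higher "fast" score than an access that
// is not even dword-aligned.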
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {
  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                                 const SDLoc &SL) const {

                                                 const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);
SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
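// Worked example for the sub-dword path above: an i16 kernel argument at byte
// offset 6 is read by loading the aligned dword at AlignDownOffset = 4,
// shifting the loaded i32 right by OffsetDiff * 8 = 16 bits, and then
// truncating/extending to the argument type, since kernarg loads are done at
// dword granularity here.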
                                 ExtType, SL, VA.getLocVT(), Chain, FIN,

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
      Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {

    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

                     Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {

      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (Info.hasWorkItemIDZ()) {

      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                                 unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);
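// Workitem IDs X, Y and Z arrive packed into a single VGPR: bits [9:0] hold
// the X id, [19:10] the Y id and [29:20] the Z id, which is why a 10-bit mask
// (0x3ff) is used when building the workitem ID argument descriptors below.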
  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {

    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {

    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())
  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
  unsigned LastExplicitArgOffset =

  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;

      assert(ArgLocs[ArgIdx].isMemLoc());

      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;

      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
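// The loop above implements kernel-argument preloading: roughly, as long as
// the "inreg" arguments form a contiguous prefix of the kernarg segment and
// fit in the remaining user SGPRs (counting any padding SGPRs needed to keep
// the offsets aligned), they are assigned SGPRs up front instead of being
// re-loaded from the kernarg segment with scalar loads.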
  if (Info.hasLDSKernelId()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

                                          bool IsShader) const {

  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDX()) {
    Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDY()) {
    Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDZ()) {
    Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
    if (!HasArchitectedSGPRs)
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {

        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

    for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      if (!MRI.isLiveIn(Reg)) {
        Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)

              TII->get(TargetOpcode::COPY), *I)

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

           !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {

      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

                        AMDGPU::SGPR2, AMDGPU::SGPR3},
  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                        TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                             PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

    auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
  auto &ArgUsageInfo =

  CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =

    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =

      if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX
                       : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
    CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");
  bool IsSibCall = false;
  bool IsThisReturn = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                              "unsupported call to variadic function ");

                              "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

    if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);

                               Outs[i].Flags.getNonZeroByValAlign(),

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {

                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {

                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                             Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                          TableEntry, EnumOffset);
                        .Case("m0", AMDGPU::M0)
                        .Case("exec", AMDGPU::EXEC)
                        .Case("exec_lo", AMDGPU::EXEC_LO)
                        .Case("exec_hi", AMDGPU::EXEC_HI)
                        .Case("flat_scratch", AMDGPU::FLAT_SCR)
                        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
                        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
                                 unsigned InitReg, unsigned ResultReg,
                                 unsigned PhiReg, unsigned InitSaveExecReg,
                                 int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {

      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term),
          Exec)
                                 unsigned InitResultReg, unsigned PhiReg,
                                 int Offset, bool UseGPRIdxMode,
                                 Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
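// The code above builds the classic "waterfall" loop used when a divergent
// VGPR value must feed a scalar-only operand (here the M0/index register):
// each iteration uses V_READFIRSTLANE_B32 to pick the index from the first
// active lane, compares it against every lane's index, runs the indexed
// access with EXEC restricted to the matching lanes, and then XORs those
// lanes out of EXEC, repeating until all lanes have been serviced.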
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();