#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
48 "amdgpu-disable-loop-alignment",
49 cl::desc(
"Do not align and prefetch loops"),
53 "amdgpu-use-divergent-register-indexing",
55 cl::desc(
"Use indirect register addressing for divergent indexes"),
  return Info->getMode().allFP32Denormals();

  return Info->getMode().allFP64FP16Denormals();

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
                                       EVT DestVT, EVT SrcVT) const {

                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (NumElts + 1) / 2;

  return NumElts * ((Size + 31) / 32);
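// Vector calling-convention breakdown: pairs of 16-bit elements are packed
// two per 32-bit register where possible, other element types get one
// intermediate per element, and elements wider than 32 bits are split into
// 32-bit pieces before deferring to the generic breakdown.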
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

      IntermediateVT = RegisterVT;
      NumIntermediates = (NumElts + 1) / 2;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);

  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));
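// Memory-operand setup for AMDGPU intrinsics: image intrinsics derive the
// accessed lanes from the dmask operand, the *_load_lds forms read the
// transfer width from an immediate operand, and the DS, flat and global
// atomic intrinsics listed below get explicit memory-operand information
// as well.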
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_csub: {
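// Addressing-mode legality: flat addressing never allows a scaled index and
// only tolerates a zero or limited immediate offset; buffer (MUBUF)
// addressing is validated separately, and other address spaces route to
// whichever of the two models applies.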
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {

    return AM.BaseOffs == 0 && AM.Scale == 0;

    return AM.Scale == 0 &&
           (AM.BaseOffs == 0 ||

  return AM.Scale == 0 &&

    return isLegalFlatAddressingMode(AM);

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return isLegalMUBUFAddressingMode(AM);

    return isLegalFlatAddressingMode(AM);
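// Misaligned-access handling: wider accesses report a "fast" width of
// 64/96/128 bits only when the required 4- or 8-byte alignment is met and
// degrade to 32-bit accesses otherwise; the remaining paths only ask for
// 4-byte alignment.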
    unsigned Size, unsigned AddrSpace, Align Alignment,

      Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

               Alignment >= Align(4) : Alignment != Align(2);

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
                                              Alignment, Flags, IsFast);
  if (Op.size() >= 16 &&
      Op.isDstAligned(Align(4)))

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                               unsigned Index) const {
  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                              const SDLoc &SL) const {

                                             const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())

  if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  if (Arg.Flags.isByVal()) {
    unsigned Size = Arg.Flags.getByValSize();

      ExtType, SL, VA.getLocVT(), Chain, FIN,
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
           "vector type argument should have been split");

        !Arg->Flags.isInReg() && PSInputNum <= 15) {
      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

        if (Arg->Flags.isSplit()) {
          while (!Arg->Flags.isSplitEnd()) {

                    Arg->VT.getScalarSizeInBits() == 16) &&
                   "unexpected vector split in ps argument type");

        Skipped.set(Arg->getOrigArgIndex());

      Info->markPSInputAllocated(PSInputNum);

        Info->markPSInputEnabled(PSInputNum);
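// Entry-function work-item IDs: IDs X, Y and Z are received in VGPR0-VGPR2
// and registered as 32-bit live-ins; the 0x3ff mask restricts an ID to its
// 10-bit field when several IDs share a packed register.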
  if (Info.hasWorkItemIDX()) {

    MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

                          Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {

      unsigned Reg = AMDGPU::VGPR1;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (Info.hasWorkItemIDZ()) {

      unsigned Reg = AMDGPU::VGPR2;
      MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

                             unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];

  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {

  if (Info.hasWorkItemIDY()) {

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
  if (Info.hasDispatchPtr())

  if (Info.hasImplicitArgPtr())

  if (Info.hasDispatchID())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

  if (Info.hasImplicitBufferPtr()) {

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasPrivateSegmentBuffer()) {

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

  if (Info.hasDispatchPtr()) {

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasKernargSegmentPtr()) {

  if (Info.hasDispatchID()) {

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

  if (Info.hasLDSKernelId()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
                                              bool IsShader) const {

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
                                      Info.hasWorkGroupIDY() +
                                      Info.hasWorkGroupIDZ() +
                                      Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {

      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDX()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDY()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupIDZ()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {

    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {

        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)

              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

  Info->allocateKnownAddressLDSGlobal(Fn);

    assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {

      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
    if ((PsInputBits & 0x7F) == 0 ||
        ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.Flags.isByRef()) {

                Arg.Flags.getPointerAddrSpace())) {

              Arg.Flags.getPointerAddrSpace());

          DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(),
          &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (!Arg.Flags.isByVal())

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    if (Arg.Flags.isSRet()) {

    auto &ArgUsageInfo =

    Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))

      else if (AMDGPU::SReg_32RegClass.contains(*I))

  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    auto &ArgUsageInfo =

    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);

    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =

    assert(IncomingArgRC == ArgRC);

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =

      if (Id.has_value()) {

      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

      unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?

    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX :
          IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
  bool IsSibCall = false;
  bool IsThisReturn = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                              "unsupported call to variadic function ");

                              "unsupported required tail call to function ");

                            "unsupported call to a shader function ");

                            "unsupported calling convention for call from "
                            "graphics shader of function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

                         "site marked musttail");

    if (!TailCallOpt && IsTailCall)

    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {

                              Outs[i].Flags.getNonZeroByValAlign(),

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {

                             RegToPass.second, InFlag);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);

  for (auto &RegToPass : RegsToPass) {

                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

    Ops.push_back(InFlag);

  Chain = Call.getValue(0);
  InFlag = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;

                         InVals, IsThisReturn,
                         IsThisReturn ? OutVals[0] : SDValue());
  EVT VT = Op.getValueType();

  SDValue Tmp2 = Op.getValue(1);
  SDValue Tmp3 = Op.getOperand(2);

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Align StackAlign = TFL->getStackAlign();
  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

                           << ST.getWavefrontSizeLog2(),

  if (isa<ConstantSDNode>(Size))
        .Case("m0", AMDGPU::M0)
        .Case("exec", AMDGPU::EXEC)
        .Case("exec_lo", AMDGPU::EXEC_LO)
        .Case("exec_hi", AMDGPU::EXEC_HI)
        .Case("flat_scratch", AMDGPU::FLAT_SCR)
        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::FLAT_SCR_LO:
    case AMDGPU::FLAT_SCR_HI:

    case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
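// Waterfall loop over a divergent index: V_READFIRSTLANE_B32 picks one lane's
// index into an SGPR, V_CMP_EQ_U32 selects every lane holding that same
// value, S_AND_SAVEEXEC executes them together, and S_XOR_*_term strips the
// handled lanes from EXEC before the loop repeats.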
                                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                                : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {

      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                                : AMDGPU::S_XOR_B64_term), Exec)
                                 unsigned InitResultReg, unsigned PhiReg, int Offset,
                                 bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                          InitResultReg, DstReg, PhiReg, TmpExec,
                          Offset, UseGPRIdxMode, SGPRIdxReg);

  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

    return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                            UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {

        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {

          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);