#define DEBUG_TYPE "amdgpu-resource-usage"
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
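// Usage sketch (hypothetical invocation, not part of this file): both knobs
// are plain cl::opt flags, so they can be overridden on the llc command line:
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=1024 kernel.ll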
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
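// Note: the two trailing `true` arguments of INITIALIZE_PASS mark the pass as
// CFG-only and as an analysis, so registration leaves the IR untouched.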
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}
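// Returns true if Reg has any use besides an implicit operand of a FLAT
// instruction; used below to decide whether FLAT_SCR is genuinely needed.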
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }
  return false;
}
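// The "extra" SGPRs added on top of the explicit count below cover the
// implicitly reserved pairs (VCC, and depending on the subtarget FLAT_SCRATCH
// and XNACK_MASK).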
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}
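// On subtargets with gfx90a-style unified register files, AGPRs and VGPRs are
// carved out of one pool, so the combined count is not a simple sum; the
// AMDGPU::getTotalNumVGPRs helper encodes the granularity rules.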
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}
bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);
  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences() == 0)
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (clAssumedStackSizeForExternalCall.getNumOccurrences() == 0)
      AssumedStackSizeForExternalCall = 0;
  }
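  // Rationale: code object v5 and later (and PAL) support dynamically sized
  // scratch, so the conservative compile-time assumptions above are only kept
  // when the user sets them explicitly.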
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }
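  // The post-order walk above guarantees that a callee's resource info is
  // already computed when each of its callers is analyzed; indirect call
  // targets are patched up afterwards in propagateIndirectCallRegisterUsage().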
  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions.
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }
  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }
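  // With calls present, the analysis walks every instruction instead: callee
  // register usage is not reflected in this function's MachineRegisterInfo,
  // so callee resource info is merged in at each call site below.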
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMPs to be used in asm, and they should always
          // have a width.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
          continue;
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }
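      // Illustrative example: a use of s[4:5] (SGPR_64) gives HWReg = 4 and
      // Width = 2, so MaxUsed = 5 and the function ends up reporting at least
      // 6 SGPRs once MaxSGPR + 1 is taken below.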
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();
        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);
        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.
            //
            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }
        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later.
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
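// Worked example (hypothetical numbers): if the scan ends with MaxSGPR = 13,
// MaxVGPR = 7, and a largest callee frame of 4096 bytes, the function reports
// 14 SGPRs, 8 VGPRs, and its own stack size plus 4096 bytes of private
// segment.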
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;
  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }
  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}
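// Since any non-entry function in the module is a potential target of an
// indirect call, callers containing indirect calls inherit the module-wide
// maxima computed above; this is conservative, but guarantees enough
// registers are reserved at dispatch time.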