#define DEBUG_TYPE "amdgpu-resource-usage"

static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
                                   ST.getTargetID().isXnackOnOrAny());
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();

  bool HasIndirectCall = false;

  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  // For code object v5 and later, only the minimum scratch size is tracked,
  // so the assumed sizes are reset unless the flags were set explicitly.
    AssumedStackSizeForDynamicSizeObjects = 0;
    AssumedStackSizeForExternalCall = 0;
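  // Analyze each generated machine function exactly once and cache the result
  // in CallGraphResourceInfo; the asserts below guard both assumptions.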
    if (!F || F->isDeclaration())
      continue;

    assert(MF && "function must have been generated already");

    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
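  // Functions not reached by the first walk over the call graph still need
  // resource information, so visit every node a second time.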
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM, AssumedStackSizeForDynamicSizeObjects,
                                AssumedStackSizeForExternalCall);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();
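// Computes the register counts, private segment (stack) size, and the
// VCC/flat-scratch/indirect-call flags for a single machine function.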
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM,
    uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

    Info.UsesFlatScratch = false;

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
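  // Fast path: if the function makes no calls, the highest physical register
  // marked as used directly gives the register counts.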
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }

    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;
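  // With calls, walk every instruction operand and track the highest register
  // index touched in each register file.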
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;
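        // Registers that are handled specially: ignored, tracked via dedicated
        // flags, or not expected to appear at all.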
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");
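        // Classify the register by its register class; each branch records
        // whether Reg is an SGPR, VGPR, or AGPR and how many 32-bit registers
        // it spans (IsSGPR/IsAGPR/Width; the per-class assignments are omitted
        // from this excerpt).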
        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
        } else {
          assert((AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
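      // For call instructions, merge the callee's resource usage (or a
      // conservative assumption for unknown callees) into this function's.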
      if (MI.isCall()) {
        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = getCalleeFunction(*CalleeOp);
        auto I = CallGraphResourceInfo.end();

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          if (!MI.isReturn()) {
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
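  // Hardware register indices are zero-based, so the final counts are the
  // highest index seen plus one.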
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
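// Functions that make indirect calls could reach any non-kernel function in
// the module, so raise their register counts to the maximum seen across all
// such potential callees.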
void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}