LLVM 20.0.0git
AMDGPUResourceUsageAnalysis.cpp
//===- AMDGPUResourceUsageAnalysis.cpp --- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> clAssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> clAssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
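
// Both options above are ordinary cl::opt flags, so they can be overridden on
// the llc command line; the values below are only illustrative:
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=2048 input.ll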

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  return cast<Function>(Op.getGlobal()->stripPointerCastsAndAliases());
}

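// Return true if Reg has at least one use that is not merely an implicit
// operand of a FLAT instruction, i.e. a use (such as inline assembly or an
// explicit read/write) that genuinely requires flat_scratch to be set up.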
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

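// Per-function entry point. The pass only records resource usage into
// ResourceInfo and always returns false (it never modifies MF); the collected
// numbers are consumed later when the hardware resource registers are filled
// in, as described in the file header.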
bool AMDGPUResourceUsageAnalysis::runOnMachineFunction(MachineFunction &MF) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();

  // By default, for code object v5 and later, track only the minimum scratch
  // size
  uint32_t AssumedStackSizeForDynamicSizeObjects =
      clAssumedStackSizeForDynamicSizeObjects;
  uint32_t AssumedStackSizeForExternalCall = clAssumedStackSizeForExternalCall;
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
          AMDGPU::AMDHSA_COV5 ||
      STI.getTargetTriple().getOS() == Triple::AMDPAL) {
    if (!clAssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!clAssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

  ResourceInfo = analyzeResourceUsage(MF, AssumedStackSizeForDynamicSizeObjects,
                                      AssumedStackSizeForExternalCall);

  return false;
}

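// Compute the resource footprint of a single machine function: private segment
// (stack) size, use of VCC and flat scratch, the highest SGPR/VGPR/AGPR index
// referenced, and the list of direct callees. Callee register usage itself is
// folded in later by the consumers of this information (see the indirect-call
// handling below).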
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, uint32_t AssumedStackSizeForDynamicSizeObjects,
    uint32_t AssumedStackSizeForExternalCall) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
    Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
    if (ST.hasMAIInsts())
      Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
    return Info;
  }

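  // The function contains calls, so walk every instruction operand and track
  // the highest register index of each class that is actually referenced; the
  // final register counts are derived from these maxima below.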
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  Info.CalleeSegmentSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

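        // Registers with dedicated roles (EXEC, M0, VCC, FLAT_SCR, trap
        // handler registers, ...) are either ignored, tracked through separate
        // flags such as UsesVCC, or must not appear at all; none of them count
        // toward the SGPR/VGPR/AGPR totals computed below.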
        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
            AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          // We only expect TTMP registers or registers that do not belong to
          // any RC.
          assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
                  AMDGPU::TTMP_64RegClass.contains(Reg) ||
                  AMDGPU::TTMP_128RegClass.contains(Reg) ||
                  AMDGPU::TTMP_256RegClass.contains(Reg) ||
                  AMDGPU::TTMP_512RegClass.contains(Reg) ||
                  !TRI.getPhysRegBaseClass(Reg)) &&
                 "Unknown register class");
        }
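        // Width is the size of the matched register class in 32-bit registers,
        // so the highest hardware register index this operand can touch is
        // HWReg + Width - 1.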
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        auto isSameFunction = [](const MachineFunction &MF, const Function *F) {
          return F == &MF.getFunction();
        };

        if (Callee && !isSameFunction(MF, Callee))
          Info.Callees.push_back(Callee);

        bool IsIndirect = !Callee || Callee->isDeclaration();

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            Info.CalleeSegmentSize = std::max(
                Info.CalleeSegmentSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect) {
          Info.CalleeSegmentSize =
              std::max(Info.CalleeSegmentSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        }
      }
    }
  }

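  // The tracked values are the highest register indices seen; the reported
  // counts are those maxima plus one (or zero for a class that was never used,
  // since each maximum starts at -1).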
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;

  return Info;
}