LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "AMDGPUTargetMachine.h"
24#include "GCNSubtarget.h"
29#include "R600AsmPrinter.h"
35#include "llvm/ADT/StringSet.h"
43#include "llvm/MC/MCAssembler.h"
44#include "llvm/MC/MCContext.h"
46#include "llvm/MC/MCStreamer.h"
47#include "llvm/MC/MCValue.h"
54
55using namespace llvm;
56using namespace llvm::AMDGPU;
57
58// This should get the default rounding mode from the kernel. We just set the
59// default here, but this could change if the OpenCL rounding mode pragmas are
60// used.
61//
62// The denormal mode here should match what is reported by the OpenCL runtime
63// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
64// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
65//
66// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
67// precision, and leaves single precision to flush all and does not report
68// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
69// CL_FP_DENORM for both.
70//
71// FIXME: It seems some instructions do not support single precision denormals
72// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
73// and sin_f32, cos_f32 on most parts).
74
75// We want to use these instructions, and using fp32 denormals also causes
76// instructions to run at the double precision rate for the device so it's
77// probably best to just report no single precision denormals.
84
85static AsmPrinter *
87 std::unique_ptr<MCStreamer> &&Streamer) {
88 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
89}
90
98
99namespace {
100class AMDGPUAsmPrinterHandler : public AsmPrinterHandler {
101protected:
102 AMDGPUAsmPrinter *Asm;
103
104public:
105 AMDGPUAsmPrinterHandler(AMDGPUAsmPrinter *A) : Asm(A) {}
106
107 void beginFunction(const MachineFunction *MF) override {}
108
109 void endFunction(const MachineFunction *MF) override { Asm->endFunction(MF); }
110
111 void endModule() override {}
112};
113} // End anonymous namespace
114
116 std::unique_ptr<MCStreamer> Streamer)
117 : AsmPrinter(TM, std::move(Streamer)) {
118 assert(OutStreamer && "AsmPrinter constructed without streamer");
119}
120
122 return "AMDGPU Assembly Printer";
123}
124
126 return &TM.getMCSubtargetInfo();
127}
128
130 if (!OutStreamer)
131 return nullptr;
132 return static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
133}
134
138
139void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
141
142 // TODO: Which one is called first, emitStartOfAsmFile or
143 // emitFunctionBodyStart?
144 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
145 initializeTargetID(M);
146
147 const Triple &TT = M.getTargetTriple();
148 if (TT.getOS() != Triple::AMDHSA && TT.getOS() != Triple::AMDPAL)
149 return;
150
152
153 if (TT.getOS() == Triple::AMDHSA) {
155 CodeObjectVersion);
156 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
157 }
158
159 if (TT.getOS() == Triple::AMDPAL)
161}
162
164 // Init target streamer if it has not yet happened
166 initTargetStreamer(M);
167
168 const Triple &TT = M.getTargetTriple();
169 if (TT.getOS() != Triple::AMDHSA)
171
172 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
173 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
174 if (TT.getOS() == Triple::AMDHSA) {
175 HSAMetadataStream->end();
176 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
177 (void)Success;
178 assert(Success && "Malformed HSA Metadata");
179 }
180}
181
183 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
184 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
185 const Function &F = MF->getFunction();
186
187 // TODO: We're checking this late, would be nice to check it earlier.
188 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
190 STM.getCPU() + " is only available on code object version 6 or better");
191 }
192
193 // TODO: Which one is called first, emitStartOfAsmFile or
194 // emitFunctionBodyStart?
195 if (!getTargetStreamer()->getTargetID())
196 initializeTargetID(*F.getParent());
197
198 const auto &FunctionTargetID = STM.getTargetID();
199 // Make sure function's xnack settings are compatible with module's
200 // xnack settings.
201 if (FunctionTargetID.isXnackSupported() &&
202 FunctionTargetID.getXnackSetting() != AMDGPU::TargetIDSetting::Any &&
203 FunctionTargetID.getXnackSetting() !=
204 getTargetStreamer()->getTargetID()->getXnackSetting()) {
205 OutContext.reportError(
206 {}, "xnack setting of '" + Twine(MF->getName()) +
207 "' function does not match module xnack setting");
208 return;
209 }
210 // Make sure function's sramecc settings are compatible with module's
211 // sramecc settings.
212 if (FunctionTargetID.isSramEccSupported() &&
213 FunctionTargetID.getSramEccSetting() != AMDGPU::TargetIDSetting::Any &&
214 FunctionTargetID.getSramEccSetting() !=
215 getTargetStreamer()->getTargetID()->getSramEccSetting()) {
216 OutContext.reportError(
217 {}, "sramecc setting of '" + Twine(MF->getName()) +
218 "' function does not match module sramecc setting");
219 return;
220 }
221
222 if (!MFI.isEntryFunction())
223 return;
224
225 if (STM.isMesaKernel(F) &&
226 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
227 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
228 AMDGPUMCKernelCodeT KernelCode;
229 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
230 KernelCode.validate(&STM, MF->getContext());
232 }
233
234 if (STM.isAmdHsaOS())
235 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
236}
237
238/// Set bits in a kernel descriptor MCExpr field:
239/// return ((Dst & ~Mask) | (Value << Shift))
240static const MCExpr *setBits(const MCExpr *Dst, const MCExpr *Value,
241 uint32_t Mask, uint32_t Shift, MCContext &Ctx) {
242 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
243 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
244 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
246 Ctx);
247 return Dst;
248}
249
251 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
252 if (!MFI.isEntryFunction())
253 return;
254
255 assert(TM.getTargetTriple().getOS() == Triple::AMDHSA);
256
257 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
258 MCContext &Ctx = MF->getContext();
259
261 getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo);
262
263 // Compute inst_pref_size using MCExpr label subtraction for exact code
264 // size. At this point .Lfunc_end has been emitted (by the base AsmPrinter)
265 // right after the function code, so (Lfunc_end - func_sym) gives the
266 // exact function code size in bytes.
267 if (STM.hasInstPrefSize()) {
268 const MCExpr *CodeSizeExpr = MCBinaryExpr::createSub(
271
272 uint32_t Mask, Shift, Width, CacheLineSize;
273 STM.getInstPrefSizeArgs(Mask, Shift, Width, CacheLineSize);
274 const MCExpr *InstPrefSize =
275 AMDGPUMCExpr::createInstPrefSize(CodeSizeExpr, Ctx);
277 setBits(KD.compute_pgm_rsrc3, InstPrefSize, Mask, Shift, Ctx);
278 }
279
280 auto &Streamer = getTargetStreamer()->getStreamer();
281 auto &Context = Streamer.getContext();
282 auto &ObjectFileInfo = *Context.getObjectFileInfo();
283 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
284
285 Streamer.pushSection();
286 Streamer.switchSection(&ReadOnlySection);
287
288 // CP microcode requires the kernel descriptor to be allocated on 64 byte
289 // alignment.
290 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
291 ReadOnlySection.ensureMinAlignment(Align(64));
292
293 SmallString<128> KernelName;
294 getNameWithPrefix(KernelName, &MF->getFunction());
296 STM, KernelName, KD, CurrentProgramInfo.NumVGPRsForWavesPerEU,
298 CurrentProgramInfo.NumSGPRsForWavesPerEU,
300 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
301 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
302 Context),
303 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
304
305 Streamer.popSection();
306}
307
309 Register RegNo = MI->getOperand(0).getReg();
310
312 raw_svector_ostream OS(Str);
313 OS << "implicit-def: "
314 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
315
316 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
317 OS << " : SGPR spill to VGPR lane";
318
319 OutStreamer->AddComment(OS.str());
320 OutStreamer->addBlankLine();
321}
322
324 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
326 return;
327 }
328
329 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
330 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
331 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
332 SmallString<128> SymbolName;
333 getNameWithPrefix(SymbolName, &MF->getFunction()),
336 }
337 if (DumpCodeInstEmitter) {
338 // Disassemble function name label to text.
339 DisasmLines.push_back(MF->getName().str() + ":");
340 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
341 HexLines.emplace_back("");
342 }
343
345}
346
348 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
349 // Write a line for the basic block label if it is not only fallthrough.
350 DisasmLines.push_back((Twine("BB") + Twine(getFunctionNumber()) + "_" +
351 Twine(MBB.getNumber()) + ":")
352 .str());
353 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
354 HexLines.emplace_back("");
355 }
357}
358
361 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
362 OutContext.reportError({},
363 Twine(GV->getName()) +
364 ": unsupported initializer for address space");
365 return;
366 }
367
368 const Triple::OSType OS = TM.getTargetTriple().getOS();
369 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
371 return;
372 // With object linking, LDS definitions should have been externalized
373 // by earlier passes (e.g. LDS lowering, named barrier lowering).
374 // Only declarations reach here, emitted as SHN_AMDGPU_LDS symbols
375 // so the linker can assign their offsets.
376 assert(GV->isDeclaration() &&
377 "LDS definitions should have been externalized when object "
378 "linking is enabled");
379 }
380
381 MCSymbol *GVSym = getSymbol(GV);
382
383 GVSym->redefineIfPossible();
384 if (GVSym->isDefined() || GVSym->isVariable())
385 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
386 "' is already defined");
387
388 const DataLayout &DL = GV->getDataLayout();
390 Align Alignment = GV->getAlign().value_or(Align(4));
391
392 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
393 emitLinkage(GV, GVSym);
394 auto *TS = getTargetStreamer();
395 TS->emitAMDGPULDS(GVSym, Size, Alignment);
396 return;
397 }
398
400}
401
403 const llvm::Triple &TT = M.getTargetTriple();
404 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
405
406 if (TT.getOS() == Triple::AMDHSA) {
407 switch (CodeObjectVersion) {
409 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
410 break;
412 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
413 break;
415 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
416 break;
417 default:
418 reportFatalUsageError("unsupported code object version");
419 }
420
421 addAsmPrinterHandler(std::make_unique<AMDGPUAsmPrinterHandler>(this));
422 }
423
425}
426
427/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
428///
429/// Remove dependency on GCNSubtarget and depend only on the necessary values
430/// for said occupancy computation. Should match computeOccupancy implementation
431/// without passing \p STM on.
432const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
433 const MCExpr *NumVGPRs,
434 unsigned DynamicVGPRBlockSize,
435 const GCNSubtarget &STM, MCContext &Ctx) {
436 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(STM);
437 unsigned Granule = IsaInfo::getVGPRAllocGranule(STM, DynamicVGPRBlockSize);
438 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(STM);
439
440 // Bake the per-function SGPR budget into the operands so the late-evaluated
441 // MCExpr stays arithmetic. The trap reservation in particular is implicit on
442 // amdhsa and lives on STM, not on the assembler's MCSubtargetInfo.
443 unsigned SGPRTotal = IsaInfo::getTotalNumSGPRs(STM);
444 unsigned SGPRGranule = IsaInfo::getSGPRAllocGranule(STM);
445 unsigned SGPRTrapReserve = STM.hasTrapHandler() ? IsaInfo::TRAP_NUM_SGPRS : 0;
446
447 auto CreateExpr = [&Ctx](unsigned Value) {
448 return MCConstantExpr::create(Value, Ctx);
449 };
450
451 // Zero SGPR count when SGPRs don't limit occupancy, so the MCExpr skips the
452 // SGPR term without having to test the generation itself.
453 const MCExpr *SGPRArg =
454 IsaInfo::isSGPROccupancyLimited(STM) ? NumSGPRs : CreateExpr(0);
455
457 {CreateExpr(MaxWaves), CreateExpr(Granule),
458 CreateExpr(TargetTotalNumVGPRs),
459 CreateExpr(InitOcc), CreateExpr(SGPRTotal),
460 CreateExpr(SGPRGranule),
461 CreateExpr(SGPRTrapReserve), SGPRArg, NumVGPRs},
462 Ctx);
463}
464
465void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
466 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
467 return;
468
470 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
471 MCSymbol *FnSym = TM.getSymbol(&F);
472
473 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
474 int64_t Val;
475 if (Value->evaluateAsAbsolute(Val)) {
476 Res = Val;
477 return true;
478 }
479 return false;
480 };
481
482 const uint64_t MaxScratchPerWorkitem =
484 MCSymbol *ScratchSizeSymbol =
485 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
486 uint64_t ScratchSize;
487 if (ScratchSizeSymbol->isVariable() &&
488 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
489 ScratchSize > MaxScratchPerWorkitem) {
490 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
491 DS_Error);
492 F.getContext().diagnose(DiagStackSize);
493 }
494
495 // Validate addressable scalar registers (i.e., prior to added implicit
496 // SGPRs).
497 MCSymbol *NumSGPRSymbol =
498 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
500 !STM.hasSGPRInitBug()) {
501 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
502 uint64_t NumSgpr;
503 if (NumSGPRSymbol->isVariable() &&
504 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
505 NumSgpr > MaxAddressableNumSGPRs) {
506 F.getContext().diagnose(DiagnosticInfoResourceLimit(
507 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
509 return;
510 }
511 }
512
513 MCSymbol *VCCUsedSymbol =
514 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
515 MCSymbol *FlatUsedSymbol =
516 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
517 uint64_t VCCUsed, FlatUsed, NumSgpr;
518
519 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
520 FlatUsedSymbol->isVariable() &&
521 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
522 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
523 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
524
525 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
526 // resolvable.
527 NumSgpr += IsaInfo::getNumExtraSGPRs(
528 STM, VCCUsed, FlatUsed,
529 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
531 STM.hasSGPRInitBug()) {
532 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
533 if (NumSgpr > MaxAddressableNumSGPRs) {
534 F.getContext().diagnose(DiagnosticInfoResourceLimit(
535 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
537 return;
538 }
539 }
540
541 MCSymbol *NumVgprSymbol =
542 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
543 MCSymbol *NumAgprSymbol =
544 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
545 uint64_t NumVgpr, NumAgpr;
546
547 MachineModuleInfo &MMI =
549 MachineFunction *MF = MMI.getMachineFunction(F);
550 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
551 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
552 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
553 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
554 unsigned MaxWaves = MFI.getMaxWavesPerEU();
555 uint64_t TotalNumVgpr =
556 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
557 uint64_t NumVGPRsForWavesPerEU =
558 std::max({TotalNumVgpr, (uint64_t)1,
559 (uint64_t)STM.getMinNumVGPRs(
560 MaxWaves, MFI.getDynamicVGPRBlockSize())});
561 uint64_t NumSGPRsForWavesPerEU = std::max(
562 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
563 const MCExpr *OccupancyExpr = createOccupancy(
564 STM.getOccupancyWithWorkGroupSizes(*MF).second,
565 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
566 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
568 uint64_t Occupancy;
569
570 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
571 F, "amdgpu-waves-per-eu", {0, 0}, true);
572
573 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
574 DiagnosticInfoOptimizationFailure Diag(
575 F, F.getSubprogram(),
576 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
577 "'" +
578 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
579 ", final occupancy is " + Twine(Occupancy));
580 F.getContext().diagnose(Diag);
581 return;
582 }
583 }
584 }
585}
586
587static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL,
588 bool IsReturnType) {
589 if (Ty->isVoidTy()) {
590 Enc += 'v';
591 return;
592 }
593 unsigned Bits = DL.getTypeSizeInBits(Ty);
594 // Zero-sized non-void types (e.g. `{}` or `[0 x i8]`) consume no ABI
595 // registers. For returns, emit the same no-result marker as void so the
596 // parameter encoding still has an explicit return-type prefix.
597 if (Bits == 0) {
598 if (IsReturnType)
599 Enc += 'v';
600 return;
601 }
602 if (Bits <= 32)
603 Enc += 'i';
604 else if (Bits <= 64)
605 Enc += 'l';
606 else
607 Enc.append(divideCeil(Bits, 32), 'i');
608}
609
610static std::string computeTypeId(const FunctionType *FTy,
611 const DataLayout &DL) {
612 std::string Enc;
613 appendTypeEncoding(Enc, FTy->getReturnType(), DL, /*IsReturnType=*/true);
614 for (Type *ParamTy : FTy->params())
615 appendTypeEncoding(Enc, ParamTy, DL, /*IsReturnType=*/false);
616 return Enc;
617}
618
619void AMDGPUAsmPrinter::collectCallEdge(const MachineInstr &MI) {
621 return;
622 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
623 const MachineOperand *Callee =
624 TII->getNamedOperand(MI, AMDGPU::OpName::callee);
625 if (!Callee || !Callee->isGlobal())
626 return;
627 DirectCallEdges.insert(
628 {getSymbol(&MF->getFunction()), getSymbol(Callee->getGlobal())});
629}
630
631void AMDGPUAsmPrinter::emitAMDGPUInfo(Module &M) {
633 return;
634
635 const NamedMDNode *LDSMD = M.getNamedMetadata("amdgpu.lds.uses");
636 bool HasLDSUses = LDSMD && LDSMD->getNumOperands() > 0;
637
638 const NamedMDNode *BarMD = M.getNamedMetadata("amdgpu.named_barrier.uses");
639 bool HasNamedBarriers = BarMD && BarMD->getNumOperands() > 0;
640
641 // Collect address-taken functions (with type IDs) and indirect call sites.
642 DenseMap<const Function *, std::string> AddrTakenTypeIds;
643 using IndirectCallInfo = std::pair<const Function *, std::string>;
645
646 for (const Function &F : M) {
647 bool IsKernel = AMDGPU::isKernel(F.getCallingConv());
648
649 if (!IsKernel && F.hasAddressTaken(/*PutOffender=*/nullptr,
650 /*IgnoreCallbackUses=*/false,
651 /*IgnoreAssumeLikeCalls=*/true,
652 /*IgnoreLLVMUsed=*/true)) {
653 AddrTakenTypeIds[&F] =
654 computeTypeId(F.getFunctionType(), M.getDataLayout());
655 }
656
657 if (F.isDeclaration())
658 continue;
659
660 StringSet<> SeenTypeIds;
661 for (const BasicBlock &BB : F) {
662 for (const Instruction &I : BB) {
663 const auto *CB = dyn_cast<CallBase>(&I);
664 if (!CB || !CB->isIndirectCall())
665 continue;
666 std::string TId =
667 computeTypeId(CB->getFunctionType(), M.getDataLayout());
668 if (SeenTypeIds.insert(TId).second)
669 IndirectCalls.push_back({&F, std::move(TId)});
670 }
671 }
672 }
673
674 if (FunctionInfos.empty() && DirectCallEdges.empty() && !HasLDSUses &&
675 !HasNamedBarriers && AddrTakenTypeIds.empty() && IndirectCalls.empty())
676 return;
677
678 AMDGPU::InfoSectionData Data;
679 Data.Funcs = std::move(FunctionInfos);
680
681 for (auto &[F, TypeId] : AddrTakenTypeIds) {
682 MCSymbol *Sym = getSymbol(F);
683 Data.TypeIds.push_back({Sym, TypeId});
684 }
685
686 for (auto &[CallerSym, CalleeSym] : DirectCallEdges)
687 Data.Calls.push_back({CallerSym, CalleeSym});
688 DirectCallEdges.clear();
689
690 if (HasLDSUses) {
691 for (const MDNode *N : LDSMD->operands()) {
692 auto *Func = mdconst::extract<Function>(N->getOperand(0));
693 auto *LdsVar = mdconst::extract<GlobalVariable>(N->getOperand(1));
694 Data.Uses.push_back({getSymbol(Func), getSymbol(LdsVar)});
695 }
696 }
697
698 if (HasNamedBarriers) {
699 for (const MDNode *N : BarMD->operands()) {
700 auto *BarVar = mdconst::extract<GlobalVariable>(N->getOperand(0));
701 MCSymbol *BarSym = getSymbol(BarVar);
702 for (unsigned I = 1, E = N->getNumOperands(); I < E; ++I) {
703 auto *Func = mdconst::extract<Function>(N->getOperand(I));
704 Data.Uses.push_back({getSymbol(Func), BarSym});
705 }
706 }
707 }
708
709 for (auto &[Caller, Enc] : IndirectCalls) {
710 MCSymbol *CallerSym = getSymbol(Caller);
711 Data.IndirectCalls.push_back({CallerSym, Enc});
712 }
713
715}
716
718 const Triple &TT = M.getTargetTriple();
719
720 // Pad with s_code_end to help tools and guard against instruction prefetch
721 // causing stale data in caches. Arguably this should be done by the linker,
722 // which is why this isn't done for Mesa.
723 // Don't do it if there is no code.
724 const MCSubtargetInfo &STI = *getGlobalSTI();
725 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
726 (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::AMDPAL)) {
728 if (TextSect->hasInstructions()) {
729 OutStreamer->switchSection(TextSect);
731 }
732 }
733
734 // Emit the unified .amdgpu.info section (per-function resources, call graph,
735 // LDS/named-barrier use edges, indirect calls, and address-taken type IDs).
736 emitAMDGPUInfo(M);
737
738 // Assign expressions which can only be resolved when all other functions are
739 // known.
740 RI.finalize(OutContext);
741
742 // Switch section and emit all GPR maximums within the processed module.
743 OutStreamer->pushSection();
744 MCSectionELF *MaxGPRSection =
745 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
746 OutStreamer->switchSection(MaxGPRSection);
748 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
749 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
750 OutStreamer->popSection();
751
752 // In the object-linking pipeline per-function resource MCExprs reference
753 // external callee symbols that cannot be evaluated here, so cross-TU limit
754 // checks would silently no-op for every non-leaf function. Defer resource
755 // sanity checking to the linker, which re-validates against the aggregated
756 // call graph in the combined .amdgpu.info metadata.
758 for (Function &F : M.functions())
759 validateMCResourceInfo(F);
760 }
761
762 RI.reset();
763
765}
766
767SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
769 raw_svector_ostream OSS(Str);
770 auto &Streamer = getTargetStreamer()->getStreamer();
771 auto &Context = Streamer.getContext();
772 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
773 printAMDGPUMCExpr(New, OSS, &MAI);
774 return Str;
775}
776
777// Print comments that apply to both callable functions and entry points.
778void AMDGPUAsmPrinter::emitCommonFunctionComments(
779 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
780 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
781 const AMDGPUMachineFunctionInfo *MFI) {
782 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
783 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
784 false);
785 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
786 if (NumAGPR && TotalNumVGPR) {
787 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
788 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
789 false);
790 }
791 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
792 false);
793 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
794 false);
795}
796
797const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
798 const MachineFunction &MF) const {
799 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
800 MCContext &Ctx = MF.getContext();
801 uint16_t KernelCodeProperties = 0;
802 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
803
804 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
805 KernelCodeProperties |=
806 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
807 }
808 if (UserSGPRInfo.hasDispatchPtr()) {
809 KernelCodeProperties |=
810 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
811 }
812 if (UserSGPRInfo.hasQueuePtr()) {
813 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
814 }
815 if (UserSGPRInfo.hasKernargSegmentPtr()) {
816 KernelCodeProperties |=
817 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
818 }
819 if (UserSGPRInfo.hasDispatchID()) {
820 KernelCodeProperties |=
821 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
822 }
823 if (UserSGPRInfo.hasFlatScratchInit()) {
824 KernelCodeProperties |=
825 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
826 }
827 if (UserSGPRInfo.hasPrivateSegmentSize()) {
828 KernelCodeProperties |=
829 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
830 }
831 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
832 KernelCodeProperties |=
833 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
834 }
835
836 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
837 // un-evaluatable at this point so it cannot be conditionally checked here.
838 // Instead, we'll directly shift the possibly unknown MCExpr into its place
839 // and bitwise-or it into KernelCodeProperties.
840 const MCExpr *KernelCodePropExpr =
841 MCConstantExpr::create(KernelCodeProperties, Ctx);
842 const MCExpr *OrValue = MCConstantExpr::create(
843 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
844 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
845 OrValue, Ctx);
846 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
847
848 return KernelCodePropExpr;
849}
850
851MCKernelDescriptor
852AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
853 const SIProgramInfo &PI) const {
854 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
855 const Function &F = MF.getFunction();
856 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
857 MCContext &Ctx = MF.getContext();
858
859 MCKernelDescriptor KernelDescriptor;
860
861 KernelDescriptor.group_segment_fixed_size =
863 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
864
865 Align MaxKernArgAlign;
866 KernelDescriptor.kernarg_size = MCConstantExpr::create(
867 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
868
869 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
870 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(STM, Ctx);
871 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
872
873 int64_t PGM_Rsrc3 = 1;
874 bool EvaluatableRsrc3 =
875 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
876 (void)PGM_Rsrc3;
877 (void)EvaluatableRsrc3;
879 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
880 static_cast<uint64_t>(PGM_Rsrc3) == 0);
881 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
882
883 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
884 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
885 Ctx);
886
887 return KernelDescriptor;
888}
889
891 // Init target streamer lazily on the first function so that previous passes
892 // can set metadata.
894 initTargetStreamer(*MF.getFunction().getParent());
895
896 ResourceUsage =
898 CurrentProgramInfo.reset(MF);
899
900 const AMDGPUMachineFunctionInfo *MFI =
901 MF.getInfo<AMDGPUMachineFunctionInfo>();
902 MCContext &Ctx = MF.getContext();
903
904 // The starting address of all shader programs must be 256 bytes aligned.
905 // Regular functions just need the basic required instruction alignment.
906 MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
907
909
910 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
912 // FIXME: This should be an explicit check for Mesa.
913 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
914 MCSectionELF *ConfigSection =
915 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
916 OutStreamer->switchSection(ConfigSection);
917 }
918
919 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
920
923 *ResourceUsage;
924 FunctionInfos.push_back(
925 {/*NumSGPR=*/static_cast<uint32_t>(RU.NumExplicitSGPR),
926 /*NumArchVGPR=*/static_cast<uint32_t>(RU.NumVGPR),
927 /*NumAccVGPR=*/static_cast<uint32_t>(RU.NumAGPR),
928 /*PrivateSegmentSize=*/static_cast<uint32_t>(RU.PrivateSegmentSize),
929 /*UsesVCC=*/RU.UsesVCC,
930 /*UsesFlatScratch=*/RU.UsesFlatScratch,
931 /*HasDynStack=*/RU.HasDynamicallySizedStack,
932 /*Sym=*/getSymbol(&MF.getFunction())});
933 }
934
935 if (MFI->isModuleEntryFunction()) {
936 getSIProgramInfo(CurrentProgramInfo, MF);
937 }
938
939 if (STM.isAmdPalOS()) {
940 if (MFI->isEntryFunction())
941 EmitPALMetadata(MF, CurrentProgramInfo);
942 else if (MFI->isModuleEntryFunction())
943 emitPALFunctionMetadata(MF);
944 } else if (!STM.isAmdHsaOS()) {
945 EmitProgramInfoSI(MF, CurrentProgramInfo);
946 }
947
948 DumpCodeInstEmitter = nullptr;
949 if (STM.dumpCode()) {
950 // For -dumpcode, get the assembler out of the streamer. This only works
951 // with -filetype=obj.
952 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
953 if (Assembler)
954 DumpCodeInstEmitter = Assembler->getEmitterPtr();
955 }
956
957 DisasmLines.clear();
958 HexLines.clear();
960
962
963 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
964 STM.hasMAIInsts());
965
966 {
969 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
970 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
971 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
972 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
973 OutContext),
974 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
975 OutContext),
976 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
977 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
978 OutContext),
979 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
980 OutContext),
981 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
982 OutContext),
983 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
984 OutContext));
985 }
986
987 // Emit _dvgpr$ symbol when appropriate.
988 emitDVgprSymbol(MF);
989
990 if (isVerbose()) {
991 MCSectionELF *CommentSection =
992 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
993 OutStreamer->switchSection(CommentSection);
994
995 if (!MFI->isEntryFunction()) {
997 OutStreamer->emitRawComment(" Function info:", false);
998
999 emitCommonFunctionComments(
1000 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
1001 ->getVariableValue(),
1002 STM.hasMAIInsts() ? RI.getSymbol(CurrentFnSym->getName(),
1003 RIK::RIK_NumAGPR, OutContext)
1004 ->getVariableValue()
1005 : nullptr,
1006 RI.createTotalNumVGPRs(MF, Ctx),
1007 RI.createTotalNumSGPRs(
1008 MF,
1009 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
1010 Ctx),
1011 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
1012 OutContext)
1013 ->getVariableValue(),
1014 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1015 return false;
1016 }
1017
1018 OutStreamer->emitRawComment(" Kernel info:", false);
1019 emitCommonFunctionComments(
1020 CurrentProgramInfo.NumArchVGPR,
1021 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
1022 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
1023 CurrentProgramInfo.ScratchSize,
1024 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
1025
1026 OutStreamer->emitRawComment(
1027 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
1028 OutStreamer->emitRawComment(
1029 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
1030 OutStreamer->emitRawComment(
1031 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
1032 " bytes/workgroup (compile time only)",
1033 false);
1034
1035 OutStreamer->emitRawComment(
1036 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
1037
1038 OutStreamer->emitRawComment(
1039 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
1040
1041 OutStreamer->emitRawComment(
1042 " NumSGPRsForWavesPerEU: " +
1043 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
1044 false);
1045 OutStreamer->emitRawComment(
1046 " NumVGPRsForWavesPerEU: " +
1047 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
1048 false);
1049
1050 if (STM.hasGFX90AInsts()) {
1051 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
1052 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
1053 AdjustedAccum = MCBinaryExpr::createMul(
1054 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
1055 OutStreamer->emitRawComment(
1056 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
1057 }
1058
1059 if (STM.hasGFX1250Insts())
1060 OutStreamer->emitRawComment(
1061 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
1062 false);
1063
1064 OutStreamer->emitRawComment(
1065 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
1066
1067 OutStreamer->emitRawComment(
1068 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
1069
1070 OutStreamer->emitRawComment(
1071 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
1072 getMCExprStr(CurrentProgramInfo.ScratchEnable),
1073 false);
1074 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
1075 Twine(CurrentProgramInfo.UserSGPR),
1076 false);
1077 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
1078 Twine(CurrentProgramInfo.TrapHandlerEnable),
1079 false);
1080 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
1081 Twine(CurrentProgramInfo.TGIdXEnable),
1082 false);
1083 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
1084 Twine(CurrentProgramInfo.TGIdYEnable),
1085 false);
1086 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
1087 Twine(CurrentProgramInfo.TGIdZEnable),
1088 false);
1089 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
1090 Twine(CurrentProgramInfo.TIdIGCompCount),
1091 false);
1092
1093 [[maybe_unused]] int64_t PGMRSrc3;
1095 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
1096 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
1097 static_cast<uint64_t>(PGMRSrc3) == 0));
1098 if (STM.hasGFX90AInsts()) {
1099 OutStreamer->emitRawComment(
1100 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
1101 getMCExprStr(MCKernelDescriptor::bits_get(
1102 CurrentProgramInfo.ComputePGMRSrc3,
1103 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
1104 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
1105 false);
1106 OutStreamer->emitRawComment(
1107 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
1108 getMCExprStr(MCKernelDescriptor::bits_get(
1109 CurrentProgramInfo.ComputePGMRSrc3,
1110 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
1111 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
1112 false);
1113 }
1114 }
1115
1116 if (DumpCodeInstEmitter) {
1117
1118 OutStreamer->switchSection(
1119 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
1120
1121 for (size_t i = 0; i < DisasmLines.size(); ++i) {
1122 std::string Comment = "\n";
1123 if (!HexLines[i].empty()) {
1124 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
1125 Comment += " ; " + HexLines[i] + "\n";
1126 }
1127
1128 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
1129 OutStreamer->emitBytes(StringRef(Comment));
1130 }
1131 }
1132
1133 return false;
1134}
1135
1136// When appropriate, add a _dvgpr$ symbol, with the value of the function
1137// symbol, plus an offset encoding one less than the number of VGPR blocks used
1138// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
1139// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
1140// used by a front-end to have functions that are chained rather than called,
1141// and a dispatcher that dynamically resizes the VGPR count before dispatching
1142// to a function.
//
// The block count is derived from CurrentProgramInfo.NumVGPRsForWavesPerEU.
// When that expression already folds to an absolute constant the encoded value
// is emitted as a constant; otherwise an equivalent symbolic MCExpr is built.
// The resulting symbol gets the visibility and linkage of the function.
//
// NOTE(review): this listing has gaps in its embedded line numbers (e.g. the
// rest of the enabling condition and the call that reports the over-limit
// message are not visible) - confirm against the upstream file.
1143void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
1145 if (MFI.isDynamicVGPREnabled() &&
1147 MCContext &Ctx = MF.getContext();
1148 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
1149
1150 const MCExpr *EncodedBlocks;
1151 MCValue NumVGPRs;
1152 if (CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
1153 NumVGPRs, nullptr) &&
1154 NumVGPRs.isAbsolute()) {
1155
1156 // Calculate number of VGPR blocks.
1157 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
1158 unsigned NumBlocks =
1159 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
1160
     // Too many blocks to encode: report it and emit no symbol at all.
1161 if (NumBlocks > AMDGPU::IsaInfo::MaxDynamicVGPRBlocks) {
1163 {}, "DVGPR block count " + Twine(NumBlocks) +
1164 " exceeds maximum of " +
1166 " for __dvgpr$ symbol for '" +
1167 Twine(CurrentFnSym->getName()) + "'");
1168 return;
1169 }
     // "Blocks minus one" is placed starting at bit 3 of the symbol value.
1170 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
1171 EncodedBlocks = MCConstantExpr::create(EncodedNumBlocks, Ctx);
1172 } else {
1173 // Value not yet available so build a symbolic MCExpr:
1174 // ((alignTo(max(NumVGPRs, 1), BlockSize) / BlockSize) - 1) << 3
1175 const MCExpr *One = MCConstantExpr::create(1, Ctx);
1176 const MCExpr *BlockSizeConst = MCConstantExpr::create(BlockSize, Ctx);
1177 const MCExpr *MaxVGPRs = AMDGPUMCExpr::createMax(
1178 {CurrentProgramInfo.NumVGPRsForWavesPerEU, One}, Ctx);
1179 const MCExpr *NumBlocks = MCBinaryExpr::createDiv(
1180 AMDGPUMCExpr::createAlignTo(MaxVGPRs, BlockSizeConst, Ctx),
1181 BlockSizeConst, Ctx);
1182 EncodedBlocks =
1184 MCConstantExpr::create(3, Ctx), Ctx);
1185 }
1186
1187 // Add to function symbol to create _dvgpr$ symbol.
1188 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
1189 MCSymbolRefExpr::create(CurrentFnSym, Ctx), EncodedBlocks, Ctx);
1190 MCSymbol *DVgprFuncSym =
1191 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
1192 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
1193 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
1194 emitLinkage(&MF.getFunction(), DVgprFuncSym);
1195 }
1196}
1197
// Settles the target streamer's target ID for module M. After the initial
// setup from the global subtarget's feature string, the functions of M are
// scanned in order; for each of xnack and sramecc that is supported and still
// set to 'Any', the setting of that function's subtarget is adopted. The scan
// stops as soon as both features are either unsupported or explicitly On/Off.
//
// NOTE(review): the call that performs the initial setup is only partly
// visible in this listing (a line is missing) - confirm against upstream.
1198// TODO: Fold this into emitFunctionBodyStart.
1199void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
1200 // In the beginning all features are either 'Any' or 'NotSupported',
1201 // depending on global target features. This will cover empty modules.
1203 getGlobalSTI()->getFeatureString());
1204
1205 // If module is empty, we are done.
1206 if (M.empty())
1207 return;
1208
1209 // If module is not empty, need to find first 'Off' or 'On' feature
1210 // setting per feature from functions in module.
1211 for (auto &F : M) {
1212 auto &TSTargetID = getTargetStreamer()->getTargetID();
     // Nothing left to decide: each feature is unsupported or already On/Off.
1213 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
1214 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
1215 break;
1216
1217 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
1218 const AMDGPU::TargetID &STMTargetID = STM.getTargetID();
     // Only overwrite a setting that is still 'Any'.
1219 if (TSTargetID->isXnackSupported())
1220 if (TSTargetID->getXnackSetting() == AMDGPU::TargetIDSetting::Any)
1221 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
1222 if (TSTargetID->isSramEccSupported())
1223 if (TSTargetID->getSramEccSetting() == AMDGPU::TargetIDSetting::Any)
1224 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
1225 }
1226}
1227
1228// AccumOffset computed for the MCExpr equivalent of:
1229// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
1230static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
1231 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
1232 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
1233
1234 // Can't be lower than 1 for subsequent alignTo.
1235 const MCExpr *MaximumTaken =
1236 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
1237
1238 // Practically, it's computing divideCeil(MaximumTaken, 4).
1239 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
1240 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
1241 Ctx);
1242
1243 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
1244}
1245
// Populates ProgInfo for MF. Register and scratch usage are expressed as
// MCExprs that reference the current function's resource-info symbols, so they
// may not be resolvable yet; limit checks below therefore only fire when an
// expression already folds to a constant. Besides register counts this fills
// in GPR block counts, float/IEEE/clamp mode, LDS and scratch block counts,
// the RSRC2-style enable flags, COMPUTE_PGM_RSRC3 fields and the occupancy,
// and emits diagnostics when a visible limit is exceeded.
//
// NOTE(review): this listing has gaps in its embedded line numbers; several
// assignment targets and call heads are not visible - confirm against the
// upstream file before relying on the exact statements.
1246void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1247 const MachineFunction &MF) {
1248 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1249 MCContext &Ctx = MF.getContext();
1250
1251 auto CreateExpr = [&Ctx](int64_t Value) {
1252 return MCConstantExpr::create(Value, Ctx);
1253 };
1254
     // Succeeds (and writes Res) only if Value already folds to a constant.
1255 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1256 int64_t Val;
1257 if (Value->evaluateAsAbsolute(Val)) {
1258 Res = Val;
1259 return true;
1260 }
1261 return false;
1262 };
1263
     // Reference to the current function's resource-info symbol of kind RIK.
1264 auto GetSymRefExpr =
1265 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1266 MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext);
1267 return MCSymbolRefExpr::create(Sym, Ctx);
1268 };
1269
1271 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1272 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1274 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1275
1276 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1277 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1278 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1279 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1280 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1281 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
     // A dynamically sized stack or recursion both count as a dynamic call stack.
1282 ProgInfo.DynamicCallStack =
1283 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1284 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1285
     // Named barrier count rounded up to units of 4.
1286 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1287 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1288 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1289 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1290
1291 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1292
1293 // The calculations related to SGPR/VGPR blocks are
1294 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1295 // unified.
1296 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1297 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1298 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1299
1300 // Check the addressable register limit before we add ExtraSGPRs.
1302 !STM.hasSGPRInitBug()) {
1303 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1304 uint64_t NumSgpr;
1305 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1306 NumSgpr > MaxAddressableNumSGPRs) {
1307 // This can happen due to a compiler bug or when using inline asm.
1308 LLVMContext &Ctx = MF.getFunction().getContext();
1309 Ctx.diagnose(DiagnosticInfoResourceLimit(
1310 MF.getFunction(), "addressable scalar registers", NumSgpr,
1311 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
1312 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1313 }
1314 }
1315
1316 // Add the extra SGPRs (computed above from VCC / flat-scratch use and XNACK).
1317 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1318
1319 const Function &F = MF.getFunction();
1320
1321 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1322 // dispatch registers are function args.
1323 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1324 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1325
1326 if (WaveDispatchNumSGPR) {
1328 {ProgInfo.NumSGPR,
1329 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1330 Ctx)},
1331 Ctx);
1332 }
1333
1334 if (WaveDispatchNumVGPR) {
1336 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1337
1339 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1340 }
1341
1342 // Adjust number of registers used to meet default/requested minimum/maximum
1343 // number of waves per execution unit request.
1344 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1345 ProgInfo.NumSGPRsForWavesPerEU =
1346 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1347 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1348 Ctx);
1349 ProgInfo.NumVGPRsForWavesPerEU =
1350 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1351 CreateExpr(STM.getMinNumVGPRs(
1352 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1353 Ctx);
1354
     // Second addressable-SGPR check, this time with ExtraSGPRs included.
1356 STM.hasSGPRInitBug()) {
1357 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1358 uint64_t NumSgpr;
1359 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1360 NumSgpr > MaxAddressableNumSGPRs) {
1361 // This can happen due to a compiler bug or when using inline asm to use
1362 // the registers which are usually reserved for vcc etc.
1363 LLVMContext &Ctx = MF.getFunction().getContext();
1364 Ctx.diagnose(DiagnosticInfoResourceLimit(
1365 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
1367 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1368 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1369 }
1370 }
1371
1372 if (STM.hasSGPRInitBug()) {
1373 ProgInfo.NumSGPR =
1375 ProgInfo.NumSGPRsForWavesPerEU =
1377 }
1378
1379 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1380 LLVMContext &Ctx = MF.getFunction().getContext();
1381 Ctx.diagnose(DiagnosticInfoResourceLimit(
1382 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
1384 }
1385
1386 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1387 LLVMContext &Ctx = MF.getFunction().getContext();
1388 Ctx.diagnose(DiagnosticInfoResourceLimit(
1389 MF.getFunction(), "local memory", MFI->getLDSSize(),
1391 }
1392 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1393 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1394 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1395 unsigned Granule) {
1396 const MCExpr *OneConst = CreateExpr(1ul);
1397 const MCExpr *GranuleConst = CreateExpr(Granule);
1398 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1399 const MCExpr *AlignToGPR =
1400 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1401 const MCExpr *DivGPR =
1402 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1403 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1404 return SubGPR;
1405 };
1406 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1408 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1409 } else {
1410 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1412 }
1413 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1415
1416 const SIModeRegisterDefaults Mode = MFI->getMode();
1417
1418 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1419 // register.
1420 ProgInfo.FloatMode = getFPMode(Mode);
1421
1422 ProgInfo.IEEEMode = Mode.IEEE;
1423
1424 // Make the clamp modifier return 0 on NaN input.
1425 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1426
     // Pick the shift used below to convert the LDS byte size into LDS blocks,
     // based on the subtarget's LDS granularity.
1427 unsigned LDSAlignShift = 8;
1428 switch (getLdsDwGranularity(STM)) {
1429 case 512:
1430 case 320:
1431 LDSAlignShift = 11;
1432 break;
1433 case 128:
1434 LDSAlignShift = 9;
1435 break;
1436 case 64:
1437 LDSAlignShift = 8;
1438 break;
1439 default:
1440 llvm_unreachable("invald LDS block size");
1441 }
1442
1443 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1444 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1445
1446 ProgInfo.LDSSize = MFI->getLDSSize();
1447 ProgInfo.LDSBlocks =
1448 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1449
1450 // The MCExpr equivalent of divideCeil.
1451 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1452 const MCExpr *Ceil =
1453 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1454 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1455 };
1456
1457 // Scratch is allocated in 64-dword or 256-dword blocks.
1458 unsigned ScratchAlignShift =
1459 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1460 // We need to program the hardware with the amount of scratch memory that
1461 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1462 // scratch memory used per thread.
1463 ProgInfo.ScratchBlocks = DivideCeil(
1465 CreateExpr(STM.getWavefrontSize()), Ctx),
1466 CreateExpr(1ULL << ScratchAlignShift));
1467
1468 if (STM.supportsWGP()) {
1469 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1470 }
1471
1472 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1473 ProgInfo.MemOrdered = 1;
1474 ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
1475 }
1476
1477 // 0 = X, 1 = XY, 2 = XYZ
1478 unsigned TIDIGCompCnt = 0;
1479 if (MFI->hasWorkItemIDZ())
1480 TIDIGCompCnt = 2;
1481 else if (MFI->hasWorkItemIDY())
1482 TIDIGCompCnt = 1;
1483
1484 // The private segment wave byte offset is the last of the system SGPRs. We
1485 // initially assumed it was allocated, and may have used it. It shouldn't harm
1486 // anything to disable it if we know the stack isn't used here. We may still
1487 // have emitted code reading it to initialize scratch, but if that's unused
1488 // reading garbage should be OK.
1491 MCConstantExpr::create(0, Ctx), Ctx),
1492 ProgInfo.DynamicCallStack, Ctx);
1493
1494 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1495 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1496 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1497 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1498 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1499 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1500 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1501 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1502 ProgInfo.EXCPEnMSB = 0;
1503 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1504 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1505 ProgInfo.EXCPEnable = 0;
1506
     // GFX90A: fold the accumulation offset and TG split into COMPUTE_PGM_RSRC3.
1507 if (STM.hasGFX90AInsts()) {
1508 ProgInfo.ComputePGMRSrc3 =
1509 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1510 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1511 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT, Ctx);
1512 ProgInfo.ComputePGMRSrc3 =
1513 setBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1514 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1515 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT, Ctx);
1516 }
1517
     // GFX1250: fold the named barrier count into COMPUTE_PGM_RSRC3.
1518 if (STM.hasGFX1250Insts())
1519 ProgInfo.ComputePGMRSrc3 =
1520 setBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1521 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1522 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT, Ctx);
1523
1524 ProgInfo.Occupancy = createOccupancy(
1525 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1527 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1528
     // If the occupancy is already known and falls short of the minimum given by
     // 'amdgpu-waves-per-eu', report an optimization-failure diagnostic.
1529 const auto [MinWEU, MaxWEU] =
1530 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1531 uint64_t Occupancy;
1532 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1533 DiagnosticInfoOptimizationFailure Diag(
1534 F, F.getSubprogram(),
1535 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1536 "'" +
1537 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1538 ", final occupancy is " + Twine(Occupancy));
1539 F.getContext().diagnose(Diag);
1540 }
1541}
1542
// Maps a calling convention to an unsigned resource register id. The result is
// what EmitProgramInfoSI emits as the leading 32-bit value for non-compute
// calling conventions.
// NOTE(review): the case labels and return statements of this switch are not
// visible in this listing (embedded line numbers jump from 1546 to 1561) -
// consult the upstream file for the actual mapping.
1543static unsigned getRsrcReg(CallingConv::ID CallConv) {
1544 switch (CallConv) {
1545 default:
1546 [[fallthrough]];
1561 }
1562}
1563
// Emits the program info of MF as a sequence of 32-bit values on OutStreamer.
// Compute calling conventions emit the COMPUTE_PGM_RSRC1 and COMPUTE_PGM_RSRC2
// values followed by the scratch block count field; all other calling
// conventions emit the resource register id from getRsrcReg, the packed
// VGPR/SGPR block counts and the scratch block count field. Pixel shaders
// additionally emit the extra LDS size and the PS input enable/address masks.
// The spilled SGPR and VGPR counts are always emitted last.
//
// NOTE(review): this listing has gaps in its embedded line numbers; some
// emitted values (presumably the register ids preceding each payload) are not
// visible - confirm against the upstream file.
1564void AMDGPUAsmPrinter::EmitProgramInfoSI(
1565 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1566 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1567 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1568 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1569 MCContext &Ctx = MF.getContext();
1570
1571 // (((Value) & Mask) << Shift)
1572 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1573 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1574 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1576 shft, Ctx);
1577 };
1578
     // Emit a plain integer when Value already folds to a constant; otherwise
     // emit the expression itself.
1579 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1580 int64_t Val;
1581 if (Value->evaluateAsAbsolute(Val))
1582 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1583 else
1584 OutStreamer->emitValue(Value, Size);
1585 };
1586
1587 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1589
1590 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1591 /*Size=*/4);
1592
1594 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx),
1595 /*Size=*/4);
1596
1598
1599 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1600 // appropriate generation.
1601 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1602 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1603 /*Mask=*/0x3FFFF, /*Shift=*/12),
1604 /*Size=*/4);
1605 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1606 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1607 /*Mask=*/0x7FFF, /*Shift=*/12),
1608 /*Size=*/4);
1609 } else {
1610 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1611 /*Mask=*/0x1FFF, /*Shift=*/12),
1612 /*Size=*/4);
1613 }
1614
1615 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1616 // 0" comment but I don't see a corresponding field in the register spec.
1617 } else {
1618 OutStreamer->emitInt32(RsrcReg);
1619
     // VGPR block count in bits 5..0, SGPR block count in bits 9..6.
1620 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1621 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1622 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1623 MF.getContext());
1624 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1626
1627 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1628 // appropriate generation.
1629 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1630 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1631 /*Mask=*/0x3FFFF, /*Shift=*/12),
1632 /*Size=*/4);
1633 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1634 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1635 /*Mask=*/0x7FFF, /*Shift=*/12),
1636 /*Size=*/4);
1637 } else {
1638 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1639 /*Mask=*/0x1FFF, /*Shift=*/12),
1640 /*Size=*/4);
1641 }
1642 }
1643
1644 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     // From GFX11 on, the extra LDS size is half the LDS block count, rounded up.
1646 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1647 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1648 : CurrentProgramInfo.LDSBlocks;
1649 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1651 OutStreamer->emitInt32(MFI->getPSInputEnable());
1653 OutStreamer->emitInt32(MFI->getPSInputAddr());
1654 }
1655
1656 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1657 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1658 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1659 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1660}
1661
1662// Helper function to add common PAL Metadata 3.0+
//
// Writes the hardware-stage fields shared by entry points and functions for
// calling convention CC: ".ieee_mode" (only when the subtarget has
// FeatureDX10ClampAndIEEEMode), ".wgp_mode", ".mem_ordered" and
// ".forward_progress". For compute calling conventions it also writes
// ".trap_present" and ".excp_en", and sets the compute register
// ".dynamic_vgpr_en" when DynamicVGPRBlockSize is non-zero. Finally ".lds_size"
// is written as LdsSize * LDS dword granularity * sizeof(uint32_t).
//
// NOTE(review): the first line of the signature and the head of the final
// call are not visible in this listing - confirm against the upstream file.
1664 const SIProgramInfo &CurrentProgramInfo,
1665 CallingConv::ID CC, const GCNSubtarget &ST,
1666 unsigned DynamicVGPRBlockSize) {
1667 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1668 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1669
1670 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1671 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1672 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1673
1674 if (AMDGPU::isCompute(CC)) {
1675 MD->setHwStage(CC, ".trap_present",
1676 (bool)CurrentProgramInfo.TrapHandlerEnable);
1677 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1678
1679 if (DynamicVGPRBlockSize != 0)
1680 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1681 }
1682
1684 CC, ".lds_size",
1685 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1686 sizeof(uint32_t)));
1687}
1688
1689// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1690// is AMDPAL. It stores each compute/SPI register setting and other PAL
1691// metadata items into the PALMD::Metadata, combining with any provided by the
1692// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1693// is then written as a single block in the .note section.
//
// The layout depends on the PAL metadata major version: below 3 the values are
// stored through the register-style setters (setRsrc1 / setRsrc2 / ...), from
// 3 on they are stored as named hardware-stage and graphics-register fields.
//
// NOTE(review): this listing has gaps in its embedded line numbers (e.g. in
// the dynamic VGPR handling) - confirm against the upstream file.
1694void AMDGPUAsmPrinter::EmitPALMetadata(
1695 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo) {
1696 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1697 auto CC = MF.getFunction().getCallingConv();
1698 auto *MD = getTargetStreamer()->getPALMetadata();
1699 auto &Ctx = MF.getContext();
1700
1701 MD->setEntryPoint(CC, MF.getFunction().getName());
1702 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1703
1704 // For targets that support dynamic VGPRs, set the number of saved dynamic
1705 // VGPRs (if any) in the PAL metadata.
1706 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1707 if (MFI->isDynamicVGPREnabled() &&
1709 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1711
1712 // Only set AGPRs for supported devices
1713 if (STM.hasMAIInsts()) {
1714 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1715 }
1716
1717 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1718 if (MD->getPALMajorVersion() < 3) {
1719 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1720 if (AMDGPU::isCompute(CC)) {
1721 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx), Ctx);
1722 } else {
     // Non-compute: RSRC2 only carries the scratch-enable bit, which is set
     // when the scratch block count is greater than zero.
1723 const MCExpr *HasScratchBlocks =
1724 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1725 MCConstantExpr::create(0, Ctx), Ctx);
1726 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1727 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1728 }
1729 } else {
1730 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1731 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1732 CurrentProgramInfo.ScratchEnable);
1733 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1735 }
1736
1737 // ScratchSize is in bytes, 16 aligned.
1738 MD->setScratchSize(
1739 CC,
1740 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1741 MCConstantExpr::create(16, Ctx), Ctx),
1742 Ctx);
1743
1744 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
     // From GFX11 on, the extra LDS size is half the LDS block count, rounded up.
1745 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1746 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1747 : CurrentProgramInfo.LDSBlocks;
1748 if (MD->getPALMajorVersion() < 3) {
1749 MD->setRsrc2(
1750 CC,
1752 Ctx);
1753 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1754 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1755 } else {
1756 // Graphics registers
1757 const unsigned ExtraLdsDwGranularity =
1758 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1759 MD->setGraphicsRegisters(
1760 ".ps_extra_lds_size",
1761 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1762
1763 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
     // Field names are listed in bit order: entry Idx corresponds to bit Idx of
     // the PS input enable / address masks.
1764 static StringLiteral const PsInputFields[] = {
1765 ".persp_sample_ena", ".persp_center_ena",
1766 ".persp_centroid_ena", ".persp_pull_model_ena",
1767 ".linear_sample_ena", ".linear_center_ena",
1768 ".linear_centroid_ena", ".line_stipple_tex_ena",
1769 ".pos_x_float_ena", ".pos_y_float_ena",
1770 ".pos_z_float_ena", ".pos_w_float_ena",
1771 ".front_face_ena", ".ancillary_ena",
1772 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1773 unsigned PSInputEna = MFI->getPSInputEnable();
1774 unsigned PSInputAddr = MFI->getPSInputAddr();
1775 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1776 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1777 (bool)((PSInputEna >> Idx) & 1));
1778 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1779 (bool)((PSInputAddr >> Idx) & 1));
1780 }
1781 }
1782 }
1783
1784 // For version 3 and above the wave front size is already set in the metadata
1785 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1786 MD->setWave32(MF.getFunction().getCallingConv());
1787}
1788
// Records per-function PAL metadata for MF: the frame's stack size as the
// function scratch size, the compute settings under CallingConv::AMDGPU_CS
// (RSRC1/RSRC2 for PAL major version < 3, otherwise the common hardware-stage
// fields), and the function's LDS size and VGPR/SGPR counts taken from
// CurrentProgramInfo.
//
// NOTE(review): two call heads are not visible in this listing (embedded line
// numbers 1800 and 1805 are missing) - confirm against the upstream file.
1789void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1790 auto *MD = getTargetStreamer()->getPALMetadata();
1791 const MachineFrameInfo &MFI = MF.getFrameInfo();
1792 StringRef FnName = MF.getFunction().getName();
1793 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1794 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1795 MCContext &Ctx = MF.getContext();
1796
1797 if (MD->getPALMajorVersion() < 3) {
1798 // Set compute registers
1799 MD->setRsrc1(
1801 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1802 MD->setRsrc2(CallingConv::AMDGPU_CS,
1803 CurrentProgramInfo.getComputePGMRSrc2(ST, Ctx), Ctx);
1804 } else {
1806 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1807 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1808 }
1809
1810 // Set optional info
1811 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1812 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1813 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1814}
1815
1816// This is supposed to be log2(Size)
//
// Maps an element size of 4, 8 or 16 to the matching AMD_ELEMENT_*_BYTES
// value; any other size is treated as unreachable.
// NOTE(review): the signature line is not visible in this listing (embedded
// line number 1817 is missing); Size is presumably the byte size passed in by
// the caller - confirm against the upstream file.
1818 switch (Size) {
1819 case 4:
1820 return AMD_ELEMENT_4_BYTES;
1821 case 8:
1822 return AMD_ELEMENT_8_BYTES;
1823 case 16:
1824 return AMD_ELEMENT_16_BYTES;
1825 default:
1826 llvm_unreachable("invalid private_element_size");
1827 }
1828}
1829
// Fills Out for the kernel MF. Out is first reset to the subtarget defaults
// and then populated from CurrentProgramInfo (resource register values,
// SGPR/VGPR counts, scratch and LDS sizes, dynamic call stack flag), from the
// user SGPR info of the function and from the kernel argument segment size and
// alignment. Only valid for AMDGPU_KERNEL / SPIR_KERNEL calling conventions
// (asserted).
//
// NOTE(review): this listing has gaps in its embedded line numbers; the
// targets of several assignments and the bodies of the user-SGPR / XNACK
// conditionals are not visible - confirm against the upstream file.
1830void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1831 const SIProgramInfo &CurrentProgramInfo,
1832 const MachineFunction &MF) const {
1833 const Function &F = MF.getFunction();
1834 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1835 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1836
1837 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1838 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1839 MCContext &Ctx = MF.getContext();
1840
1841 Out.initDefault(STM, Ctx, /*InitMCExpr=*/false);
1842
1844 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1846 CurrentProgramInfo.getComputePGMRSrc2(STM, Ctx);
1848
1849 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1850
1852 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1853
1854 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1855 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1857 }
1858
1859 if (UserSGPRInfo.hasDispatchPtr())
1861
1862 if (UserSGPRInfo.hasQueuePtr())
1864
1865 if (UserSGPRInfo.hasKernargSegmentPtr())
1867
1868 if (UserSGPRInfo.hasDispatchID())
1870
1871 if (UserSGPRInfo.hasFlatScratchInit())
1873
1874 if (UserSGPRInfo.hasPrivateSegmentSize())
1876
1877 if (STM.isXNACKEnabled())
1879
     // MaxKernArgAlign is filled in by getKernArgSegmentSize and used below.
1880 Align MaxKernArgAlign;
1881 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1882 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1883 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1884 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1885 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1886
1887 // kernarg_segment_alignment is specified as log of the alignment.
1888 // The minimum alignment is 16.
1889 // FIXME: The metadata treats the minimum as 4?
1890 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1891}
1892
1894 const char *ExtraCode, raw_ostream &O) {
1895 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1896 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1897 return false;
1898
1899 if (ExtraCode && ExtraCode[0]) {
1900 if (ExtraCode[1] != 0)
1901 return true; // Unknown modifier.
1902
1903 switch (ExtraCode[0]) {
1904 case 'r':
1905 break;
1906 default:
1907 return true;
1908 }
1909 }
1910
1911 // TODO: Should be able to support other operand types like globals.
1912 const MachineOperand &MO = MI->getOperand(OpNo);
1913 if (MO.isReg()) {
1915 *MF->getSubtarget().getRegisterInfo());
1916 return false;
1917 }
1918 if (MO.isImm()) {
1919 int64_t Val = MO.getImm();
1921 O << Val;
1922 } else if (isUInt<16>(Val)) {
1923 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1924 } else if (isUInt<32>(Val)) {
1925 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1926 } else {
1927 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1928 }
1929 return false;
1930 }
1931 return true;
1932}
1933
1941
1942void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1943 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1944 bool isModuleEntryFunction, bool hasMAIInsts) {
1945 if (!ORE)
1946 return;
1947
1948 const char *Name = "kernel-resource-usage";
1949 const char *Indent = " ";
1950
1951 // If the remark is not specifically enabled, do not output to yaml
1953 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1954 return;
1955
1956 // Currently non-kernel functions have no resources to emit.
1958 return;
1959
1960 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1961 StringRef RemarkLabel, auto Argument) {
1962 // Add an indent for every line besides the line with the kernel name. This
1963 // makes it easier to tell which resource usage go with which kernel since
1964 // the kernel name will always be displayed first.
1965 std::string LabelStr = RemarkLabel.str() + ": ";
1966 if (RemarkName != "FunctionName")
1967 LabelStr = Indent + LabelStr;
1968
1969 ORE->emit([&]() {
1970 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1972 &MF.front())
1973 << LabelStr << ore::NV(RemarkName, Argument);
1974 });
1975 };
1976
1977 // FIXME: Formatting here is pretty nasty because clang does not accept
1978 // newlines from diagnostics. This forces us to emit multiple diagnostic
1979 // remarks to simulate newlines. If and when clang does accept newlines, this
1980 // formatting should be aggregated into one remark with newlines to avoid
1981 // printing multiple diagnostic location and diag opts.
1982 EmitResourceUsageRemark("FunctionName", "Function Name",
1983 MF.getFunction().getName());
1984 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1985 getMCExprStr(CurrentProgramInfo.NumSGPR));
1986 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1987 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1988 if (hasMAIInsts) {
1989 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1990 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1991 }
1992 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1993 getMCExprStr(CurrentProgramInfo.ScratchSize));
1994 int64_t DynStack;
1995 bool DynStackEvaluatable =
1996 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1997 StringRef DynamicStackStr =
1998 DynStackEvaluatable && DynStack ? "True" : "False";
1999 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
2000 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
2001 getMCExprStr(CurrentProgramInfo.Occupancy));
2002 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
2003 CurrentProgramInfo.SGPRSpill);
2004 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
2005 CurrentProgramInfo.VGPRSpill);
2006 if (isModuleEntryFunction)
2007 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
2008 CurrentProgramInfo.LDSSize);
2009}
2010
2011char AMDGPUAsmPrinter::ID = 0;
2012
2013INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
2014 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static const MCExpr * setBits(const MCExpr *Dst, const MCExpr *Value, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Set bits in a kernel descriptor MCExpr field: return ((Dst & ~Mask) | (Value << Shift))
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static std::string computeTypeId(const FunctionType *FTy, const DataLayout &DL)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static void appendTypeEncoding(std::string &Enc, Type *Ty, const DataLayout &DL, bool IsReturnType)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_ABI
Definition Compiler.h:215
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1358
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1500
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1482
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1394
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1474
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1433
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1495
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1381
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1380
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1389
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1432
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1367
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1493
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1435
#define R_SPILLED_SGPRS
Definition SIDefines.h:1514
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1481
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1492
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1372
#define R_SPILLED_VGPRS
Definition SIDefines.h:1515
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1366
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1391
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1365
StringSet - A set-like wrapper for the StringMap.
static const int BlockSize
Definition TarWriter.cpp:33
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
void endFunction(const MachineFunction *MF)
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
bool doFinalization(Module &M) override
doFinalization - Virtual method overridden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createInstPrefSize(const MCExpr *CodeSizeBytes, MCContext &Ctx)
Create an expression for instruction prefetch size computation: min(divideCeil(CodeSizeBytes,...
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
virtual void emitAMDGPUInfo(const AMDGPU::InfoSectionData &Data)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
const std::optional< AMDGPU::TargetID > & getTargetID() const
void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Collects and handles AsmPrinter objects required to build debug or EH information.
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
const MCAsmInfo & MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:310
MCSymbol * getFunctionEnd() const
Definition AsmPrinter.h:320
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
void addAsmPrinterHandler(std::unique_ptr< AsmPrinterHandler > Handler)
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool empty() const
Definition DenseMap.h:173
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool hasInstPrefSize() const
bool isCuModeEnabled() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
const AMDGPU::TargetID & getTargetID() const
bool isWave32() const
bool supportsWGP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:346
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:143
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:578
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:342
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:347
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:407
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:377
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:397
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:362
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:352
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:412
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:427
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:413
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:661
bool hasInstructions() const
Definition MCSection.h:669
MCContext & getContext() const
Definition MCStreamer.h:326
Generic base class for all target subtargets.
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:213
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:272
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVM_ABI unsigned getNumOperands() const
iterator_range< op_iterator > operands()
Definition Metadata.h:1845
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void push_back(const T &Elt)
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< typename Base::iterator, bool > insert(StringRef key)
Definition StringSet.h:39
Primary interface to the complete machine description for the target machine.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:54
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getSGPRAllocGranule(const MCSubtargetInfo &STI)
bool isSGPROccupancyLimited(const MCSubtargetInfo &STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo &STI)
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
static constexpr unsigned MaxDynamicVGPRBlocks
Maximum number of VGPR blocks that can be allocated in dynamic VGPR mode.
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo &STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
AMDGPU::TargetID TargetID
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1151
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1434
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:117
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:860
#define N
AMDGPUResourceUsageAnalysisImpl::SIFunctionResourceInfo FunctionResourceInfo
void initDefault(const MCSubtargetInfo &STI, MCContext &Ctx, bool InitMCExpr=true)
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
const MCExpr * getComputePGMRSrc2(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.