LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
81
// Target-registry factory callback: constructs the AMDGPU AsmPrinter for the
// given target machine and output streamer.
// NOTE(review): doxygen capture — original line 83 (the function name and
// first parameter) is missing here; verify against the full source.
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
95
// Constructor: forwards the target machine and streamer to the base
// AsmPrinter and asserts a streamer was actually supplied.
// NOTE(review): doxygen capture — original line 96 (the constructor name and
// first parameter) is missing here; verify against the full source.
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
// Pass name reported to the pass manager / -debug-pass output.
// NOTE(review): the signature line (original 102) is missing from this capture.
103 return "AMDGPU Assembly Printer";
104}
105
// Returns the module-level MCSubtargetInfo owned by the target machine.
// NOTE(review): the signature line (original 106) is missing from this capture.
107 return TM.getMCSubtargetInfo();
108}
109
// Returns the AMDGPU-specific target streamer, or null when no output
// streamer exists (e.g. -filetype=null).
// NOTE(review): the signature line (original 110) is missing from this capture.
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
119
// Lazily initializes per-module target streamer state: resolves the module's
// target-ID (xnack/sramecc) and, for HSA targets, opens the HSA metadata
// stream for module M.
// NOTE(review): doxygen capture — original lines 121, 128-129, 132, 134-135
// and 140-141 are missing (they include the early-return condition and the
// statement creating/starting the metadata stream); verify against full source.
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
// End-of-module hook (presumably emitEndOfAsmFile — the signature line,
// original 144, is missing from this capture): makes sure the target streamer
// was initialized, then finalizes and emits the HSA metadata note for AMDHSA
// targets. emitTo's result is asserted on so malformed metadata is caught in
// asserts builds only.
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
// Per-function start hook (presumably emitFunctionBodyStart — the signature
// line, original 162, is missing from this capture). Validates code-object
// version requirements and that the function's xnack/sramecc target-ID
// settings agree with the module's; for entry functions it additionally emits
// the amd_kernel_code_t blob (Mesa kernels) and HSA kernel metadata.
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
// NOTE(review): original line 169 (start of the reportError call) is missing.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
// Only entry functions carry kernel-code / metadata payloads.
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
// NOTE(review): original line 207 (presumably the call emitting KernelCode
// through the target streamer) is missing from this capture.
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
// Per-function end hook (presumably emitFunctionBodyEnd — the signature line,
// original 214, is missing from this capture). For AMDHSA entry functions it
// emits the kernel descriptor into the read-only section, 64-byte aligned as
// required by CP microcode, restoring the previous section afterwards.
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
// NOTE(review): original lines 239, 242 and 244 are missing from this capture
// (they include the EmitAmdhsaKernelDescriptor call head and the expressions
// these argument lines belong to); verify against the full source.
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
// Emits an "implicit-def: <reg>" assembly comment for an IMPLICIT_DEF MI,
// annotating SGPR-spill-to-VGPR-lane defs specially.
// NOTE(review): the signature line (original 253) and the declaration of
// `Str` (original 256) are missing from this capture.
254 Register RegNo = MI->getOperand(0).getReg();
255
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
// Function entry-label hook (presumably emitFunctionEntryLabel — the
// signature line, original 268, is missing from this capture). On AMDHSA the
// default path is bypassed (original line 270 is missing); on HSA/Mesa entry
// functions the symbol is typed STT_AMDGPU_HSA_KERNEL. Also records a
// "<name>:" line for -dumpcode disassembly output.
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
// NOTE(review): original line 279 (the call taking SymbolName and the symbol
// type, presumably via the target streamer) is missing from this capture.
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
// Basic-block start hook (presumably emitBasicBlockStart — the signature
// line, original 292, is missing from this capture). For -dumpcode it records
// a "BB<fn>_<bb>:" label line, but only for blocks reachable other than by
// fallthrough. Original line 301 (presumably the call to the base-class
// implementation) is also missing.
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
// Global-variable emission hook (presumably emitGlobalVariable — the
// signature lines, original 304-305, are missing from this capture). The
// visible code appears to be the LDS-address-space branch: it rejects
// non-undef initializers, skips emission entirely on HSA/PAL, and otherwise
// defines the symbol via emitAMDGPULDS. NOTE(review): the declaration of
// `Size` (original 326) and the fallthrough to the base implementation
// (original 336) are missing; verify against the full source.
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
307 OutContext.reportError({},
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
// Default LDS alignment is 4 bytes when none is specified on the global.
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
// Module initialization hook (presumably doInitialization — the signature
// line, original 339, is missing from this capture). Caches the module's
// code-object version and, for AMDHSA, instantiates the matching MsgPack
// HSA-metadata streamer (V4/V5/V6). NOTE(review): the `case` labels (original
// lines 344, 347, 350) and the trailing return to the base implementation
// (original 358) are missing; verify against the full source.
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
361/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362///
363/// Removes the dependency on GCNSubtarget, depending on only the necessary
364/// values for said occupancy computation. Should match the computeOccupancy
365/// implementation without passing \p STM on.
///
/// NOTE(review): original line 379 (the return statement head, presumably
/// AMDGPUMCExpr::createOccupancy) is missing from this capture; the argument
/// order below must match that factory's expected operand order.
366const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
367 const MCExpr *NumVGPRs,
368 unsigned DynamicVGPRBlockSize,
369 const GCNSubtarget &STM, MCContext &Ctx) {
370 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
371 unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
372 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
373 unsigned Generation = STM.getGeneration();
374
// Helper folding a known unsigned into an MCConstantExpr in Ctx.
375 auto CreateExpr = [&Ctx](unsigned Value) {
376 return MCConstantExpr::create(Value, Ctx);
377 };
378
380 {CreateExpr(MaxWaves), CreateExpr(Granule),
381 CreateExpr(TargetTotalNumVGPRs),
382 CreateExpr(Generation), CreateExpr(InitOcc),
383 NumSGPRs, NumVGPRs},
384 Ctx);
385}
386
// Validates the per-function MC resource-info symbols once they are
// resolvable: scratch size against the per-workitem maximum, SGPR counts
// (both addressable and with implicit VCC/flat-scratch/xnack extras) against
// subtarget limits, and the achievable occupancy against the function's
// 'amdgpu-waves-per-eu' request. Emits LLVMContext diagnostics on violation.
// NOTE(review): doxygen capture — original lines 391, 406, 422, 431, 453,
// 459, 471 and 490 are missing (they include declarations/constants such as
// the scratch maximum and the MachineModuleInfo lookup, plus condition
// halves); verify against the full source.
387void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
// Only defined module-entry functions carry resource symbols to validate.
388 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
389 return;
390
392 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393 MCSymbol *FnSym = TM.getSymbol(&F);
394 bool IsLocal = F.hasLocalLinkage();
395
// Evaluates an MCExpr to an absolute value; false if not yet resolvable.
396 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
397 int64_t Val;
398 if (Value->evaluateAsAbsolute(Val)) {
399 Res = Val;
400 return true;
401 }
402 return false;
403 };
404
405 const uint64_t MaxScratchPerWorkitem =
407 MCSymbol *ScratchSizeSymbol = RI.getSymbol(
408 FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext, IsLocal);
409 uint64_t ScratchSize;
410 if (ScratchSizeSymbol->isVariable() &&
411 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
412 ScratchSize > MaxScratchPerWorkitem) {
413 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
414 DS_Error);
415 F.getContext().diagnose(DiagStackSize);
416 }
417
418 // Validate addressable scalar registers (i.e., prior to added implicit
419 // SGPRs).
420 MCSymbol *NumSGPRSymbol =
421 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext, IsLocal);
423 !STM.hasSGPRInitBug()) {
424 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
425 uint64_t NumSgpr;
426 if (NumSGPRSymbol->isVariable() &&
427 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
428 NumSgpr > MaxAddressableNumSGPRs) {
429 F.getContext().diagnose(DiagnosticInfoResourceLimit(
430 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
432 return;
433 }
434 }
435
436 MCSymbol *VCCUsedSymbol =
437 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext, IsLocal);
438 MCSymbol *FlatUsedSymbol = RI.getSymbol(
439 FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext, IsLocal);
440 uint64_t VCCUsed, FlatUsed, NumSgpr;
441
442 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
443 FlatUsedSymbol->isVariable() &&
444 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
445 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
446 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
447
448 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
449 // resolvable.
450 NumSgpr += IsaInfo::getNumExtraSGPRs(
451 &STM, VCCUsed, FlatUsed,
452 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
454 STM.hasSGPRInitBug()) {
455 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
456 if (NumSgpr > MaxAddressableNumSGPRs) {
457 F.getContext().diagnose(DiagnosticInfoResourceLimit(
458 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
460 return;
461 }
462 }
463
464 MCSymbol *NumVgprSymbol =
465 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext, IsLocal);
466 MCSymbol *NumAgprSymbol =
467 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext, IsLocal);
468 uint64_t NumVgpr, NumAgpr;
469
470 MachineModuleInfo &MMI =
472 MachineFunction *MF = MMI.getMachineFunction(F);
473 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
474 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
475 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
476 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
477 unsigned MaxWaves = MFI.getMaxWavesPerEU();
478 uint64_t TotalNumVgpr =
479 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Counts are clamped below by 1 and by the waves-per-EU minimums.
480 uint64_t NumVGPRsForWavesPerEU =
481 std::max({TotalNumVgpr, (uint64_t)1,
482 (uint64_t)STM.getMinNumVGPRs(
483 MaxWaves, MFI.getDynamicVGPRBlockSize())});
484 uint64_t NumSGPRsForWavesPerEU = std::max(
485 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
486 const MCExpr *OccupancyExpr = createOccupancy(
487 STM.getOccupancyWithWorkGroupSizes(*MF).second,
488 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
489 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
491 uint64_t Occupancy;
492
493 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
494 F, "amdgpu-waves-per-eu", {0, 0}, true);
495
496 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
497 DiagnosticInfoOptimizationFailure Diag(
498 F, F.getSubprogram(),
499 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
500 "'" +
501 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
502 ", final occupancy is " + Twine(Occupancy));
503 F.getContext().diagnose(Diag);
504 return;
505 }
506 }
507 }
508}
509
// Module finalization hook (presumably doFinalization — the signature line,
// original 510, is missing from this capture). Pads .text with s_code_end on
// GFX10+/GFX90A, finalizes deferred resource-info expressions, emits the
// module-wide GPR maximum symbols into .AMDGPU.gpr_maximums, validates each
// function's resource info, and resets the resource-info state.
511 // Pad with s_code_end to help tools and guard against instruction prefetch
512 // causing stale data in caches. Arguably this should be done by the linker,
513 // which is why this isn't done for Mesa.
514 // Don't do it if there is no code.
515 const MCSubtargetInfo &STI = *getGlobalSTI();
// NOTE(review): original lines 517-519 (rest of the condition and the
// TextSect lookup) and 522 (the padding emission) are missing; verify
// against the full source.
516 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
520 if (TextSect->hasInstructions()) {
521 OutStreamer->switchSection(TextSect);
523 }
524 }
525
526 // Assign expressions which can only be resolved when all other functions are
527 // known.
528 RI.finalize(OutContext);
529
530 // Switch section and emit all GPR maximums within the processed module.
531 OutStreamer->pushSection();
532 MCSectionELF *MaxGPRSection =
533 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
534 OutStreamer->switchSection(MaxGPRSection);
536 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
537 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
538 OutStreamer->popSection();
539
540 for (Function &F : M.functions())
541 validateMCResourceInfo(F);
542
543 RI.reset();
544
546}
547
// Pretty-prints an MCExpr for assembly comments: folds AMDGPU-specific
// expressions first, then prints the result into a small string.
// NOTE(review): original line 549 (presumably `SmallString<128> Str;`, the
// buffer OSS writes into and the returned value) is missing from this capture.
548SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
550 raw_svector_ostream OSS(Str);
551 auto &Streamer = getTargetStreamer()->getStreamer();
552 auto &Context = Streamer.getContext();
553 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
554 printAMDGPUMCExpr(New, OSS, MAI);
555 return Str;
556}
557
558// Print comments that apply to both callable functions and entry points.
559void AMDGPUAsmPrinter::emitCommonFunctionComments(
560 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
561 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
562 const AMDGPUMachineFunctionInfo *MFI) {
563 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
564 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
565 false);
566 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
567 if (NumAGPR && TotalNumVGPR) {
568 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
569 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
570 false);
571 }
572 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
573 false);
574 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
575 false);
576}
577
578const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
579 const MachineFunction &MF) const {
580 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
581 MCContext &Ctx = MF.getContext();
582 uint16_t KernelCodeProperties = 0;
583 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
584
585 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
586 KernelCodeProperties |=
587 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
588 }
589 if (UserSGPRInfo.hasDispatchPtr()) {
590 KernelCodeProperties |=
591 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
592 }
593 if (UserSGPRInfo.hasQueuePtr()) {
594 KernelCodeProperties |=
595 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
596 }
597 if (UserSGPRInfo.hasKernargSegmentPtr()) {
598 KernelCodeProperties |=
599 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
600 }
601 if (UserSGPRInfo.hasDispatchID()) {
602 KernelCodeProperties |=
603 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
604 }
605 if (UserSGPRInfo.hasFlatScratchInit()) {
606 KernelCodeProperties |=
607 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
608 }
609 if (UserSGPRInfo.hasPrivateSegmentSize()) {
610 KernelCodeProperties |=
611 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
612 }
613 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
614 KernelCodeProperties |=
615 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
616 }
617
618 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
619 // un-evaluatable at this point so it cannot be conditionally checked here.
620 // Instead, we'll directly shift the possibly unknown MCExpr into its place
621 // and bitwise-or it into KernelCodeProperties.
622 const MCExpr *KernelCodePropExpr =
623 MCConstantExpr::create(KernelCodeProperties, Ctx);
624 const MCExpr *OrValue = MCConstantExpr::create(
625 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
626 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
627 OrValue, Ctx);
628 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
629
630 return KernelCodePropExpr;
631}
632
// Assembles the MCKernelDescriptor for an HSA kernel: segment sizes, kernarg
// size, the three compute_pgm_rsrc words, kernel code properties and the
// kernarg-preload count.
// NOTE(review): doxygen capture — original line 644 (the expression assigned
// to group_segment_fixed_size) and line 660 (start of the assertion guarding
// compute_pgm_rsrc3) are missing; verify against the full source.
633MCKernelDescriptor
634AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
635 const SIProgramInfo &PI) const {
636 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
637 const Function &F = MF.getFunction();
638 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
639 MCContext &Ctx = MF.getContext();
640
641 MCKernelDescriptor KernelDescriptor;
642
643 KernelDescriptor.group_segment_fixed_size =
645 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
646
647 Align MaxKernArgAlign;
648 KernelDescriptor.kernarg_size = MCConstantExpr::create(
649 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
650
651 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
652 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
653 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
654
// The evaluation below only feeds the (captured-out) assertion; the
// descriptor itself carries the unevaluated ComputePGMRSrc3 expression.
655 int64_t PGM_Rsrc3 = 1;
656 bool EvaluatableRsrc3 =
657 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
658 (void)PGM_Rsrc3;
659 (void)EvaluatableRsrc3;
661 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
662 static_cast<uint64_t>(PGM_Rsrc3) == 0);
663 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
664
665 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
666 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
667 Ctx);
668
669 return KernelDescriptor;
670}
671
// Main per-function driver (presumably runOnMachineFunction — the signature
// line, original 672, is missing from this capture; it returns false at both
// exits, matching MachineFunctionPass convention). It initializes the target
// streamer, gathers resource usage, emits OS-specific program info
// (PAL/Mesa), the per-function resource-info symbols, verbose comment
// sections, and -dumpcode disassembly output.
// NOTE(review): doxygen capture — original lines 675, 679, 690, 693, 728,
// 730, 736-737, 769 and 868 are missing (including the ResourceUsage
// initializer, the Context reference, the EmitMCResourceInfo call head and
// assertion heads); verify against the full source.
673 // Init target streamer lazily on the first function so that previous passes
674 // can set metadata.
676 initTargetStreamer(*MF.getFunction().getParent());
677
678 ResourceUsage =
680 CurrentProgramInfo.reset(MF);
681
682 const AMDGPUMachineFunctionInfo *MFI =
683 MF.getInfo<AMDGPUMachineFunctionInfo>();
684 MCContext &Ctx = MF.getContext();
685
686 // The starting address of all shader programs must be 256 bytes aligned.
687 // Regular functions just need the basic required instruction alignment.
688 MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
689
691
692 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
694 bool IsLocal = MF.getFunction().hasLocalLinkage();
695 // FIXME: This should be an explicit check for Mesa.
696 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
697 MCSectionELF *ConfigSection =
698 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
699 OutStreamer->switchSection(ConfigSection);
700 }
701
702 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
703
704 if (MFI->isModuleEntryFunction()) {
705 getSIProgramInfo(CurrentProgramInfo, MF);
706 }
707
// OS-specific program-info emission: PAL metadata, or legacy SI program
// info for targets that are neither HSA nor PAL.
708 if (STM.isAmdPalOS()) {
709 if (MFI->isEntryFunction())
710 EmitPALMetadata(MF, CurrentProgramInfo);
711 else if (MFI->isModuleEntryFunction())
712 emitPALFunctionMetadata(MF);
713 } else if (!STM.isAmdHsaOS()) {
714 EmitProgramInfoSI(MF, CurrentProgramInfo);
715 }
716
717 DumpCodeInstEmitter = nullptr;
718 if (STM.dumpCode()) {
719 // For -dumpcode, get the assembler out of the streamer. This only works
720 // with -filetype=obj.
721 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
722 if (Assembler)
723 DumpCodeInstEmitter = Assembler->getEmitterPtr();
724 }
725
726 DisasmLines.clear();
727 HexLines.clear();
729
731
732 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
733 STM.hasMAIInsts());
734
// Register every per-function resource-info symbol for this function.
// NOTE(review): the call these arguments belong to (original 736-737) is
// missing from this capture.
735 {
738 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
739 IsLocal),
740 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext,
741 IsLocal),
742 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext,
743 IsLocal),
744 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
745 OutContext, IsLocal),
746 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
747 OutContext, IsLocal),
748 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext,
749 IsLocal),
750 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
751 OutContext, IsLocal),
752 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
753 OutContext, IsLocal),
754 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion, OutContext,
755 IsLocal),
756 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
757 OutContext, IsLocal));
758 }
759
760 // Emit _dvgpr$ symbol when appropriate.
761 emitDVgprSymbol(MF);
762
// Verbose mode: emit human-readable statistics into .AMDGPU.csdata.
763 if (isVerbose()) {
764 MCSectionELF *CommentSection =
765 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
766 OutStreamer->switchSection(CommentSection);
767
768 if (!MFI->isEntryFunction()) {
770 OutStreamer->emitRawComment(" Function info:", false);
771
772 emitCommonFunctionComments(
773 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext,
774 IsLocal)
775 ->getVariableValue(),
776 STM.hasMAIInsts()
777 ? RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR,
778 OutContext, IsLocal)
779 ->getVariableValue()
780 : nullptr,
781 RI.createTotalNumVGPRs(MF, Ctx),
782 RI.createTotalNumSGPRs(
783 MF,
784 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
785 Ctx),
786 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
787 OutContext, IsLocal)
788 ->getVariableValue(),
789 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
790 return false;
791 }
792
793 OutStreamer->emitRawComment(" Kernel info:", false);
794 emitCommonFunctionComments(
795 CurrentProgramInfo.NumArchVGPR,
796 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
797 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
798 CurrentProgramInfo.ScratchSize,
799 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
800
801 OutStreamer->emitRawComment(
802 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
803 OutStreamer->emitRawComment(
804 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
805 OutStreamer->emitRawComment(
806 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
807 " bytes/workgroup (compile time only)", false);
808
809 OutStreamer->emitRawComment(
810 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
811
812 OutStreamer->emitRawComment(
813 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
814
815 OutStreamer->emitRawComment(
816 " NumSGPRsForWavesPerEU: " +
817 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
818 false);
819 OutStreamer->emitRawComment(
820 " NumVGPRsForWavesPerEU: " +
821 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
822 false);
823
// Printed AccumOffset is decoded as (stored + 1) * 4, the inverse of the
// encoding performed in computeAccumOffset.
824 if (STM.hasGFX90AInsts()) {
825 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
826 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
827 AdjustedAccum = MCBinaryExpr::createMul(
828 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
829 OutStreamer->emitRawComment(
830 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
831 }
832
833 if (STM.hasGFX1250Insts())
834 OutStreamer->emitRawComment(
835 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
836 false);
837
838 OutStreamer->emitRawComment(
839 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
840
841 OutStreamer->emitRawComment(
842 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
843
844 OutStreamer->emitRawComment(
845 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
846 getMCExprStr(CurrentProgramInfo.ScratchEnable),
847 false);
848 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
849 Twine(CurrentProgramInfo.UserSGPR),
850 false);
851 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
852 Twine(CurrentProgramInfo.TrapHandlerEnable),
853 false);
854 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
855 Twine(CurrentProgramInfo.TGIdXEnable),
856 false);
857 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
858 Twine(CurrentProgramInfo.TGIdYEnable),
859 false);
860 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
861 Twine(CurrentProgramInfo.TGIdZEnable),
862 false);
863 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
864 Twine(CurrentProgramInfo.TIdIGCompCount),
865 false);
866
867 [[maybe_unused]] int64_t PGMRSrc3;
869 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
870 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
871 static_cast<uint64_t>(PGMRSrc3) == 0));
872 if (STM.hasGFX90AInsts()) {
873 OutStreamer->emitRawComment(
874 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
875 getMCExprStr(MCKernelDescriptor::bits_get(
876 CurrentProgramInfo.ComputePGMRSrc3,
877 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
878 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
879 false);
880 OutStreamer->emitRawComment(
881 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
882 getMCExprStr(MCKernelDescriptor::bits_get(
883 CurrentProgramInfo.ComputePGMRSrc3,
884 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
885 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
886 false);
887 }
888 }
889
// -dumpcode: interleave recorded disassembly text with hex encodings into
// the .AMDGPU.disasm section, padded to a common column.
890 if (DumpCodeInstEmitter) {
891
892 OutStreamer->switchSection(
893 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
894
895 for (size_t i = 0; i < DisasmLines.size(); ++i) {
896 std::string Comment = "\n";
897 if (!HexLines[i].empty()) {
898 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
899 Comment += " ; " + HexLines[i] + "\n";
900 }
901
902 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
903 OutStreamer->emitBytes(StringRef(Comment));
904 }
905 }
906
907 return false;
908}
909
910// When appropriate, add a _dvgpr$ symbol, with the value of the function
911// symbol, plus an offset encoding one less than the number of VGPR blocks used
912// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
913// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
914// used by a front-end to have functions that are chained rather than called,
915// and a dispatcher that dynamically resizes the VGPR count before dispatching
916// to a function.
//
// NOTE(review): doxygen capture — original lines 918 (the MFI declaration),
// 920 (second half of the enabling condition), 935 (start of the error
// report) and 943 (the function-symbol operand of the add) are missing;
// verify against the full source.
917void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
919 if (MFI.isDynamicVGPREnabled() &&
921 MCContext &Ctx = MF.getContext();
922 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
923 MCValue NumVGPRs;
924 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
925 NumVGPRs, nullptr) ||
926 !NumVGPRs.isAbsolute()) {
927 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
928 }
929 // Calculate number of VGPR blocks.
930 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
931 unsigned NumBlocks =
932 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
933
// Only 3 bits (5..3) are available in the encoding, so at most 8 blocks.
934 if (NumBlocks > 8) {
936 "too many DVGPR blocks for _dvgpr$ symbol for '" +
937 Twine(CurrentFnSym->getName()) + "'");
938 return;
939 }
940 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
941 // Add to function symbol to create _dvgpr$ symbol.
942 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
944 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
945 MCSymbol *DVgprFuncSym =
946 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
947 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
948 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
949 emitLinkage(&MF.getFunction(), DVgprFuncSym);
950 }
951}
952
953// TODO: Fold this into emitFunctionBodyStart.
//
// Initializes the module-level target-ID from global subtarget features,
// then walks the module's functions to pin down any still-'Any'
// xnack/sramecc settings from the first function that has them On or Off.
// NOTE(review): original line 957 (the statement consuming the global
// feature string, presumably constructing the streamer's target-ID) is
// missing from this capture.
954void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
955 // In the beginning all features are either 'Any' or 'NotSupported',
956 // depending on global target features. This will cover empty modules.
958 getGlobalSTI()->getFeatureString());
959
960 // If module is empty, we are done.
961 if (M.empty())
962 return;
963
964 // If module is not empty, need to find first 'Off' or 'On' feature
965 // setting per feature from functions in module.
966 for (auto &F : M) {
// Stop early once both features are either unsupported or pinned On/Off.
967 auto &TSTargetID = getTargetStreamer()->getTargetID();
968 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
969 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
970 break;
971
972 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
973 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
974 if (TSTargetID->isXnackSupported())
975 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
976 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
977 if (TSTargetID->isSramEccSupported())
978 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
979 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
980 }
981}
982
983// AccumOffset computed for the MCExpr equivalent of:
984// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
985static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
986 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
987 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
988
989 // Can't be lower than 1 for subsequent alignTo.
990 const MCExpr *MaximumTaken =
991 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
992
993 // Practically, it's computing divideCeil(MaximumTaken, 4).
994 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
995 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
996 Ctx);
997
998 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
999}
1000
// Populate ProgInfo (the SI program resource descriptor) for MF. Most GPR
// counts are built as symbolic MCExprs over the per-function resource-info
// symbols so they can be resolved late; absolute limits are only checked when
// the expressions happen to fold to constants here.
// NOTE(review): this extraction dropped a number of original source lines
// (1027, 1030, 1058, 1084, 1092, 1095, 1112, 1123, 1131, 1133, 1140, 1147,
// 1164, 1168, 1171, 1221, 1246-1247, 1270, 1294); each gap is marked inline
// below. Compare against upstream before modifying.
1001void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
1002 const MachineFunction &MF) {
1003 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1004 bool IsLocal = MF.getFunction().hasLocalLinkage();
1005 MCContext &Ctx = MF.getContext();
1006
// Convenience: wrap an integer in an MCConstantExpr.
1007 auto CreateExpr = [&Ctx](int64_t Value) {
1008 return MCConstantExpr::create(Value, Ctx);
1009 };
1010
// Convenience: fold an MCExpr to an absolute value if possible.
1011 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1012 int64_t Val;
1013 if (Value->evaluateAsAbsolute(Val)) {
1014 Res = Val;
1015 return true;
1016 }
1017 return false;
1018 };
1019
// Convenience: symbol reference to this function's resource-info symbol for
// the given resource kind.
1020 auto GetSymRefExpr =
1021 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1022 MCSymbol *Sym =
1023 RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext, IsLocal);
1024 return MCSymbolRefExpr::create(Sym, Ctx);
1025 };
1026
// NOTE(review): original line 1027 missing here (extraction gap) —
// presumably the 'using RIK = ...' alias used by the calls below.
1028 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1029 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
// NOTE(review): original line 1030 missing here (extraction gap) —
// presumably 'ProgInfo.NumVGPR = ...(' whose arguments follow.
1031 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1032
1033 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1034 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1035 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1036 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1037 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1038 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
// A stack is dynamically sized if the function has recursion or an
// explicitly dynamic-sized stack.
1039 ProgInfo.DynamicCallStack =
1040 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1041 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1042
// Named barriers are allocated in blocks of 4.
1043 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1044 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1045 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1046 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1047
1048 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1049
1050 // The calculations related to SGPR/VGPR blocks are
1051 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1052 // unified.
1053 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1054 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1055 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1056
1057 // Check the addressable register limit before we add ExtraSGPRs.
// NOTE(review): original line 1058 missing here (extraction gap) —
// presumably the start of the 'if' whose condition continues below.
1059 !STM.hasSGPRInitBug()) {
1060 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1061 uint64_t NumSgpr;
1062 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1063 NumSgpr > MaxAddressableNumSGPRs) {
1064 // This can happen due to a compiler bug or when using inline asm.
1065 LLVMContext &Ctx = MF.getFunction().getContext();
1066 Ctx.diagnose(DiagnosticInfoResourceLimit(
1067 MF.getFunction(), "addressable scalar registers", NumSgpr,
1068 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
// Clamp so downstream encoding stays in range after the error.
1069 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1070 }
1071 }
1072
1073 // Account for extra SGPRs and VGPRs reserved for debugger use.
1074 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1075
1076 const Function &F = MF.getFunction();
1077
1078 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1079 // dispatch registers as function args.
1080 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1081 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1082
1083 if (WaveDispatchNumSGPR) {
// NOTE(review): original line 1084 missing here (extraction gap) —
// presumably 'ProgInfo.NumSGPR = AMDGPUMCExpr::createMax(' over the operands
// that follow.
1085 {ProgInfo.NumSGPR,
1086 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1087 Ctx)},
1088 Ctx);
1089 }
1090
1091 if (WaveDispatchNumVGPR) {
// NOTE(review): original line 1092 missing here (extraction gap) —
// presumably 'ProgInfo.NumVGPR = AMDGPUMCExpr::createMax(' over the operands
// that follow.
1093 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1094
// NOTE(review): original line 1095 missing here (extraction gap) —
// presumably the start of a total-VGPR recomputation whose arguments follow.
1096 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1097 }
1098
1099 // Adjust number of registers used to meet default/requested minimum/maximum
1100 // number of waves per execution unit request.
1101 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1102 ProgInfo.NumSGPRsForWavesPerEU =
1103 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1104 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1105 Ctx);
1106 ProgInfo.NumVGPRsForWavesPerEU =
1107 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1108 CreateExpr(STM.getMinNumVGPRs(
1109 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1110 Ctx);
1111
// NOTE(review): original line 1112 missing here (extraction gap) —
// presumably the start of the 'if' whose condition continues below (the
// post-ExtraSGPRs limit check for targets with the SGPR init bug).
1113 STM.hasSGPRInitBug()) {
1114 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1115 uint64_t NumSgpr;
1116 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1117 NumSgpr > MaxAddressableNumSGPRs) {
1118 // This can happen due to a compiler bug or when using inline asm to use
1119 // the registers which are usually reserved for vcc etc.
1120 LLVMContext &Ctx = MF.getFunction().getContext();
1121 Ctx.diagnose(DiagnosticInfoResourceLimit(
1122 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
// NOTE(review): original line 1123 missing here (extraction gap) —
// presumably the severity/kind arguments closing the diagnose call.
1124 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1125 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1126 }
1127 }
1128
1129 if (STM.hasSGPRInitBug()) {
1130 ProgInfo.NumSGPR =
// NOTE(review): original line 1131 missing here (extraction gap) —
// presumably the fixed SGPR count expression for the init-bug workaround.
1132 ProgInfo.NumSGPRsForWavesPerEU =
// NOTE(review): original line 1133 missing here (extraction gap) — as above.
1134 }
1135
1136 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1137 LLVMContext &Ctx = MF.getFunction().getContext();
1138 Ctx.diagnose(DiagnosticInfoResourceLimit(
1139 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
// NOTE(review): original line 1140 missing here (extraction gap) —
// presumably the limit/severity arguments closing the diagnose call.
1141 }
1142
1143 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1144 LLVMContext &Ctx = MF.getFunction().getContext();
1145 Ctx.diagnose(DiagnosticInfoResourceLimit(
1146 MF.getFunction(), "local memory", MFI->getLDSSize(),
// NOTE(review): original line 1147 missing here (extraction gap) —
// presumably the limit/severity arguments closing the diagnose call.
1148 }
1149 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1150 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1151 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1152 unsigned Granule) {
1153 const MCExpr *OneConst = CreateExpr(1ul);
1154 const MCExpr *GranuleConst = CreateExpr(Granule);
1155 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1156 const MCExpr *AlignToGPR =
1157 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1158 const MCExpr *DivGPR =
1159 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1160 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1161 return SubGPR;
1162 };
1163 // GFX10+ will always allocate 128 SGPRs and this field must be 0
// NOTE(review): original line 1164 missing here (extraction gap) —
// presumably the generation check opening this if/else.
1165 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1166 } else {
1167 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
// NOTE(review): original line 1168 missing here (extraction gap) —
// presumably the SGPR count and encoding-granule arguments.
1169 }
1170 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
// NOTE(review): original line 1171 missing here (extraction gap) —
// presumably the VGPR encoding-granule argument.
1172
1173 const SIModeRegisterDefaults Mode = MFI->getMode();
1174
1175 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1176 // register.
1177 ProgInfo.FloatMode = getFPMode(Mode);
1178
1179 ProgInfo.IEEEMode = Mode.IEEE;
1180
1181 // Make clamp modifier on NaN input returns 0.
1182 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1183
// LDS is encoded in blocks; the block size (and hence the shift) depends on
// the target's LDS dword granularity.
1184 unsigned LDSAlignShift = 8;
1185 switch (getLdsDwGranularity(STM)) {
1186 case 512:
1187 case 320:
1188 LDSAlignShift = 11;
1189 break;
1190 case 128:
1191 LDSAlignShift = 9;
1192 break;
1193 case 64:
1194 LDSAlignShift = 8;
1195 break;
1196 default:
// NOTE(review): "invald" typo in this message exists upstream as well.
1197 llvm_unreachable("invald LDS block size");
1198 }
1199
1200 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1201 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1202
1203 ProgInfo.LDSSize = MFI->getLDSSize();
1204 ProgInfo.LDSBlocks =
1205 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1206
1207 // The MCExpr equivalent of divideCeil.
1208 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1209 const MCExpr *Ceil =
1210 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1211 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1212 };
1213
1214 // Scratch is allocated in 64-dword or 256-dword blocks.
1215 unsigned ScratchAlignShift =
1216 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1217 // We need to program the hardware with the amount of scratch memory that
1218 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1219 // scratch memory used per thread.
1220 ProgInfo.ScratchBlocks = DivideCeil(
// NOTE(review): original line 1221 missing here (extraction gap) —
// presumably the per-thread-size * wavefront-size product being divided.
1222 CreateExpr(STM.getWavefrontSize()), Ctx),
1223 CreateExpr(1ULL << ScratchAlignShift));
1224
1225 if (STM.supportsWGP()) {
1226 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1227 }
1228
1229 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1230 ProgInfo.MemOrdered = 1;
1231 ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
1232 }
1233
1234 // 0 = X, 1 = XY, 2 = XYZ
1235 unsigned TIDIGCompCnt = 0;
1236 if (MFI->hasWorkItemIDZ())
1237 TIDIGCompCnt = 2;
1238 else if (MFI->hasWorkItemIDY())
1239 TIDIGCompCnt = 1;
1240
1241 // The private segment wave byte offset is the last of the system SGPRs. We
1242 // initially assumed it was allocated, and may have used it. It shouldn't harm
1243 // anything to disable it if we know the stack isn't used here. We may still
1244 // have emitted code reading it to initialize scratch, but if that's unused
1245 // reading garbage should be OK.
// NOTE(review): original lines 1246-1247 missing here (extraction gap) —
// presumably the start of the ScratchEnable expression (an OR with a
// comparison against the constant-0 expression below).
1248 MCConstantExpr::create(0, Ctx), Ctx),
1249 ProgInfo.DynamicCallStack, Ctx);
1250
1251 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1252 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1253 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1254 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1255 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1256 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1257 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1258 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1259 ProgInfo.EXCPEnMSB = 0;
1260 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1261 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1262 ProgInfo.EXCPEnable = 0;
1263
1264 // return ((Dst & ~Mask) | (Value << Shift))
1265 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1266 uint32_t Shift) {
1267 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1268 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1269 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
// NOTE(review): original line 1270 missing here (extraction gap) —
// presumably the OR-in of (Value << Shift) per the comment above.
1271 Ctx);
1272 return Dst;
1273 };
1274
1275 if (STM.hasGFX90AInsts()) {
1276 ProgInfo.ComputePGMRSrc3 =
1277 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1278 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1279 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1280 ProgInfo.ComputePGMRSrc3 =
1281 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1282 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1283 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1284 }
1285
1286 if (STM.hasGFX1250Insts())
1287 ProgInfo.ComputePGMRSrc3 =
1288 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1289 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1290 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1291
1292 ProgInfo.Occupancy = createOccupancy(
1293 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
// NOTE(review): original line 1294 missing here (extraction gap) —
// presumably the SGPR/VGPR expression arguments to createOccupancy.
1295 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1296
// Warn (as an optimization failure) when the achieved occupancy falls short
// of the minimum requested via the "amdgpu-waves-per-eu" attribute.
1297 const auto [MinWEU, MaxWEU] =
1298 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1299 uint64_t Occupancy;
1300 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1301 DiagnosticInfoOptimizationFailure Diag(
1302 F, F.getSubprogram(),
1303 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1304 "'" +
1305 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1306 ", final occupancy is " + Twine(Occupancy));
1307 F.getContext().diagnose(Diag);
1308 }
1309
// GFX11+: record a clamped instruction-prefetch size (code size in 128-byte
// lines) into the generation-specific INST_PREF_SIZE field of RSRC3.
1310 if (isGFX11Plus(STM)) {
1311 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1312 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1313 (uint64_t)std::numeric_limits<uint32_t>::max());
1314 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1315 uint32_t Field, Shift, Width;
1316 if (isGFX11(STM)) {
1317 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1318 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1319 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1320 } else {
1321 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1322 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1323 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1324 }
1325 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1326 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1327 CreateExpr(InstPrefSize), Field, Shift);
1328 }
1329}
1330
1343
// Emit the program info for non-HSA targets as a stream of 32-bit
// (register, value) records via OutStreamer. Compute calling conventions
// write the COMPUTE_PGM_RSRC registers; graphics conventions write the
// per-stage RSRC register plus packed GPR-block counts. Values that fold to
// constants are emitted as integers, otherwise as relocatable expressions.
// NOTE(review): this extraction dropped original lines 1355, 1368, 1373,
// 1376, 1404, 1424, 1429 and 1431; gaps are marked inline below.
1344void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1345 const SIProgramInfo &CurrentProgramInfo) {
1346 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1347 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1348 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1349 MCContext &Ctx = MF.getContext();
1350
1351 // (((Value) & Mask) << Shift)
1352 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1353 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1354 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
// NOTE(review): original line 1355 missing here (extraction gap) —
// presumably the return of the shifted-and-masked expression completed below.
1356 shft, Ctx);
1357 };
1358
// Emit as a plain integer when the expression folds; otherwise emit the
// expression itself for late resolution.
1359 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1360 int64_t Val;
1361 if (Value->evaluateAsAbsolute(Val))
1362 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1363 else
1364 OutStreamer->emitValue(Value, Size);
1365 };
1366
1367 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
// NOTE(review): original line 1368 missing here (extraction gap) —
// presumably the emission of the RSRC1 register number.
1369
1370 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1371 /*Size=*/4);
1372
// NOTE(review): original line 1373 missing here (extraction gap) —
// presumably the emission of the RSRC2 register number.
1374 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1375
// NOTE(review): original line 1376 missing here (extraction gap) —
// presumably the emission of the COMPUTE_TMPRING_SIZE register number.
1377
1378 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1379 // appropriate generation.
1380 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1381 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1382 /*Mask=*/0x3FFFF, /*Shift=*/12),
1383 /*Size=*/4);
1384 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1385 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1386 /*Mask=*/0x7FFF, /*Shift=*/12),
1387 /*Size=*/4);
1388 } else {
1389 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1390 /*Mask=*/0x1FFF, /*Shift=*/12),
1391 /*Size=*/4);
1392 }
1393
1394 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1395 // 0" comment but I don't see a corresponding field in the register spec.
1396 } else {
1397 OutStreamer->emitInt32(RsrcReg);
1398
// Graphics: pack VGPR blocks into bits [5:0] and SGPR blocks into [9:6].
1399 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1400 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1401 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1402 MF.getContext());
1403 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
// NOTE(review): original line 1404 missing here (extraction gap) —
// presumably the emission of the SPI_TMPRING_SIZE register number.
1405
1406 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1407 // appropriate generation.
1408 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1409 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1410 /*Mask=*/0x3FFFF, /*Shift=*/12),
1411 /*Size=*/4);
1412 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1413 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1414 /*Mask=*/0x7FFF, /*Shift=*/12),
1415 /*Size=*/4);
1416 } else {
1417 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1418 /*Mask=*/0x1FFF, /*Shift=*/12),
1419 /*Size=*/4);
1420 }
1421 }
1422
// Pixel shaders additionally report extra LDS size and the PS input masks.
1423 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
// NOTE(review): original line 1424 missing here (extraction gap) —
// presumably the emission of the register number for EXTRA_LDS_SIZE.
1425 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1426 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1427 : CurrentProgramInfo.LDSBlocks;
1428 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
// NOTE(review): original line 1429 missing here (extraction gap) —
// presumably the emission of the SPI_PS_INPUT_ENA register number.
1430 OutStreamer->emitInt32(MFI->getPSInputEnable());
// NOTE(review): original line 1431 missing here (extraction gap) —
// presumably the emission of the SPI_PS_INPUT_ADDR register number.
1432 OutStreamer->emitInt32(MFI->getPSInputAddr());
1433 }
1434
// Spill counts are reported under pseudo "register" keys.
1435 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1436 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1437 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1438 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1439}
1440
1441// Helper function to add common PAL Metadata 3.0+
// Populates hardware-stage PAL metadata shared by all shader types for the
// given calling convention: IEEE mode (when the target has the combined
// DX10-clamp/IEEE mode feature), WGP/mem-ordered/forward-progress flags,
// compute-only trap and exception fields, the dynamic-VGPR enable, and the
// LDS size in bytes.
// NOTE(review): original line 1442 — the signature line — is missing from
// this extraction; the cross-reference index gives it as:
//   static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, ...
1443 const SIProgramInfo &CurrentProgramInfo,
1444 CallingConv::ID CC, const GCNSubtarget &ST,
1445 unsigned DynamicVGPRBlockSize) {
1446 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1447 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1448
1449 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1450 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1451 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1452
1453 if (AMDGPU::isCompute(CC)) {
1454 MD->setHwStage(CC, ".trap_present",
1455 (bool)CurrentProgramInfo.TrapHandlerEnable);
1456 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1457
// A non-zero dynamic-VGPR block size means the dynamic-VGPR mode is in use.
1458 if (DynamicVGPRBlockSize != 0)
1459 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1460 }
1461
// NOTE(review): original line 1462 is missing from this extraction —
// presumably the start of the call (likely MD->setHwStage() taking the
// ".lds_size" arguments that follow).
1463 CC, ".lds_size",
// LdsSize is in granule units; convert to bytes via the per-target dword
// granularity times the dword size.
1464 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1465 sizeof(uint32_t)));
1466}
1467
1468// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1469// is AMDPAL. It stores each compute/SPI register setting and other PAL
1470// metadata items into the PALMD::Metadata, combining with any provided by the
1471// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1472// is then written as a single block in the .note section.
// NOTE(review): this extraction dropped original lines 1487, 1489, 1513 and
// 1530; gaps are marked inline below.
1473void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1474 const SIProgramInfo &CurrentProgramInfo) {
1475 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1476 auto CC = MF.getFunction().getCallingConv();
1477 auto *MD = getTargetStreamer()->getPALMetadata();
1478 auto &Ctx = MF.getContext();
1479
1480 MD->setEntryPoint(CC, MF.getFunction().getName());
1481 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1482
1483 // For targets that support dynamic VGPRs, set the number of saved dynamic
1484 // VGPRs (if any) in the PAL metadata.
1485 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1486 if (MFI->isDynamicVGPREnabled() &&
// NOTE(review): original line 1487 missing here (extraction gap) —
// presumably the remainder of the 'if' condition.
1488 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
// NOTE(review): original line 1489 missing here (extraction gap) —
// presumably the saved-count value argument closing this call.
1490
1491 // Only set AGPRs for supported devices
1492 if (STM.hasMAIInsts()) {
1493 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1494 }
1495
1496 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata major version < 3 uses raw RSRC register values; version 3+
// uses named hardware-stage fields instead.
1497 if (MD->getPALMajorVersion() < 3) {
1498 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1499 if (AMDGPU::isCompute(CC)) {
1500 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1501 } else {
// Graphics: only the SCRATCH_EN bit of RSRC2 is set, derived from whether
// any scratch blocks are in use.
1502 const MCExpr *HasScratchBlocks =
1503 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1504 MCConstantExpr::create(0, Ctx), Ctx);
1505 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1506 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1507 }
1508 } else {
1509 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1510 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1511 CurrentProgramInfo.ScratchEnable);
1512 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
// NOTE(review): original line 1513 missing here (extraction gap) —
// presumably the dynamic-VGPR block-size argument closing this call.
1514 }
1515
1516 // ScratchSize is in bytes, 16 aligned.
1517 MD->setScratchSize(
1518 CC,
1519 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1520 MCConstantExpr::create(16, Ctx), Ctx),
1521 Ctx);
1522
// Pixel shaders additionally report extra LDS usage and the PS input masks.
1523 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1524 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1525 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1526 : CurrentProgramInfo.LDSBlocks;
1527 if (MD->getPALMajorVersion() < 3) {
1528 MD->setRsrc2(
1529 CC,
// NOTE(review): original line 1530 missing here (extraction gap) —
// presumably the EXTRA_LDS_SIZE field expression for RSRC2.
1531 Ctx);
1532 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1533 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1534 } else {
1535 // Graphics registers
1536 const unsigned ExtraLdsDwGranularity =
1537 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1538 MD->setGraphicsRegisters(
1539 ".ps_extra_lds_size",
1540 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1541
1542 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// Field names, in bit order, matching SPI_PS_INPUT_ENA/ADDR bit positions.
1543 static StringLiteral const PsInputFields[] = {
1544 ".persp_sample_ena", ".persp_center_ena",
1545 ".persp_centroid_ena", ".persp_pull_model_ena",
1546 ".linear_sample_ena", ".linear_center_ena",
1547 ".linear_centroid_ena", ".line_stipple_tex_ena",
1548 ".pos_x_float_ena", ".pos_y_float_ena",
1549 ".pos_z_float_ena", ".pos_w_float_ena",
1550 ".front_face_ena", ".ancillary_ena",
1551 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1552 unsigned PSInputEna = MFI->getPSInputEnable();
1553 unsigned PSInputAddr = MFI->getPSInputAddr();
1554 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1555 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1556 (bool)((PSInputEna >> Idx) & 1));
1557 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1558 (bool)((PSInputAddr >> Idx) & 1));
1559 }
1560 }
1561 }
1562
1563 // For version 3 and above the wave front size is already set in the metadata
1564 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1565 MD->setWave32(MF.getFunction().getCallingConv());
1566}
1567
// Record per-function PAL metadata for a non-entry function: stack (scratch)
// size, RSRC1/RSRC2 (PAL metadata < 3) or common hardware-stage fields
// (PAL metadata 3+) under the AMDGPU_CS convention, plus LDS size and the
// SGPR/VGPR usage expressions keyed by function name.
1568void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1569 auto *MD = getTargetStreamer()->getPALMetadata();
1570 const MachineFrameInfo &MFI = MF.getFrameInfo();
1571 StringRef FnName = MF.getFunction().getName();
1572 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1573 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1574 MCContext &Ctx = MF.getContext();
1575
1576 if (MD->getPALMajorVersion() < 3) {
1577 // Set compute registers
1578 MD->setRsrc1(
// NOTE(review): original line 1579 missing here (extraction gap) —
// presumably the calling-convention argument of setRsrc1.
1580 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1581 MD->setRsrc2(CallingConv::AMDGPU_CS,
1582 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1583 } else {
// NOTE(review): original line 1584 missing here (extraction gap) —
// presumably the EmitPALMetadataCommon( call whose arguments follow.
1585 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1586 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1587 }
1588
1589 // Set optional info
1590 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1591 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1592 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1593}
1594
1595// This is supposed to be log2(Size)
1597 switch (Size) {
1598 case 4:
1599 return AMD_ELEMENT_4_BYTES;
1600 case 8:
1601 return AMD_ELEMENT_8_BYTES;
1602 case 16:
1603 return AMD_ELEMENT_16_BYTES;
1604 default:
1605 llvm_unreachable("invalid private_element_size");
1606 }
1607}
1608
// Fill in the amd_kernel_code_t-equivalent structure (Out) for an AMDGPU/SPIR
// kernel from the computed program info: RSRC words, code properties derived
// from which user SGPRs are enabled, segment sizes, GPR counts, and the
// kernarg alignment (log2, minimum 16).
// NOTE(review): this extraction dropped many original lines in this function
// (1622, 1624, 1626, 1630, 1635, 1639, 1642, 1645, 1648, 1651, 1654, 1657) —
// chiefly the left-hand sides of assignments and the code-property OR-ins
// guarded by the 'if's below; gaps are marked inline. Lines shown as a bare
// number (e.g. "1640") had their content dropped as well.
1609void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1610 const SIProgramInfo &CurrentProgramInfo,
1611 const MachineFunction &MF) const {
1612 const Function &F = MF.getFunction();
// Only real kernel entry points carry an amd_kernel_code_t.
1613 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1614 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1615
1616 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1617 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1618 MCContext &Ctx = MF.getContext();
1619
1620 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1621
// NOTE(review): original line 1622 missing here (extraction gap) —
// presumably the RSRC1 field assignment whose value follows.
1623 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
// NOTE(review): original line 1624 missing here (extraction gap) —
// presumably the RSRC2 field assignment whose value follows.
1625 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
// NOTE(review): original line 1626 missing here (extraction gap).
1627
1628 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1629
// NOTE(review): original line 1630 missing here (extraction gap) —
// presumably the AMD_HSA_BITS_SET of the private-element-size property
// completed by the value below.
1631 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1632
// Each enabled user-SGPR feature sets the matching code-property bit; the
// bodies of these 'if's were dropped by the extraction (bare-number lines).
1633 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1634 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1636 }
1637
1638 if (UserSGPRInfo.hasDispatchPtr())
1640
1641 if (UserSGPRInfo.hasQueuePtr())
1643
1644 if (UserSGPRInfo.hasKernargSegmentPtr())
1646
1647 if (UserSGPRInfo.hasDispatchID())
1649
1650 if (UserSGPRInfo.hasFlatScratchInit())
1652
1653 if (UserSGPRInfo.hasPrivateSegmentSize())
1655
1656 if (STM.isXNACKEnabled())
1658
1659 Align MaxKernArgAlign;
1660 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1661 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1662 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1663 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1664 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1665
1666 // kernarg_segment_alignment is specified as log of the alignment.
1667 // The minimum alignment is 16.
1668 // FIXME: The metadata treats the minimum as 4?
1669 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1670}
1671
// Inline-asm operand printer. Defers to the generic AsmPrinter first; then
// handles the 'r' modifier (or no modifier) for register operands, and prints
// immediates in decimal when they fit in 8 bits, otherwise as hex sized to
// the smallest of 16/32/64 bits. Returns true on failure (unknown modifier
// or unsupported operand type), false on success.
// NOTE(review): original line 1672 — the start of the signature
// (bool AMDGPUAsmPrinter::PrintAsmOperand(...)) — is missing from this
// extraction, as are lines 1693 and 1699 (marked inline below).
1673 const char *ExtraCode, raw_ostream &O) {
1674 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1675 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1676 return false;
1677
1678 if (ExtraCode && ExtraCode[0]) {
// Multi-character modifiers are not supported.
1679 if (ExtraCode[1] != 0)
1680 return true; // Unknown modifier.
1681
1682 switch (ExtraCode[0]) {
1683 case 'r':
1684 break;
1685 default:
1686 return true;
1687 }
1688 }
1689
1690 // TODO: Should be able to support other operand types like globals.
1691 const MachineOperand &MO = MI->getOperand(OpNo);
1692 if (MO.isReg()) {
// NOTE(review): original line 1693 missing here (extraction gap) —
// presumably the register-printing call whose second argument follows.
1694 *MF->getSubtarget().getRegisterInfo());
1695 return false;
1696 }
1697 if (MO.isImm()) {
1698 int64_t Val = MO.getImm();
// NOTE(review): original line 1699 missing here (extraction gap) —
// presumably the small-value check (fits-in-8-bits) opening this chain.
1700 O << Val;
1701 } else if (isUInt<16>(Val)) {
1702 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1703 } else if (isUInt<32>(Val)) {
1704 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1705 } else {
1706 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1707 }
1708 return false;
1709 }
1710 return true;
1711}
1712
1720
// Emit the "kernel-resource-usage" analysis remarks for MF: one remark per
// resource line (SGPRs, VGPRs, AGPRs when MAI is present, scratch, dynamic
// stack, occupancy, spills, and LDS for module entry functions). Skipped
// entirely unless the remark is explicitly enabled and an ORE is available.
// NOTE(review): this extraction dropped original lines 1731, 1736 and 1750
// (marked inline below).
1721void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1722 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1723 bool isModuleEntryFunction, bool hasMAIInsts) {
1724 if (!ORE)
1725 return;
1726
1727 const char *Name = "kernel-resource-usage";
1728 const char *Indent = " ";
1729
1730 // If the remark is not specifically enabled, do not output to yaml
// NOTE(review): original line 1731 missing here (extraction gap) —
// presumably the declaration of Ctx used on the next line.
1732 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1733 return;
1734
1735 // Currently non-kernel functions have no resources to emit.
// NOTE(review): original line 1736 missing here (extraction gap) —
// presumably the non-kernel check guarding this early return.
1737 return;
1738
1739 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1740 StringRef RemarkLabel, auto Argument) {
1741 // Add an indent for every line besides the line with the kernel name. This
1742 // makes it easier to tell which resource usage go with which kernel since
1743 // the kernel name will always be displayed first.
1744 std::string LabelStr = RemarkLabel.str() + ": ";
1745 if (RemarkName != "FunctionName")
1746 LabelStr = Indent + LabelStr;
1747
1748 ORE->emit([&]() {
1749 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
// NOTE(review): original line 1750 missing here (extraction gap) —
// presumably the debug-location argument preceding &MF.front().
1751 &MF.front())
1752 << LabelStr << ore::NV(RemarkName, Argument);
1753 });
1754 };
1755
1756 // FIXME: Formatting here is pretty nasty because clang does not accept
1757 // newlines from diagnostics. This forces us to emit multiple diagnostic
1758 // remarks to simulate newlines. If and when clang does accept newlines, this
1759 // formatting should be aggregated into one remark with newlines to avoid
1760 // printing multiple diagnostic location and diag opts.
1761 EmitResourceUsageRemark("FunctionName", "Function Name",
1762 MF.getFunction().getName());
1763 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1764 getMCExprStr(CurrentProgramInfo.NumSGPR));
1765 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1766 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1767 if (hasMAIInsts) {
1768 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1769 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1770 }
1771 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1772 getMCExprStr(CurrentProgramInfo.ScratchSize));
// DynamicStack is only reported "True" when the expression both folds to a
// constant and that constant is non-zero.
1773 int64_t DynStack;
1774 bool DynStackEvaluatable =
1775 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1776 StringRef DynamicStackStr =
1777 DynStackEvaluatable && DynStack ? "True" : "False";
1778 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1779 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1780 getMCExprStr(CurrentProgramInfo.Occupancy));
1781 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1782 CurrentProgramInfo.SGPRSpill);
1783 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1784 CurrentProgramInfo.VGPRSpill);
1785 if (isModuleEntryFunction)
1786 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1787 CurrentProgramInfo.LDSSize);
1788}
1789
// Legacy pass-manager identification: the address of ID serves as the unique
// identity of this pass.
1790char AMDGPUAsmPrinter::ID = 0;
1791
// Register the printer with the legacy pass registry under the
// "amdgpu-asm-printer" command-line name.
1792INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1793 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1144
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1282
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1264
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1180
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1256
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1215
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1277
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1167
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1166
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1175
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1214
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1153
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1275
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1217
#define R_SPILLED_SGPRS
Definition SIDefines.h:1296
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1263
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1274
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1158
#define R_SPILLED_VGPRS
Definition SIDefines.h:1297
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1152
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1177
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1151
static const int BlockSize
Definition TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
doFinalization - Virtual method overriden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
const MCAsmInfo * MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:312
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isWave32() const
bool supportsWGP() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:414
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:516
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:604
bool hasInstructions() const
Definition MCSection.h:625
MCContext & getContext() const
Definition MCStreamer.h:323
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:436
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1148
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1431
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.