LLVM 20.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "GCNSubtarget.h"
27#include "R600AsmPrinter.h"
38#include "llvm/MC/MCAssembler.h"
39#include "llvm/MC/MCContext.h"
41#include "llvm/MC/MCStreamer.h"
47
48using namespace llvm;
49using namespace llvm::AMDGPU;
50
51// This should get the default rounding mode from the kernel. We just set the
52// default here, but this could change if the OpenCL rounding mode pragmas are
53// used.
54//
55// The denormal mode here should match what is reported by the OpenCL runtime
56// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
57// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
58//
59// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
60// precision, and leaves single precision to flush all and does not report
61// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
62// CL_FP_DENORM for both.
63//
64// FIXME: It seems some instructions do not support single precision denormals
65// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
66// and sin_f32, cos_f32 on most parts).
67
68// We want to use these instructions, and using fp32 denormals also causes
69// instructions to run at the double precision rate for the device so it's
70// probably best to just report no single precision denormals.
74 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
75 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
76}
77
78static AsmPrinter *
80 std::unique_ptr<MCStreamer> &&Streamer) {
81 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
82}
83
89}
90
92 std::unique_ptr<MCStreamer> Streamer)
93 : AsmPrinter(TM, std::move(Streamer)) {
94 assert(OutStreamer && "AsmPrinter constructed without streamer");
95}
96
98 return "AMDGPU Assembly Printer";
99}
100
102 return TM.getMCSubtargetInfo();
103}
104
106 if (!OutStreamer)
107 return nullptr;
108 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
109}
110
113}
114
115void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
117
118 // TODO: Which one is called first, emitStartOfAsmFile or
119 // emitFunctionBodyStart?
120 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
121 initializeTargetID(M);
122
125 return;
126
128
131 CodeObjectVersion);
132 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
133 }
134
137}
138
140 // Init target streamer if it has not yet happened
142 initTargetStreamer(M);
143
146
147 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
148 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
150 HSAMetadataStream->end();
151 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
152 (void)Success;
153 assert(Success && "Malformed HSA Metadata");
154 }
155}
156
159 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
160 const Function &F = MF->getFunction();
161
162 // TODO: We're checking this late, would be nice to check it earlier.
163 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
165 STM.getCPU() + " is only available on code object version 6 or better",
166 /*gen_crash_diag*/ false);
167 }
168
169 // TODO: Which one is called first, emitStartOfAsmFile or
170 // emitFunctionBodyStart?
171 if (!getTargetStreamer()->getTargetID())
172 initializeTargetID(*F.getParent());
173
174 const auto &FunctionTargetID = STM.getTargetID();
175 // Make sure function's xnack settings are compatible with module's
176 // xnack settings.
177 if (FunctionTargetID.isXnackSupported() &&
178 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
179 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
180 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
181 "' function does not match module xnack setting");
182 return;
183 }
184 // Make sure function's sramecc settings are compatible with module's
185 // sramecc settings.
186 if (FunctionTargetID.isSramEccSupported() &&
187 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
188 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
189 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
190 "' function does not match module sramecc setting");
191 return;
192 }
193
194 if (!MFI.isEntryFunction())
195 return;
196
197 if (STM.isMesaKernel(F) &&
198 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
199 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
200 AMDGPUMCKernelCodeT KernelCode;
201 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
202 KernelCode.validate(&STM, MF->getContext());
204 }
205
206 if (STM.isAmdHsaOS())
207 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
208
209 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
212 STM.isAmdHsaOS());
213 }
214}
215
218 if (!MFI.isEntryFunction())
219 return;
220
222 return;
223
224 auto &Streamer = getTargetStreamer()->getStreamer();
225 auto &Context = Streamer.getContext();
226 auto &ObjectFileInfo = *Context.getObjectFileInfo();
227 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
228
229 Streamer.pushSection();
230 Streamer.switchSection(&ReadOnlySection);
231
232 // CP microcode requires the kernel descriptor to be allocated on 64 byte
233 // alignment.
234 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
235 ReadOnlySection.ensureMinAlignment(Align(64));
236
237 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
238
239 SmallString<128> KernelName;
240 getNameWithPrefix(KernelName, &MF->getFunction());
242 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
243 CurrentProgramInfo.NumVGPRsForWavesPerEU,
245 CurrentProgramInfo.NumSGPRsForWavesPerEU,
247 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
248 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
249 Context),
250 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
251
252 Streamer.popSection();
253}
254
256 Register RegNo = MI->getOperand(0).getReg();
257
260 OS << "implicit-def: "
261 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
262
263 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
264 OS << " : SGPR spill to VGPR lane";
265
266 OutStreamer->AddComment(OS.str());
267 OutStreamer->addBlankLine();
268}
269
273 return;
274 }
275
277 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
278 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
279 SmallString<128> SymbolName;
280 getNameWithPrefix(SymbolName, &MF->getFunction()),
282 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
283 }
284 if (DumpCodeInstEmitter) {
285 // Disassemble function name label to text.
286 DisasmLines.push_back(MF->getName().str() + ":");
287 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
288 HexLines.emplace_back("");
289 }
290
292}
293
295 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
296 // Write a line for the basic block label if it is not only fallthrough.
297 DisasmLines.push_back(
298 (Twine("BB") + Twine(getFunctionNumber())
299 + "_" + Twine(MBB.getNumber()) + ":").str());
300 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
301 HexLines.emplace_back("");
302 }
304}
305
308 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
310 Twine(GV->getName()) +
311 ": unsupported initializer for address space");
312 return;
313 }
314
315 // LDS variables aren't emitted in HSA or PAL yet.
317 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
318 return;
319
320 MCSymbol *GVSym = getSymbol(GV);
321
322 GVSym->redefineIfPossible();
323 if (GVSym->isDefined() || GVSym->isVariable())
324 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
325 "' is already defined");
326
327 const DataLayout &DL = GV->getDataLayout();
328 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
329 Align Alignment = GV->getAlign().value_or(Align(4));
330
331 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
332 emitLinkage(GV, GVSym);
333 auto TS = getTargetStreamer();
334 TS->emitAMDGPULDS(GVSym, Size, Alignment);
335 return;
336 }
337
339}
340
342 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
343
345 switch (CodeObjectVersion) {
347 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
348 break;
350 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
351 break;
353 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
354 break;
355 default:
356 report_fatal_error("Unexpected code object version");
357 }
358 }
360}
361
363 // Pad with s_code_end to help tools and guard against instruction prefetch
364 // causing stale data in caches. Arguably this should be done by the linker,
365 // which is why this isn't done for Mesa.
366 const MCSubtargetInfo &STI = *getGlobalSTI();
367 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
370 OutStreamer->switchSection(getObjFileLowering().getTextSection());
372 }
373
375}
376
377// Print comments that apply to both callable functions and entry points.
378void AMDGPUAsmPrinter::emitCommonFunctionComments(
379 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
380 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
381 const AMDGPUMachineFunction *MFI) {
382 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
383 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
384 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
385 if (NumAGPR) {
386 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
387 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
388 false);
389 }
390 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
391 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
392 false);
393}
394
395SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
397 raw_svector_ostream OSS(Str);
398 auto &Streamer = getTargetStreamer()->getStreamer();
399 auto &Context = Streamer.getContext();
400 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
401 printAMDGPUMCExpr(New, OSS, MAI);
402 return Str;
403}
404
405void AMDGPUAsmPrinter::emitCommonFunctionComments(
406 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
407 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
408 const AMDGPUMachineFunction *MFI) {
409 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
410 OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
411 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
412 if (NumAGPR && TotalNumVGPR) {
413 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
414 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
415 false);
416 }
417 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
418 false);
419 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
420 false);
421}
422
423const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
424 const MachineFunction &MF) const {
426 MCContext &Ctx = MF.getContext();
427 uint16_t KernelCodeProperties = 0;
428 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
429
430 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
431 KernelCodeProperties |=
432 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
433 }
434 if (UserSGPRInfo.hasDispatchPtr()) {
435 KernelCodeProperties |=
436 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
437 }
438 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
439 KernelCodeProperties |=
440 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
441 }
442 if (UserSGPRInfo.hasKernargSegmentPtr()) {
443 KernelCodeProperties |=
444 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
445 }
446 if (UserSGPRInfo.hasDispatchID()) {
447 KernelCodeProperties |=
448 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
449 }
450 if (UserSGPRInfo.hasFlatScratchInit()) {
451 KernelCodeProperties |=
452 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
453 }
454 if (UserSGPRInfo.hasPrivateSegmentSize()) {
455 KernelCodeProperties |=
456 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
457 }
459 KernelCodeProperties |=
460 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
461 }
462
463 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
464 // un-evaluatable at this point so it cannot be conditionally checked here.
465 // Instead, we'll directly shift the possibly unknown MCExpr into its place
466 // and bitwise-or it into KernelCodeProperties.
467 const MCExpr *KernelCodePropExpr =
468 MCConstantExpr::create(KernelCodeProperties, Ctx);
469 const MCExpr *OrValue = MCConstantExpr::create(
470 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
471 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
472 OrValue, Ctx);
473 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
474
475 return KernelCodePropExpr;
476}
477
479AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
480 const SIProgramInfo &PI) const {
482 const Function &F = MF.getFunction();
484 MCContext &Ctx = MF.getContext();
485
486 MCKernelDescriptor KernelDescriptor;
487
488 KernelDescriptor.group_segment_fixed_size =
490 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
491
492 Align MaxKernArgAlign;
493 KernelDescriptor.kernarg_size = MCConstantExpr::create(
494 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
495
496 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
497 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
498 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
499
500 int64_t PGRM_Rsrc3 = 1;
501 bool EvaluatableRsrc3 =
502 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
503 (void)PGRM_Rsrc3;
504 (void)EvaluatableRsrc3;
505 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
506 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
507 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
508
509 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
510 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
511 Ctx);
512
513 return KernelDescriptor;
514}
515
517 // Init target streamer lazily on the first function so that previous passes
518 // can set metadata.
520 initTargetStreamer(*MF.getFunction().getParent());
521
522 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
523 CurrentProgramInfo.reset(MF);
524
526 MCContext &Ctx = MF.getContext();
527
528 // The starting address of all shader programs must be 256 bytes aligned.
529 // Regular functions just need the basic required instruction alignment.
530 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
531
533
536 // FIXME: This should be an explicit check for Mesa.
537 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
538 MCSectionELF *ConfigSection =
539 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
540 OutStreamer->switchSection(ConfigSection);
541 }
542
543 if (MFI->isModuleEntryFunction()) {
544 getSIProgramInfo(CurrentProgramInfo, MF);
545 }
546
547 if (STM.isAmdPalOS()) {
548 if (MFI->isEntryFunction())
549 EmitPALMetadata(MF, CurrentProgramInfo);
550 else if (MFI->isModuleEntryFunction())
551 emitPALFunctionMetadata(MF);
552 } else if (!STM.isAmdHsaOS()) {
553 EmitProgramInfoSI(MF, CurrentProgramInfo);
554 }
555
556 DumpCodeInstEmitter = nullptr;
557 if (STM.dumpCode()) {
558 // For -dumpcode, get the assembler out of the streamer. This only works
559 // with -filetype=obj.
560 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
561 if (Assembler)
562 DumpCodeInstEmitter = Assembler->getEmitterPtr();
563 }
564
565 DisasmLines.clear();
566 HexLines.clear();
568
570
571 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
572 STM.hasMAIInsts());
573
574 if (isVerbose()) {
575 MCSectionELF *CommentSection =
576 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
577 OutStreamer->switchSection(CommentSection);
578
579 if (!MFI->isEntryFunction()) {
580 OutStreamer->emitRawComment(" Function info:", false);
582 ResourceUsage->getResourceInfo(&MF.getFunction());
583 emitCommonFunctionComments(
584 Info.NumVGPR,
585 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
586 Info.getTotalNumVGPRs(STM),
587 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
588 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
589 return false;
590 }
591
592 OutStreamer->emitRawComment(" Kernel info:", false);
593 emitCommonFunctionComments(
594 CurrentProgramInfo.NumArchVGPR,
595 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
596 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
597 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
598
599 OutStreamer->emitRawComment(
600 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
601 OutStreamer->emitRawComment(
602 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
603 OutStreamer->emitRawComment(
604 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
605 " bytes/workgroup (compile time only)", false);
606
607 OutStreamer->emitRawComment(
608 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
609
610 OutStreamer->emitRawComment(
611 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
612
613 OutStreamer->emitRawComment(
614 " NumSGPRsForWavesPerEU: " +
615 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
616 false);
617 OutStreamer->emitRawComment(
618 " NumVGPRsForWavesPerEU: " +
619 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
620 false);
621
622 if (STM.hasGFX90AInsts()) {
623 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
624 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
625 AdjustedAccum = MCBinaryExpr::createMul(
626 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
627 OutStreamer->emitRawComment(
628 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
629 }
630
631 OutStreamer->emitRawComment(
632 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
633
634 OutStreamer->emitRawComment(
635 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
636
637 OutStreamer->emitRawComment(
638 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
639 getMCExprStr(CurrentProgramInfo.ScratchEnable),
640 false);
641 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
642 Twine(CurrentProgramInfo.UserSGPR),
643 false);
644 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
645 Twine(CurrentProgramInfo.TrapHandlerEnable),
646 false);
647 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
648 Twine(CurrentProgramInfo.TGIdXEnable),
649 false);
650 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
651 Twine(CurrentProgramInfo.TGIdYEnable),
652 false);
653 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
654 Twine(CurrentProgramInfo.TGIdZEnable),
655 false);
656 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
657 Twine(CurrentProgramInfo.TIdIGCompCount),
658 false);
659
660 [[maybe_unused]] int64_t PGMRSrc3;
661 assert(STM.hasGFX90AInsts() ||
662 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
663 PGMRSrc3) &&
664 static_cast<uint64_t>(PGMRSrc3) == 0));
665 if (STM.hasGFX90AInsts()) {
666 OutStreamer->emitRawComment(
667 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
668 getMCExprStr(MCKernelDescriptor::bits_get(
669 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
670 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
671 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
672 false);
673 OutStreamer->emitRawComment(
674 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
675 getMCExprStr(MCKernelDescriptor::bits_get(
676 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
677 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
678 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
679 false);
680 }
681 }
682
683 if (DumpCodeInstEmitter) {
684
685 OutStreamer->switchSection(
686 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
687
688 for (size_t i = 0; i < DisasmLines.size(); ++i) {
689 std::string Comment = "\n";
690 if (!HexLines[i].empty()) {
691 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
692 Comment += " ; " + HexLines[i] + "\n";
693 }
694
695 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
696 OutStreamer->emitBytes(StringRef(Comment));
697 }
698 }
699
700 return false;
701}
702
703// TODO: Fold this into emitFunctionBodyStart.
704void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
705 // In the beginning all features are either 'Any' or 'NotSupported',
706 // depending on global target features. This will cover empty modules.
708 getGlobalSTI()->getFeatureString());
709
710 // If module is empty, we are done.
711 if (M.empty())
712 return;
713
714 // If module is not empty, need to find first 'Off' or 'On' feature
715 // setting per feature from functions in module.
716 for (auto &F : M) {
717 auto &TSTargetID = getTargetStreamer()->getTargetID();
718 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
719 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
720 break;
721
723 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
724 if (TSTargetID->isXnackSupported())
725 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
726 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
727 if (TSTargetID->isSramEccSupported())
728 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
729 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
730 }
731}
732
733uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
735 const SIInstrInfo *TII = STM.getInstrInfo();
736
737 uint64_t CodeSize = 0;
738
739 for (const MachineBasicBlock &MBB : MF) {
740 for (const MachineInstr &MI : MBB) {
741 // TODO: CodeSize should account for multiple functions.
742
743 // TODO: Should we count size of debug info?
744 if (MI.isDebugInstr())
745 continue;
746
747 CodeSize += TII->getInstSizeInBytes(MI);
748 }
749 }
750
751 return CodeSize;
752}
753
754void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
755 const MachineFunction &MF) {
757 ResourceUsage->getResourceInfo(&MF.getFunction());
759 MCContext &Ctx = MF.getContext();
760
761 auto CreateExpr = [&Ctx](int64_t Value) {
762 return MCConstantExpr::create(Value, Ctx);
763 };
764
765 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
766 int64_t Val;
767 if (Value->evaluateAsAbsolute(Val)) {
768 Res = Val;
769 return true;
770 }
771 return false;
772 };
773
774 ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
775 ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
776 ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
777 ProgInfo.AccumOffset =
778 CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
779 ProgInfo.TgSplit = STM.isTgSplitEnabled();
780 ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
781 ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
782 ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
783 ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
784 ProgInfo.DynamicCallStack =
785 CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
786
787 const uint64_t MaxScratchPerWorkitem =
789 uint64_t ScratchSize;
790 if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
791 ScratchSize > MaxScratchPerWorkitem) {
792 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
793 MaxScratchPerWorkitem, DS_Error);
794 MF.getFunction().getContext().diagnose(DiagStackSize);
795 }
796
798
799 // The calculations related to SGPR/VGPR blocks are
800 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
801 // unified.
802 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
803 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
804 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
805
806 // Check the addressable register limit before we add ExtraSGPRs.
808 !STM.hasSGPRInitBug()) {
809 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
810 uint64_t NumSgpr;
811 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
812 NumSgpr > MaxAddressableNumSGPRs) {
813 // This can happen due to a compiler bug or when using inline asm.
816 MF.getFunction(), "addressable scalar registers", NumSgpr,
817 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
818 Ctx.diagnose(Diag);
819 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
820 }
821 }
822
823 // Account for extra SGPRs and VGPRs reserved for debugger use.
824 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
825
826 const Function &F = MF.getFunction();
827
828 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
829 // dispatch registers are function args.
830 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
831
832 if (isShader(F.getCallingConv())) {
833 bool IsPixelShader =
834 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
835
836 // Calculate the number of VGPR registers based on the SPI input registers
837 uint32_t InputEna = 0;
838 uint32_t InputAddr = 0;
839 unsigned LastEna = 0;
840
841 if (IsPixelShader) {
842 // Note for IsPixelShader:
843 // By this stage, all enabled inputs are tagged in InputAddr as well.
844 // We will use InputAddr to determine whether the input counts against the
845 // vgpr total and only use the InputEnable to determine the last input
846 // that is relevant - if extra arguments are used, then we have to honour
847 // the InputAddr for any intermediate non-enabled inputs.
848 InputEna = MFI->getPSInputEnable();
849 InputAddr = MFI->getPSInputAddr();
850
851 // We only need to consider input args up to the last used arg.
852 assert((InputEna || InputAddr) &&
853 "PSInputAddr and PSInputEnable should "
854 "never both be 0 for AMDGPU_PS shaders");
855 // There are some rare circumstances where InputAddr is non-zero and
856 // InputEna can be set to 0. In this case we default to setting LastEna
857 // to 1.
858 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
859 }
860
861 // FIXME: We should be using the number of registers determined during
862 // calling convention lowering to legalize the types.
863 const DataLayout &DL = F.getDataLayout();
864 unsigned PSArgCount = 0;
865 unsigned IntermediateVGPR = 0;
866 for (auto &Arg : F.args()) {
867 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
868 if (Arg.hasAttribute(Attribute::InReg)) {
869 WaveDispatchNumSGPR += NumRegs;
870 } else {
871 // If this is a PS shader and we're processing the PS Input args (first
872 // 16 VGPR), use the InputEna and InputAddr bits to define how many
873 // VGPRs are actually used.
874 // Any extra VGPR arguments are handled as normal arguments (and
875 // contribute to the VGPR count whether they're used or not).
876 if (IsPixelShader && PSArgCount < 16) {
877 if ((1 << PSArgCount) & InputAddr) {
878 if (PSArgCount < LastEna)
879 WaveDispatchNumVGPR += NumRegs;
880 else
881 IntermediateVGPR += NumRegs;
882 }
883 PSArgCount++;
884 } else {
885 // If there are extra arguments we have to include the allocation for
886 // the non-used (but enabled with InputAddr) input arguments
887 if (IntermediateVGPR) {
888 WaveDispatchNumVGPR += IntermediateVGPR;
889 IntermediateVGPR = 0;
890 }
891 WaveDispatchNumVGPR += NumRegs;
892 }
893 }
894 }
896 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
897
899 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
900
902 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
903 }
904
905 // Adjust number of registers used to meet default/requested minimum/maximum
906 // number of waves per execution unit request.
907 unsigned MaxWaves = MFI->getMaxWavesPerEU();
908 ProgInfo.NumSGPRsForWavesPerEU =
909 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
910 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
911 Ctx);
912 ProgInfo.NumVGPRsForWavesPerEU =
913 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
914 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
915 Ctx);
916
918 STM.hasSGPRInitBug()) {
919 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
920 uint64_t NumSgpr;
921 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
922 NumSgpr > MaxAddressableNumSGPRs) {
923 // This can happen due to a compiler bug or when using inline asm to use
924 // the registers which are usually reserved for vcc etc.
926 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
927 NumSgpr, MaxAddressableNumSGPRs,
929 Ctx.diagnose(Diag);
930 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
931 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
932 }
933 }
934
935 if (STM.hasSGPRInitBug()) {
936 ProgInfo.NumSGPR =
938 ProgInfo.NumSGPRsForWavesPerEU =
940 }
941
942 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
944 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
945 MFI->getNumUserSGPRs(),
947 Ctx.diagnose(Diag);
948 }
949
950 if (MFI->getLDSSize() >
951 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
954 MF.getFunction(), "local memory", MFI->getLDSSize(),
956 Ctx.diagnose(Diag);
957 }
958 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
959 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
960 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
961 unsigned Granule) {
962 const MCExpr *OneConst = CreateExpr(1ul);
963 const MCExpr *GranuleConst = CreateExpr(Granule);
964 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
965 const MCExpr *AlignToGPR =
966 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
967 const MCExpr *DivGPR =
968 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
969 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
970 return SubGPR;
971 };
972
973 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
975 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
977
978 const SIModeRegisterDefaults Mode = MFI->getMode();
979
980 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
981 // register.
982 ProgInfo.FloatMode = getFPMode(Mode);
983
984 ProgInfo.IEEEMode = Mode.IEEE;
985
986 // Make clamp modifier on NaN input returns 0.
987 ProgInfo.DX10Clamp = Mode.DX10Clamp;
988
989 unsigned LDSAlignShift;
991 // LDS is allocated in 64 dword blocks.
992 LDSAlignShift = 8;
993 } else {
994 // LDS is allocated in 128 dword blocks.
995 LDSAlignShift = 9;
996 }
997
998 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
999 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1000
1001 ProgInfo.LDSSize = MFI->getLDSSize();
1002 ProgInfo.LDSBlocks =
1003 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1004
1005 // The MCExpr equivalent of divideCeil.
1006 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1007 const MCExpr *Ceil =
1008 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1009 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1010 };
1011
1012 // Scratch is allocated in 64-dword or 256-dword blocks.
1013 unsigned ScratchAlignShift =
1014 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1015 // We need to program the hardware with the amount of scratch memory that
1016 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1017 // scratch memory used per thread.
1018 ProgInfo.ScratchBlocks = DivideCeil(
1020 CreateExpr(STM.getWavefrontSize()), Ctx),
1021 CreateExpr(1ULL << ScratchAlignShift));
1022
1023 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1024 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1025 ProgInfo.MemOrdered = 1;
1026 }
1027
1028 // 0 = X, 1 = XY, 2 = XYZ
1029 unsigned TIDIGCompCnt = 0;
1030 if (MFI->hasWorkItemIDZ())
1031 TIDIGCompCnt = 2;
1032 else if (MFI->hasWorkItemIDY())
1033 TIDIGCompCnt = 1;
1034
1035 // The private segment wave byte offset is the last of the system SGPRs. We
1036 // initially assumed it was allocated, and may have used it. It shouldn't harm
1037 // anything to disable it if we know the stack isn't used here. We may still
1038 // have emitted code reading it to initialize scratch, but if that's unused
1039 // reading garbage should be OK.
1042 MCConstantExpr::create(0, Ctx), Ctx),
1043 ProgInfo.DynamicCallStack, Ctx);
1044
1045 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1046 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1047 ProgInfo.TrapHandlerEnable =
1048 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1049 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1050 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1051 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1052 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1053 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1054 ProgInfo.EXCPEnMSB = 0;
1055 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1056 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1057 ProgInfo.EXCPEnable = 0;
1058
1059 if (STM.hasGFX90AInsts()) {
1060 // return ((Dst & ~Mask) | (Value << Shift))
1061 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1062 uint32_t Shift) {
1063 auto Shft = MCConstantExpr::create(Shift, Ctx);
1064 auto Msk = MCConstantExpr::create(Mask, Ctx);
1065 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1067 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1068 return Dst;
1069 };
1070
1071 ProgInfo.ComputePGMRSrc3GFX90A =
1072 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1073 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1074 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1075 ProgInfo.ComputePGMRSrc3GFX90A =
1076 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1077 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1078 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1079 }
1080
1082 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1083 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1084
1085 const auto [MinWEU, MaxWEU] =
1086 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1087 uint64_t Occupancy;
1088 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1090 F, F.getSubprogram(),
1091 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1092 "'" +
1093 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1094 ", final occupancy is " + Twine(Occupancy));
1095 F.getContext().diagnose(Diag);
1096 }
1097}
1098
1099static unsigned getRsrcReg(CallingConv::ID CallConv) {
1100 switch (CallConv) {
1101 default: [[fallthrough]];
1109 }
1110}
1111
1112void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1113 const SIProgramInfo &CurrentProgramInfo) {
1115 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1116 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1117 MCContext &Ctx = MF.getContext();
1118
1119 // (((Value) & Mask) << Shift)
1120 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1121 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1122 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1124 shft, Ctx);
1125 };
1126
1127 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1128 int64_t Val;
1129 if (Value->evaluateAsAbsolute(Val))
1130 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1131 else
1132 OutStreamer->emitValue(Value, Size);
1133 };
1134
1137
1138 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1139 /*Size=*/4);
1140
1142 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1143
1145
1146 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1147 // appropriate generation.
1148 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1149 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1150 /*Mask=*/0x3FFFF, /*Shift=*/12),
1151 /*Size=*/4);
1152 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1153 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1154 /*Mask=*/0x7FFF, /*Shift=*/12),
1155 /*Size=*/4);
1156 } else {
1157 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1158 /*Mask=*/0x1FFF, /*Shift=*/12),
1159 /*Size=*/4);
1160 }
1161
1162 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1163 // 0" comment but I don't see a corresponding field in the register spec.
1164 } else {
1165 OutStreamer->emitInt32(RsrcReg);
1166
1167 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1168 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1169 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1170 MF.getContext());
1171 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1173
1174 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1175 // appropriate generation.
1176 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1177 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1178 /*Mask=*/0x3FFFF, /*Shift=*/12),
1179 /*Size=*/4);
1180 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1181 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1182 /*Mask=*/0x7FFF, /*Shift=*/12),
1183 /*Size=*/4);
1184 } else {
1185 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1186 /*Mask=*/0x1FFF, /*Shift=*/12),
1187 /*Size=*/4);
1188 }
1189 }
1190
1193 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1194 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1195 : CurrentProgramInfo.LDSBlocks;
1196 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1198 OutStreamer->emitInt32(MFI->getPSInputEnable());
1200 OutStreamer->emitInt32(MFI->getPSInputAddr());
1201 }
1202
1203 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1204 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1205 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1206 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1207}
1208
1209// Helper function to add common PAL Metadata 3.0+
1211 const SIProgramInfo &CurrentProgramInfo,
1212 CallingConv::ID CC, const GCNSubtarget &ST) {
1213 if (ST.hasIEEEMode())
1214 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1215
1216 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1217 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1218
1219 if (AMDGPU::isCompute(CC)) {
1220 MD->setHwStage(CC, ".trap_present",
1221 (bool)CurrentProgramInfo.TrapHandlerEnable);
1222 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1223 }
1224
1225 MD->setHwStage(CC, ".lds_size",
1226 (unsigned)(CurrentProgramInfo.LdsSize *
1227 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1228}
1229
1230// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1231// is AMDPAL. It stores each compute/SPI register setting and other PAL
1232// metadata items into the PALMD::Metadata, combining with any provided by the
1233// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1234// is then written as a single block in the .note section.
1235void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1236 const SIProgramInfo &CurrentProgramInfo) {
1238 auto CC = MF.getFunction().getCallingConv();
1239 auto MD = getTargetStreamer()->getPALMetadata();
1240 auto &Ctx = MF.getContext();
1241
1242 MD->setEntryPoint(CC, MF.getFunction().getName());
1243 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1244
1245 // Only set AGPRs for supported devices
1246 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1247 if (STM.hasMAIInsts()) {
1248 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1249 }
1250
1251 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1252 if (MD->getPALMajorVersion() < 3) {
1253 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1254 if (AMDGPU::isCompute(CC)) {
1255 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1256 } else {
1257 const MCExpr *HasScratchBlocks =
1258 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1259 MCConstantExpr::create(0, Ctx), Ctx);
1260 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1261 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1262 }
1263 } else {
1264 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1265 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1266 CurrentProgramInfo.ScratchEnable);
1267 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1268 }
1269
1270 // ScratchSize is in bytes, 16 aligned.
1271 MD->setScratchSize(
1272 CC,
1273 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1274 MCConstantExpr::create(16, Ctx), Ctx),
1275 Ctx);
1276
1278 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1279 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1280 : CurrentProgramInfo.LDSBlocks;
1281 if (MD->getPALMajorVersion() < 3) {
1282 MD->setRsrc2(
1283 CC,
1285 Ctx);
1286 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1287 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1288 } else {
1289 // Graphics registers
1290 const unsigned ExtraLdsDwGranularity =
1291 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1292 MD->setGraphicsRegisters(
1293 ".ps_extra_lds_size",
1294 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1295
1296 // Set each PsInputEna/PsInputAddr bit as a field of .spi_ps_input_ena/.spi_ps_input_addr
1297 static StringLiteral const PsInputFields[] = {
1298 ".persp_sample_ena", ".persp_center_ena",
1299 ".persp_centroid_ena", ".persp_pull_model_ena",
1300 ".linear_sample_ena", ".linear_center_ena",
1301 ".linear_centroid_ena", ".line_stipple_tex_ena",
1302 ".pos_x_float_ena", ".pos_y_float_ena",
1303 ".pos_z_float_ena", ".pos_w_float_ena",
1304 ".front_face_ena", ".ancillary_ena",
1305 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1306 unsigned PSInputEna = MFI->getPSInputEnable();
1307 unsigned PSInputAddr = MFI->getPSInputAddr();
1308 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1309 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1310 (bool)((PSInputEna >> Idx) & 1));
1311 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1312 (bool)((PSInputAddr >> Idx) & 1));
1313 }
1314 }
1315 }
1316
1317 // For version 3 and above the wavefront size is already set in the metadata
1318 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1319 MD->setWave32(MF.getFunction().getCallingConv());
1320}
1321
1322void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1323 auto *MD = getTargetStreamer()->getPALMetadata();
1324 const MachineFrameInfo &MFI = MF.getFrameInfo();
1325 StringRef FnName = MF.getFunction().getName();
1326 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1328 MCContext &Ctx = MF.getContext();
1329
1330 if (MD->getPALMajorVersion() < 3) {
1331 // Set compute registers
1332 MD->setRsrc1(
1334 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1335 MD->setRsrc2(CallingConv::AMDGPU_CS,
1336 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1337 } else {
1338 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1339 }
1340
1341 // Set optional info
1342 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1343 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1344 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1345}
1346
1347// This is supposed to be log2(Size)
1349 switch (Size) {
1350 case 4:
1351 return AMD_ELEMENT_4_BYTES;
1352 case 8:
1353 return AMD_ELEMENT_8_BYTES;
1354 case 16:
1355 return AMD_ELEMENT_16_BYTES;
1356 default:
1357 llvm_unreachable("invalid private_element_size");
1358 }
1359}
1360
1361void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1362 const SIProgramInfo &CurrentProgramInfo,
1363 const MachineFunction &MF) const {
1364 const Function &F = MF.getFunction();
1365 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1366 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1367
1369 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1370 MCContext &Ctx = MF.getContext();
1371
1372 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1373
1375 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1377 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1379
1380 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1381
1383 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1384
1385 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1386 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1388 }
1389
1390 if (UserSGPRInfo.hasDispatchPtr())
1392
1393 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1395
1396 if (UserSGPRInfo.hasKernargSegmentPtr())
1398
1399 if (UserSGPRInfo.hasDispatchID())
1401
1402 if (UserSGPRInfo.hasFlatScratchInit())
1404
1405 if (UserSGPRInfo.hasPrivateSegmentSize())
1407
1408 if (UserSGPRInfo.hasDispatchPtr())
1410
1411 if (STM.isXNACKEnabled())
1413
1414 Align MaxKernArgAlign;
1415 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1416 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1417 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1418 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1419 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1420
1421 // kernarg_segment_alignment is specified as log2 of the alignment.
1422 // The minimum alignment is 16.
1423 // FIXME: The metadata treats the minimum as 4?
1424 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1425}
1426
1428 const char *ExtraCode, raw_ostream &O) {
1429 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1430 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1431 return false;
1432
1433 if (ExtraCode && ExtraCode[0]) {
1434 if (ExtraCode[1] != 0)
1435 return true; // Unknown modifier.
1436
1437 switch (ExtraCode[0]) {
1438 case 'r':
1439 break;
1440 default:
1441 return true;
1442 }
1443 }
1444
1445 // TODO: Should be able to support other operand types like globals.
1446 const MachineOperand &MO = MI->getOperand(OpNo);
1447 if (MO.isReg()) {
1450 return false;
1451 }
1452 if (MO.isImm()) {
1453 int64_t Val = MO.getImm();
1455 O << Val;
1456 } else if (isUInt<16>(Val)) {
1457 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1458 } else if (isUInt<32>(Val)) {
1459 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1460 } else {
1461 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1462 }
1463 return false;
1464 }
1465 return true;
1466}
1467
1472}
1473
1474void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1475 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1476 bool isModuleEntryFunction, bool hasMAIInsts) {
1477 if (!ORE)
1478 return;
1479
1480 const char *Name = "kernel-resource-usage";
1481 const char *Indent = " ";
1482
1483 // If the remark is not specifically enabled, do not output to yaml
1486 return;
1487
1488 // Currently non-kernel functions have no resources to emit.
1490 return;
1491
1492 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1493 StringRef RemarkLabel, auto Argument) {
1494 // Add an indent for every line besides the line with the kernel name. This
1495 // makes it easier to tell which resource usage goes with which kernel since
1496 // the kernel name will always be displayed first.
1497 std::string LabelStr = RemarkLabel.str() + ": ";
1498 if (RemarkName != "FunctionName")
1499 LabelStr = Indent + LabelStr;
1500
1501 ORE->emit([&]() {
1502 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1504 &MF.front())
1505 << LabelStr << ore::NV(RemarkName, Argument);
1506 });
1507 };
1508
1509 // FIXME: Formatting here is pretty nasty because clang does not accept
1510 // newlines from diagnostics. This forces us to emit multiple diagnostic
1511 // remarks to simulate newlines. If and when clang does accept newlines, this
1512 // formatting should be aggregated into one remark with newlines to avoid
1513 // printing multiple diagnostic locations and diag opts.
1514 EmitResourceUsageRemark("FunctionName", "Function Name",
1515 MF.getFunction().getName());
1516 EmitResourceUsageRemark("NumSGPR", "SGPRs",
1517 getMCExprStr(CurrentProgramInfo.NumSGPR));
1518 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1519 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1520 if (hasMAIInsts) {
1521 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1522 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1523 }
1524 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1525 getMCExprStr(CurrentProgramInfo.ScratchSize));
1526 int64_t DynStack;
1527 bool DynStackEvaluatable =
1528 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1529 StringRef DynamicStackStr =
1530 DynStackEvaluatable && DynStack ? "True" : "False";
1531 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1532 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1533 getMCExprStr(CurrentProgramInfo.Occupancy));
1534 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1535 CurrentProgramInfo.SGPRSpill);
1536 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1537 CurrentProgramInfo.VGPRSpill);
1538 if (isModuleEntryFunction)
1539 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1540 CurrentProgramInfo.LDSSize);
1541}
#define Success
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:131
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1047
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1185
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1167
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1083
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1159
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1118
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1180
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1070
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1069
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1078
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1117
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1056
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1178
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1120
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1199
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1166
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1177
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1061
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1200
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1055
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1080
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1054
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:69
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:83
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:86
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:383
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:676
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:698
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:89
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:92
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:104
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:434
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:631
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:425
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:379
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:116
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:96
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:101
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:257
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:671
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1837
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:266
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:609
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:613
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:302
bool dumpCode() const
Definition: GCNSubtarget.h:513
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:601
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:981
Generation getGeneration() const
Definition: GCNSubtarget.h:317
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:321
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:290
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:124
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns a const raw pointer to the DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:186
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:532
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:597
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:567
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:587
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:552
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:602
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:617
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:193
Context object for machine code objects.
Definition: MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:416
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1068
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:34
MCSection * getReadOnlySection() const
MCContext & getContext() const
This represents a section on Linux, lots of Unix variants and some bare metal systems.
Definition: MCSectionELF.h:27
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:150
MCContext & getContext() const
Definition: MCStreamer.h:300
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:102
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:462
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:838
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:215
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isEntryFunctionCC(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1089
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1365
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2431
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1856
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.