LLVM 19.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
37#include "llvm/MC/MCAssembler.h"
38#include "llvm/MC/MCContext.h"
40#include "llvm/MC/MCStreamer.h"
46
47using namespace llvm;
48using namespace llvm::AMDGPU;
49
50// This should get the default rounding mode from the kernel. We just set the
51// default here, but this could change if the OpenCL rounding mode pragmas are
52// used.
53//
54// The denormal mode here should match what is reported by the OpenCL runtime
55// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
56// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
57//
58// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
59// precision, and leaves single precision to flush all and does not report
60// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
61// CL_FP_DENORM for both.
62//
63// FIXME: It seems some instructions do not support single precision denormals
64// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
65// and sin_f32, cos_f32 on most parts).
66
67// We want to use these instructions, and using fp32 denormals also causes
68// instructions to run at the double precision rate for the device so it's
69// probably best to just report no single precision denormals.
73 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
74 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
75}
76
77static AsmPrinter *
79 std::unique_ptr<MCStreamer> &&Streamer) {
80 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
81}
82
88}
89
91 std::unique_ptr<MCStreamer> Streamer)
92 : AsmPrinter(TM, std::move(Streamer)) {
93 assert(OutStreamer && "AsmPrinter constructed without streamer");
94}
95
97 return "AMDGPU Assembly Printer";
98}
99
101 return TM.getMCSubtargetInfo();
102}
103
105 if (!OutStreamer)
106 return nullptr;
107 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
108}
109
112}
113
// Prepares the target streamer for module M: makes sure a target ID has been
// set up and then starts the HSA metadata stream for the module.
// NOTE(review): this listing has lost several lines of the body (listing
// lines 115, 122-123, 126, 128-129 and 134-135), so the conditions guarding
// the early return and the metadata block are not visible here -- confirm
// against the upstream file.
114void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
116
117 // TODO: Which one is called first, emitStartOfAsmFile or
118 // emitFunctionBodyStart?
// Only initialize the target ID when a target streamer exists and it does not
// have a target ID yet.
119 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
120 initializeTargetID(M);
121
// NOTE(review): the condition for this early return is missing from the
// listing.
124 return;
125
127
// NOTE(review): the opening of this block, and the call that ends with the
// CodeObjectVersion argument, are missing from the listing.
130 CodeObjectVersion);
// Start metadata collection for M using the streamer's target ID.
131 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
132 }
133
136}
137
138uint64_t AMDGPUAsmPrinter::getMCExprValue(const MCExpr *Value, MCContext &Ctx) {
139 int64_t Val;
140 if (!Value->evaluateAsAbsolute(Val)) {
141 Ctx.reportError(SMLoc(), "could not resolve expression when required.");
142 return 0;
143 }
144 return static_cast<uint64_t>(Val);
145}
146
148 // Init target streamer if it has not yet happened
150 initTargetStreamer(M);
151
154
155 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
156 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
158 HSAMetadataStream->end();
159 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
160 (void)Success;
161 assert(Success && "Malformed HSA Metadata");
162 }
163}
164
167 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
168 const Function &F = MF->getFunction();
169
170 // TODO: We're checking this late, would be nice to check it earlier.
171 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
173 STM.getCPU() + " is only available on code object version 6 or better",
174 /*gen_crash_diag*/ false);
175 }
176
177 // TODO: Which one is called first, emitStartOfAsmFile or
178 // emitFunctionBodyStart?
179 if (!getTargetStreamer()->getTargetID())
180 initializeTargetID(*F.getParent());
181
182 const auto &FunctionTargetID = STM.getTargetID();
183 // Make sure function's xnack settings are compatible with module's
184 // xnack settings.
185 if (FunctionTargetID.isXnackSupported() &&
186 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
187 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
188 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
189 "' function does not match module xnack setting");
190 return;
191 }
192 // Make sure function's sramecc settings are compatible with module's
193 // sramecc settings.
194 if (FunctionTargetID.isSramEccSupported() &&
195 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
196 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
197 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
198 "' function does not match module sramecc setting");
199 return;
200 }
201
202 if (!MFI.isEntryFunction())
203 return;
204
205 if (STM.isMesaKernel(F) &&
206 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
207 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
208 amd_kernel_code_t KernelCode;
209 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
211 }
212
213 if (STM.isAmdHsaOS())
214 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
215
216 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
219 STM.isAmdHsaOS());
220 }
221}
222
225 if (!MFI.isEntryFunction())
226 return;
227
229 return;
230
231 auto &Streamer = getTargetStreamer()->getStreamer();
232 auto &Context = Streamer.getContext();
233 auto &ObjectFileInfo = *Context.getObjectFileInfo();
234 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
235
236 Streamer.pushSection();
237 Streamer.switchSection(&ReadOnlySection);
238
239 // CP microcode requires the kernel descriptor to be allocated on 64 byte
240 // alignment.
241 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
242 ReadOnlySection.ensureMinAlignment(Align(64));
243
244 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
245
246 SmallString<128> KernelName;
247 getNameWithPrefix(KernelName, &MF->getFunction());
249 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
250 getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Context),
251 getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Context) -
253 &STM, getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
254 getMCExprValue(CurrentProgramInfo.FlatUsed, Context),
255 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
256 getMCExprValue(CurrentProgramInfo.VCCUsed, Context),
257 getMCExprValue(CurrentProgramInfo.FlatUsed, Context));
258
259 Streamer.popSection();
260}
261
263 Register RegNo = MI->getOperand(0).getReg();
264
267 OS << "implicit-def: "
268 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
269
270 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
271 OS << " : SGPR spill to VGPR lane";
272
273 OutStreamer->AddComment(OS.str());
274 OutStreamer->addBlankLine();
275}
276
280 return;
281 }
282
284 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
285 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
286 SmallString<128> SymbolName;
287 getNameWithPrefix(SymbolName, &MF->getFunction()),
289 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
290 }
291 if (DumpCodeInstEmitter) {
292 // Disassemble function name label to text.
293 DisasmLines.push_back(MF->getName().str() + ":");
294 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
295 HexLines.push_back("");
296 }
297
299}
300
302 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
303 // Write a line for the basic block label if it is not only fallthrough.
304 DisasmLines.push_back(
305 (Twine("BB") + Twine(getFunctionNumber())
306 + "_" + Twine(MBB.getNumber()) + ":").str());
307 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
308 HexLines.push_back("");
309 }
311}
312
315 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
317 Twine(GV->getName()) +
318 ": unsupported initializer for address space");
319 return;
320 }
321
322 // LDS variables aren't emitted in HSA or PAL yet.
324 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
325 return;
326
327 MCSymbol *GVSym = getSymbol(GV);
328
329 GVSym->redefineIfPossible();
330 if (GVSym->isDefined() || GVSym->isVariable())
331 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
332 "' is already defined");
333
334 const DataLayout &DL = GV->getParent()->getDataLayout();
335 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
336 Align Alignment = GV->getAlign().value_or(Align(4));
337
338 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
339 emitLinkage(GV, GVSym);
340 auto TS = getTargetStreamer();
341 TS->emitAMDGPULDS(GVSym, Size, Alignment);
342 return;
343 }
344
346}
347
349 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
350
352 switch (CodeObjectVersion) {
354 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
355 break;
357 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
358 break;
360 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6());
361 break;
362 default:
363 report_fatal_error("Unexpected code object version");
364 }
365 }
367}
368
370 // Pad with s_code_end to help tools and guard against instruction prefetch
371 // causing stale data in caches. Arguably this should be done by the linker,
372 // which is why this isn't done for Mesa.
373 const MCSubtargetInfo &STI = *getGlobalSTI();
374 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
377 OutStreamer->switchSection(getObjFileLowering().getTextSection());
379 }
380
382}
383
384// Print comments that apply to both callable functions and entry points.
385void AMDGPUAsmPrinter::emitCommonFunctionComments(
386 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
387 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
388 const AMDGPUMachineFunction *MFI) {
389 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
390 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
391 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
392 if (NumAGPR) {
393 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
394 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
395 false);
396 }
397 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
398 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
399 false);
400}
401
// Builds the 16-bit kernel code properties bitfield for the kernel in MF.
// Each user SGPR that UserSGPRInfo reports as present ORs in the matching
// amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_* bit; the wavefront-size and
// dynamic-stack bits are handled at the end. Returns the accumulated bits.
402uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
403 const MachineFunction &MF) const {
// NOTE(review): MFI is used below, but its declaration (listing line 404) is
// absent from this listing -- confirm against the upstream file.
405 uint16_t KernelCodeProperties = 0;
406 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
407
408 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
409 KernelCodeProperties |=
410 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
411 }
412 if (UserSGPRInfo.hasDispatchPtr()) {
413 KernelCodeProperties |=
414 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
415 }
// The queue pointer bit is only set for code object versions older than V5.
416 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
417 KernelCodeProperties |=
418 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
419 }
420 if (UserSGPRInfo.hasKernargSegmentPtr()) {
421 KernelCodeProperties |=
422 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
423 }
424 if (UserSGPRInfo.hasDispatchID()) {
425 KernelCodeProperties |=
426 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
427 }
428 if (UserSGPRInfo.hasFlatScratchInit()) {
429 KernelCodeProperties |=
430 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
431 }
// NOTE(review): the condition guarding the wavefront-size bit (listing line
// 432) is missing from this listing; only its body and closing brace remain.
433 KernelCodeProperties |=
434 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
435 }
436
// USES_DYNAMIC_STACK is only set for code object V5 and newer, and only when
// the DynamicCallStack expression evaluates to a non-zero value.
437 if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, MF.getContext()) &&
438 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
439 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
440
441 return KernelCodeProperties;
442}
443
// Fills in and returns an MCKernelDescriptor for the kernel in MF, using the
// program info PI passed by the caller together with the member
// CurrentProgramInfo (which supplies compute_pgm_rsrc3).
// NOTE(review): this listing has lost the return-type line (listing line 444),
// the declarations of STM and Info (447, 449) and the right-hand side of the
// group_segment_fixed_size assignment (455) -- confirm against the upstream
// file.
445AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
446 const SIProgramInfo &PI) const {
448 const Function &F = MF.getFunction();
450 MCContext &Ctx = MF.getContext();
451
452 MCKernelDescriptor KernelDescriptor;
453
454 KernelDescriptor.group_segment_fixed_size =
// NOTE(review): the value assigned above is on a line missing from this
// listing.
456 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
457
// MaxKernArgAlign is only passed to getKernArgSegmentSize; its value is not
// read again in this function.
458 Align MaxKernArgAlign;
459 KernelDescriptor.kernarg_size = MCConstantExpr::create(
460 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
461
462 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
463 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
464 KernelDescriptor.kernel_code_properties =
465 MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
466
// On subtargets without GFX90A instructions the rsrc3 expression is expected
// to evaluate to zero.
467 assert(STM.hasGFX90AInsts() ||
468 getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
469 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
470
// kernarg_preload is the number of preloaded kernarg SGPRs when
// AMDGPU::hasKernargPreload(STM) is true, and 0 otherwise.
471 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
472 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
473 Ctx);
474
475 return KernelDescriptor;
476}
477
479 // Init target streamer lazily on the first function so that previous passes
480 // can set metadata.
482 initTargetStreamer(*MF.getFunction().getParent());
483
484 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
485 CurrentProgramInfo.reset(MF);
486
488 MCContext &Ctx = MF.getContext();
489
490 // The starting address of all shader programs must be 256 bytes aligned.
491 // Regular functions just need the basic required instruction alignment.
492 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
493
495
498 // FIXME: This should be an explicit check for Mesa.
499 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
500 MCSectionELF *ConfigSection =
501 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
502 OutStreamer->switchSection(ConfigSection);
503 }
504
505 if (MFI->isModuleEntryFunction()) {
506 getSIProgramInfo(CurrentProgramInfo, MF);
507 }
508
509 if (STM.isAmdPalOS()) {
510 if (MFI->isEntryFunction())
511 EmitPALMetadata(MF, CurrentProgramInfo);
512 else if (MFI->isModuleEntryFunction())
513 emitPALFunctionMetadata(MF);
514 } else if (!STM.isAmdHsaOS()) {
515 EmitProgramInfoSI(MF, CurrentProgramInfo);
516 }
517
518 DumpCodeInstEmitter = nullptr;
519 if (STM.dumpCode()) {
520 // For -dumpcode, get the assembler out of the streamer, even if it does
521 // not really want to let us have it. This only works with -filetype=obj.
522 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
523 OutStreamer->setUseAssemblerInfoForParsing(true);
524 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
525 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
526 if (Assembler)
527 DumpCodeInstEmitter = Assembler->getEmitterPtr();
528 }
529
530 DisasmLines.clear();
531 HexLines.clear();
533
535
536 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
537 STM.hasMAIInsts());
538
539 if (isVerbose()) {
540 MCSectionELF *CommentSection =
541 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
542 OutStreamer->switchSection(CommentSection);
543
544 if (!MFI->isEntryFunction()) {
545 OutStreamer->emitRawComment(" Function info:", false);
547 ResourceUsage->getResourceInfo(&MF.getFunction());
548 emitCommonFunctionComments(
549 Info.NumVGPR,
550 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
551 Info.getTotalNumVGPRs(STM),
552 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
553 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
554 return false;
555 }
556
557 OutStreamer->emitRawComment(" Kernel info:", false);
558 emitCommonFunctionComments(
559 getMCExprValue(CurrentProgramInfo.NumArchVGPR, Ctx),
560 STM.hasMAIInsts() ? getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx)
561 : std::optional<uint32_t>(),
562 getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx),
563 getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx),
564 getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx),
565 getFunctionCodeSize(MF), MFI);
566
567 OutStreamer->emitRawComment(
568 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
569 OutStreamer->emitRawComment(
570 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
571 OutStreamer->emitRawComment(
572 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
573 " bytes/workgroup (compile time only)", false);
574
575 OutStreamer->emitRawComment(
576 " SGPRBlocks: " +
577 Twine(getMCExprValue(CurrentProgramInfo.SGPRBlocks, Ctx)),
578 false);
579 OutStreamer->emitRawComment(
580 " VGPRBlocks: " +
581 Twine(getMCExprValue(CurrentProgramInfo.VGPRBlocks, Ctx)),
582 false);
583
584 OutStreamer->emitRawComment(
585 " NumSGPRsForWavesPerEU: " +
586 Twine(
587 getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx)),
588 false);
589 OutStreamer->emitRawComment(
590 " NumVGPRsForWavesPerEU: " +
591 Twine(
592 getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx)),
593 false);
594
595 if (STM.hasGFX90AInsts())
596 OutStreamer->emitRawComment(
597 " AccumOffset: " +
598 Twine((getMCExprValue(CurrentProgramInfo.AccumOffset, Ctx) + 1) *
599 4),
600 false);
601
602 OutStreamer->emitRawComment(
603 " Occupancy: " +
604 Twine(getMCExprValue(CurrentProgramInfo.Occupancy, Ctx)),
605 false);
606
607 OutStreamer->emitRawComment(
608 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
609
610 OutStreamer->emitRawComment(
611 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
612 Twine(getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx)),
613 false);
614 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
615 Twine(CurrentProgramInfo.UserSGPR),
616 false);
617 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
618 Twine(CurrentProgramInfo.TrapHandlerEnable),
619 false);
620 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
621 Twine(CurrentProgramInfo.TGIdXEnable),
622 false);
623 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
624 Twine(CurrentProgramInfo.TGIdYEnable),
625 false);
626 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
627 Twine(CurrentProgramInfo.TGIdZEnable),
628 false);
629 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
630 Twine(CurrentProgramInfo.TIdIGCompCount),
631 false);
632
633 assert(STM.hasGFX90AInsts() ||
634 getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx) == 0);
635 if (STM.hasGFX90AInsts()) {
636 OutStreamer->emitRawComment(
637 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
639 getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
640 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
641 false);
642 OutStreamer->emitRawComment(
643 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
645 getMCExprValue(CurrentProgramInfo.ComputePGMRSrc3GFX90A, Ctx),
646 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
647 false);
648 }
649 }
650
651 if (DumpCodeInstEmitter) {
652
653 OutStreamer->switchSection(
654 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
655
656 for (size_t i = 0; i < DisasmLines.size(); ++i) {
657 std::string Comment = "\n";
658 if (!HexLines[i].empty()) {
659 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
660 Comment += " ; " + HexLines[i] + "\n";
661 }
662
663 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
664 OutStreamer->emitBytes(StringRef(Comment));
665 }
666 }
667
668 return false;
669}
670
// Initializes the target streamer's target ID for module M and then refines
// its xnack and sramecc settings from the functions in the module.
671// TODO: Fold this into emitFunctionBodyStart.
672void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
673 // In the beginning all features are either 'Any' or 'NotSupported',
674 // depending on global target features. This will cover empty modules.
// NOTE(review): the start of the call whose last argument appears on the next
// line (listing line 675) is missing from this listing.
676 getGlobalSTI()->getFeatureString());
677
678 // If module is empty, we are done.
679 if (M.empty())
680 return;
681
682 // If module is not empty, need to find first 'Off' or 'On' feature
683 // setting per feature from functions in module.
684 for (auto &F : M) {
685 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop scanning once every supported feature already has an On/Off setting.
686 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
687 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
688 break;
689
// NOTE(review): the declaration of STM (listing line 690) is missing from this
// listing.
691 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// A supported feature whose setting is still 'Any' takes over the setting from
// STMTargetID; settings that are no longer 'Any' are left untouched.
692 if (TSTargetID->isXnackSupported())
693 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
694 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
695 if (TSTargetID->isSramEccSupported())
696 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
697 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
698 }
699}
700
701uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
703 const SIInstrInfo *TII = STM.getInstrInfo();
704
705 uint64_t CodeSize = 0;
706
707 for (const MachineBasicBlock &MBB : MF) {
708 for (const MachineInstr &MI : MBB) {
709 // TODO: CodeSize should account for multiple functions.
710
711 // TODO: Should we count size of debug info?
712 if (MI.isDebugInstr())
713 continue;
714
715 CodeSize += TII->getInstSizeInBytes(MI);
716 }
717 }
718
719 return CodeSize;
720}
721
722void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
723 const MachineFunction &MF) {
725 ResourceUsage->getResourceInfo(&MF.getFunction());
727 MCContext &Ctx = MF.getContext();
728
729 auto CreateExpr = [&Ctx](int64_t Value) {
730 return MCConstantExpr::create(Value, Ctx);
731 };
732
733 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
734 int64_t Val;
735 if (Value->evaluateAsAbsolute(Val)) {
736 Res = Val;
737 return true;
738 }
739 return false;
740 };
741
742 ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
743 ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
744 ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
745 ProgInfo.AccumOffset =
746 CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
747 ProgInfo.TgSplit = STM.isTgSplitEnabled();
748 ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
749 ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
750 ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
751 ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
752 ProgInfo.DynamicCallStack =
753 CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
754
755 const uint64_t MaxScratchPerWorkitem =
757 uint64_t ScratchSize;
758 if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
759 ScratchSize > MaxScratchPerWorkitem) {
760 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
761 MaxScratchPerWorkitem, DS_Error);
762 MF.getFunction().getContext().diagnose(DiagStackSize);
763 }
764
766
767 // The calculations related to SGPR/VGPR blocks are
768 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
769 // unified.
771 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
772 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
773
774 // Check the addressable register limit before we add ExtraSGPRs.
776 !STM.hasSGPRInitBug()) {
777 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
778 uint64_t NumSgpr;
779 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
780 NumSgpr > MaxAddressableNumSGPRs) {
781 // This can happen due to a compiler bug or when using inline asm.
784 MF.getFunction(), "addressable scalar registers", NumSgpr,
785 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
786 Ctx.diagnose(Diag);
787 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
788 }
789 }
790
791 // Account for extra SGPRs and VGPRs reserved for debugger use.
792 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
793
794 const Function &F = MF.getFunction();
795
796 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
797 // dispatch registers are function args.
798 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
799
800 if (isShader(F.getCallingConv())) {
801 bool IsPixelShader =
802 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
803
804 // Calculate the number of VGPR registers based on the SPI input registers
805 uint32_t InputEna = 0;
806 uint32_t InputAddr = 0;
807 unsigned LastEna = 0;
808
809 if (IsPixelShader) {
810 // Note for IsPixelShader:
811 // By this stage, all enabled inputs are tagged in InputAddr as well.
812 // We will use InputAddr to determine whether the input counts against the
813 // vgpr total and only use the InputEnable to determine the last input
814 // that is relevant - if extra arguments are used, then we have to honour
815 // the InputAddr for any intermediate non-enabled inputs.
816 InputEna = MFI->getPSInputEnable();
817 InputAddr = MFI->getPSInputAddr();
818
819 // We only need to consider input args up to the last used arg.
820 assert((InputEna || InputAddr) &&
821 "PSInputAddr and PSInputEnable should "
822 "never both be 0 for AMDGPU_PS shaders");
823 // There are some rare circumstances where InputAddr is non-zero and
824 // InputEna can be set to 0. In this case we default to setting LastEna
825 // to 1.
826 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
827 }
828
829 // FIXME: We should be using the number of registers determined during
830 // calling convention lowering to legalize the types.
831 const DataLayout &DL = F.getParent()->getDataLayout();
832 unsigned PSArgCount = 0;
833 unsigned IntermediateVGPR = 0;
834 for (auto &Arg : F.args()) {
835 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
836 if (Arg.hasAttribute(Attribute::InReg)) {
837 WaveDispatchNumSGPR += NumRegs;
838 } else {
839 // If this is a PS shader and we're processing the PS Input args (first
840 // 16 VGPR), use the InputEna and InputAddr bits to define how many
841 // VGPRs are actually used.
842 // Any extra VGPR arguments are handled as normal arguments (and
843 // contribute to the VGPR count whether they're used or not).
844 if (IsPixelShader && PSArgCount < 16) {
845 if ((1 << PSArgCount) & InputAddr) {
846 if (PSArgCount < LastEna)
847 WaveDispatchNumVGPR += NumRegs;
848 else
849 IntermediateVGPR += NumRegs;
850 }
851 PSArgCount++;
852 } else {
853 // If there are extra arguments we have to include the allocation for
854 // the non-used (but enabled with InputAddr) input arguments
855 if (IntermediateVGPR) {
856 WaveDispatchNumVGPR += IntermediateVGPR;
857 IntermediateVGPR = 0;
858 }
859 WaveDispatchNumVGPR += NumRegs;
860 }
861 }
862 }
864 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
865
867 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
868
870 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
871 }
872
873 // Adjust number of registers used to meet default/requested minimum/maximum
874 // number of waves per execution unit request.
875 unsigned MaxWaves = MFI->getMaxWavesPerEU();
877 {ProgInfo.NumSGPR, CreateExpr(1ul),
878 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
879 Ctx);
881 {ProgInfo.NumVGPR, CreateExpr(1ul),
882 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
883 Ctx);
884
886 STM.hasSGPRInitBug()) {
887 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
888 uint64_t NumSgpr;
889 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
890 NumSgpr > MaxAddressableNumSGPRs) {
891 // This can happen due to a compiler bug or when using inline asm to use
892 // the registers which are usually reserved for vcc etc.
894 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
895 NumSgpr, MaxAddressableNumSGPRs,
897 Ctx.diagnose(Diag);
898 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
899 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
900 }
901 }
902
903 if (STM.hasSGPRInitBug()) {
904 ProgInfo.NumSGPR =
906 ProgInfo.NumSGPRsForWavesPerEU =
908 }
909
910 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
912 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
913 MFI->getNumUserSGPRs(),
915 Ctx.diagnose(Diag);
916 }
917
918 if (MFI->getLDSSize() >
919 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
922 MF.getFunction(), "local memory", MFI->getLDSSize(),
924 Ctx.diagnose(Diag);
925 }
926 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
927 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
928 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
929 unsigned Granule) {
930 const MCExpr *OneConst = CreateExpr(1ul);
931 const MCExpr *GranuleConst = CreateExpr(Granule);
932 const MCExpr *MaxNumGPR =
933 AMDGPUVariadicMCExpr::createMax({NumGPR, OneConst}, Ctx);
934 const MCExpr *AlignToGPR =
935 AMDGPUVariadicMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
936 const MCExpr *DivGPR =
937 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
938 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
939 return SubGPR;
940 };
941
942 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
944 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
946
947 const SIModeRegisterDefaults Mode = MFI->getMode();
948
949 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
950 // register.
951 ProgInfo.FloatMode = getFPMode(Mode);
952
953 ProgInfo.IEEEMode = Mode.IEEE;
954
955 // Make the clamp modifier return 0 on NaN input.
956 ProgInfo.DX10Clamp = Mode.DX10Clamp;
957
958 unsigned LDSAlignShift;
960 // LDS is allocated in 64 dword blocks.
961 LDSAlignShift = 8;
962 } else {
963 // LDS is allocated in 128 dword blocks.
964 LDSAlignShift = 9;
965 }
966
967 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
968 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
969
970 ProgInfo.LDSSize = MFI->getLDSSize();
971 ProgInfo.LDSBlocks =
972 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
973
974 // The MCExpr equivalent of divideCeil.
975 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
976 const MCExpr *Ceil =
977 AMDGPUVariadicMCExpr::createAlignTo(Numerator, Denominator, Ctx);
978 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
979 };
980
981 // Scratch is allocated in 64-dword or 256-dword blocks.
982 unsigned ScratchAlignShift =
983 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
984 // We need to program the hardware with the amount of scratch memory that
985 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
986 // scratch memory used per thread.
987 ProgInfo.ScratchBlocks = DivideCeil(
989 CreateExpr(STM.getWavefrontSize()), Ctx),
990 CreateExpr(1ULL << ScratchAlignShift));
991
992 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
993 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
994 ProgInfo.MemOrdered = 1;
995 }
996
997 // 0 = X, 1 = XY, 2 = XYZ
998 unsigned TIDIGCompCnt = 0;
999 if (MFI->hasWorkItemIDZ())
1000 TIDIGCompCnt = 2;
1001 else if (MFI->hasWorkItemIDY())
1002 TIDIGCompCnt = 1;
1003
1004 // The private segment wave byte offset is the last of the system SGPRs. We
1005 // initially assumed it was allocated, and may have used it. It shouldn't harm
1006 // anything to disable it if we know the stack isn't used here. We may still
1007 // have emitted code reading it to initialize scratch, but if that's unused
1008 // reading garbage should be OK.
1011 MCConstantExpr::create(0, Ctx), Ctx),
1012 ProgInfo.DynamicCallStack, Ctx);
1013
1014 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1015 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1016 ProgInfo.TrapHandlerEnable =
1017 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1018 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1019 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1020 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1021 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1022 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1023 ProgInfo.EXCPEnMSB = 0;
1024 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1025 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1026 ProgInfo.EXCPEnable = 0;
1027
1028 if (STM.hasGFX90AInsts()) {
1029 // return ((Dst & ~Mask) | (Value << Shift))
1030 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1031 uint32_t Shift) {
1032 auto Shft = MCConstantExpr::create(Shift, Ctx);
1033 auto Msk = MCConstantExpr::create(Mask, Ctx);
1034 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1036 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1037 return Dst;
1038 };
1039
1040 ProgInfo.ComputePGMRSrc3GFX90A =
1041 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1042 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1043 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1044 ProgInfo.ComputePGMRSrc3GFX90A =
1045 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1046 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1047 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1048 }
1049
1051 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1052 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1053
1054 const auto [MinWEU, MaxWEU] =
1055 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1056 uint64_t Occupancy;
1057 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1059 F, F.getSubprogram(),
1060 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1061 "'" +
1062 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1063 ", final occupancy is " + Twine(Occupancy));
1064 F.getContext().diagnose(Diag);
1065 }
1066}
1067
1068static unsigned getRsrcReg(CallingConv::ID CallConv) {
1069 switch (CallConv) {
1070 default: [[fallthrough]];
1078 }
1079}
1080
// Emit the program resource settings of MF to OutStreamer as a sequence of
// 32-bit values: the PGM_RSRC words and scratch wave-size field derived from
// CurrentProgramInfo, pixel-shader input settings, and finally the spilled
// SGPR/VGPR counts, each count preceded by its R_SPILLED_* identifier.
//
// NOTE(review): this text comes from a numbered listing. Every line starts
// with its listing number, and listing lines 1083, 1092, 1104-1105, 1110,
// 1113, 1141, 1160-1161, 1166 and 1168 are absent. The function cannot build
// until they are restored from the upstream file.
1081void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1082 const SIProgramInfo &CurrentProgramInfo) {
 // NOTE(review): MFI is used further down, but its declaration (listing line
 // 1083) is missing here - confirm against upstream.
1084 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1085 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1086 MCContext &Ctx = MF.getContext();
1087
1088 // (((Value) & Mask) << Shift)
 // Builds the field as an MCExpr so that it works for values that are not yet
 // resolved to constants.
 // NOTE(review): listing line 1092, the start of the return expression, is
 // missing; only its trailing arguments remain below.
1089 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1090 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1091 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1093 shft, Ctx);
1094 };
1095
 // Emit Value as a Size-byte integer when it already evaluates to an absolute
 // constant; otherwise emit the expression itself.
1096 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1097 int64_t Val;
1098 if (Value->evaluateAsAbsolute(Val))
1099 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1100 else
1101 OutStreamer->emitValue(Value, Size);
1102 };
1103
 // NOTE(review): the condition opening this branch and the statement before
 // each value emitted below (listing lines 1104-1105, 1110, 1113) are missing.
 // The `} else {` further down shows that an `if` starts here.
1106
1107 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1108 /*Size=*/4);
1109
1111 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1112
1114
1115 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1116 // appropriate generation.
 // The field always starts at bit 12; only the mask width changes between
 // GFX12+, GFX11 and older generations.
1117 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1118 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1119 /*Mask=*/0x3FFFF, /*Shift=*/12),
1120 /*Size=*/4);
1121 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1122 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1123 /*Mask=*/0x7FFF, /*Shift=*/12),
1124 /*Size=*/4);
1125 } else {
1126 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1127 /*Mask=*/0x1FFF, /*Shift=*/12),
1128 /*Size=*/4);
1129 }
1130
1131 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1132 // 0" comment but I don't see a corresponding field in the register spec.
1133 } else {
 // Other branch: the register id comes from getRsrcReg() and the value packs
 // VGPRBlocks into bits 0-5 and SGPRBlocks into the 4 bits starting at bit 6.
1134 OutStreamer->emitInt32(RsrcReg);
1135
1136 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1137 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1138 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1139 MF.getContext());
1140 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
 // NOTE(review): listing line 1141 is missing here.
1142
1143 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1144 // appropriate generation.
1145 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1146 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1147 /*Mask=*/0x3FFFF, /*Shift=*/12),
1148 /*Size=*/4);
1149 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1150 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1151 /*Mask=*/0x7FFF, /*Shift=*/12),
1152 /*Size=*/4);
1153 } else {
1154 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1155 /*Mask=*/0x1FFF, /*Shift=*/12),
1156 /*Size=*/4);
1157 }
1158 }
1159
 // NOTE(review): the `if` that opens the block closed at listing line 1170,
 // and the statements at listing lines 1161, 1166 and 1168, are missing.
 // On GFX11 and later the extra LDS size is half the LDS block count, rounded
 // up; older generations use the block count unchanged.
1162 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1163 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1164 : CurrentProgramInfo.LDSBlocks;
1165 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1167 OutStreamer->emitInt32(MFI->getPSInputEnable());
1169 OutStreamer->emitInt32(MFI->getPSInputAddr());
1170 }
1171
 // Spill counts are always emitted, each preceded by its identifier.
1172 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1173 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1174 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1175 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1176}
1177
1178// Helper function to add common PAL Metadata 3.0+
1180 const SIProgramInfo &CurrentProgramInfo,
1181 CallingConv::ID CC, const GCNSubtarget &ST) {
1182 if (ST.hasIEEEMode())
1183 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1184
1185 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1186 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1187
1188 if (AMDGPU::isCompute(CC)) {
1189 MD->setHwStage(CC, ".trap_present",
1190 (bool)CurrentProgramInfo.TrapHandlerEnable);
1191 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1192 }
1193
1194 MD->setHwStage(CC, ".lds_size",
1195 (unsigned)(CurrentProgramInfo.LdsSize *
1196 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1197}
1198
1199// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1200// is AMDPAL. It stores each compute/SPI register setting and other PAL
1201// metadata items into the PALMD::Metadata, combining with any provided by the
1202// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1203// is then written as a single block in the .note section.
// Populate the target streamer's PAL metadata for MF from CurrentProgramInfo:
// entry point, register counts, rsrc registers or hardware-stage fields
// (depending on the PAL major version), scratch size, pixel-shader input
// settings, and the wave32 flag.
//
// NOTE(review): this text comes from a numbered listing. Every line starts
// with its listing number, and listing lines 1206 and 1241 are absent. The
// function cannot build until they are restored from the upstream file.
1204void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1205 const SIProgramInfo &CurrentProgramInfo) {
 // NOTE(review): MFI is used further down, but its declaration (listing line
 // 1206) is missing here - confirm against upstream.
1207 auto CC = MF.getFunction().getCallingConv();
1208 auto MD = getTargetStreamer()->getPALMetadata();
1209 auto &Ctx = MF.getContext();
1210
1211 MD->setEntryPoint(CC, MF.getFunction().getName());
1212 MD->setNumUsedVgprs(
1213 CC, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
1214
1215 // Only set AGPRs for supported devices
1216 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1217 if (STM.hasMAIInsts()) {
1218 MD->setNumUsedAgprs(CC, getMCExprValue(CurrentProgramInfo.NumAccVGPR, Ctx));
1219 }
1220
1221 MD->setNumUsedSgprs(
1222 CC, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
 // PAL versions before 3 receive rsrc register values; version 3 and later
 // receive named hardware-stage fields instead.
1223 if (MD->getPALMajorVersion() < 3) {
1224 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
1225 if (AMDGPU::isCompute(CC)) {
1226 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1227 } else {
 // Non-compute stages only set the scratch-enable bit, and only when the
 // function uses at least one scratch block.
1228 if (getMCExprValue(CurrentProgramInfo.ScratchBlocks, Ctx) > 0)
1229 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1230 }
1231 } else {
1232 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1233 MD->setHwStage(CC, ".scratch_en",
1234 (bool)getMCExprValue(CurrentProgramInfo.ScratchEnable, Ctx));
1235 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1236 }
1237
1238 // ScratchSize is in bytes, 16 aligned.
1239 MD->setScratchSize(
1240 CC, alignTo(getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx), 16));
 // NOTE(review): the `if` that opens the block closed at listing line 1276
 // (listing line 1241) is missing. The block only sets pixel-shader fields.
 // On GFX11 and later the extra LDS size is half the LDS block count, rounded
 // up; older generations use the block count unchanged.
1242 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1243 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1244 : CurrentProgramInfo.LDSBlocks;
1245 if (MD->getPALMajorVersion() < 3) {
1246 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1247 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1248 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1249 } else {
1250 // Graphics registers
 // The stored value is ExtraLDSSize scaled by a per-generation granularity
 // (256 on GFX11+, 128 otherwise) and by the dword size.
1251 const unsigned ExtraLdsDwGranularity =
1252 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1253 MD->setGraphicsRegisters(
1254 ".ps_extra_lds_size",
1255 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1256
1257 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
 // Field names in bit order: entry Idx names bit Idx of both input masks.
1258 static StringLiteral const PsInputFields[] = {
1259 ".persp_sample_ena", ".persp_center_ena",
1260 ".persp_centroid_ena", ".persp_pull_model_ena",
1261 ".linear_sample_ena", ".linear_center_ena",
1262 ".linear_centroid_ena", ".line_stipple_tex_ena",
1263 ".pos_x_float_ena", ".pos_y_float_ena",
1264 ".pos_z_float_ena", ".pos_w_float_ena",
1265 ".front_face_ena", ".ancillary_ena",
1266 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1267 unsigned PSInputEna = MFI->getPSInputEnable();
1268 unsigned PSInputAddr = MFI->getPSInputAddr();
 // Expand each mask into one boolean metadata entry per named bit.
1269 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1270 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1271 (bool)((PSInputEna >> Idx) & 1));
1272 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1273 (bool)((PSInputAddr >> Idx) & 1));
1274 }
1275 }
1276 }
1277
1278 // For version 3 and above the wave front size is already set in the metadata
1279 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1280 MD->setWave32(MF.getFunction().getCallingConv());
1281}
1282
1283void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1284 auto *MD = getTargetStreamer()->getPALMetadata();
1285 const MachineFrameInfo &MFI = MF.getFrameInfo();
1286 StringRef FnName = MF.getFunction().getName();
1287 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1289 MCContext &Ctx = MF.getContext();
1290
1291 if (MD->getPALMajorVersion() < 3) {
1292 // Set compute registers
1293 MD->setRsrc1(CallingConv::AMDGPU_CS,
1294 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
1295 MD->setRsrc2(CallingConv::AMDGPU_CS,
1296 CurrentProgramInfo.getComputePGMRSrc2());
1297 } else {
1298 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1299 }
1300
1301 // Set optional info
1302 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1303 MD->setFunctionNumUsedVgprs(
1304 FnName, getMCExprValue(CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx));
1305 MD->setFunctionNumUsedSgprs(
1306 FnName, getMCExprValue(CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx));
1307}
1308
1309// This is supposed to be log2(Size)
1311 switch (Size) {
1312 case 4:
1313 return AMD_ELEMENT_4_BYTES;
1314 case 8:
1315 return AMD_ELEMENT_8_BYTES;
1316 case 16:
1317 return AMD_ELEMENT_16_BYTES;
1318 default:
1319 llvm_unreachable("invalid private_element_size");
1320 }
1321}
1322
// Fill the amd_kernel_code_t header Out for the kernel in MF, using
// CurrentProgramInfo for register, scratch and LDS figures and the function's
// user-SGPR info for the code property flags.
//
// NOTE(review): this text comes from a numbered listing. Every line starts
// with its listing number, and many lines are absent (1330, 1334, 1336, 1339,
// 1342, 1344-1345, 1351, 1355, 1358, 1361, 1364, 1367, 1370, 1373, 1379). Most
// `if` heads below have lost their statement. The function cannot build until
// the lines are restored from the upstream file.
1323void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1324 const SIProgramInfo &CurrentProgramInfo,
1325 const MachineFunction &MF) const {
1326 const Function &F = MF.getFunction();
 // Only the two kernel calling conventions are expected here.
1327 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1328 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1329
 // NOTE(review): MFI is used below, but its declaration (listing line 1330) is
 // missing here - confirm against upstream.
1331 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1332 MCContext &Ctx = MF.getContext();
1333
1335
 // NOTE(review): the left-hand side of this assignment (listing line 1336) is
 // missing. The visible expression places rsrc1 in the low 32 bits and rsrc2
 // in the high 32 bits of a single value.
1337 CurrentProgramInfo.getComputePGMRSrc1(STM) |
1338 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1340
 // NOTE(review): the statement guarded by this condition (listing line 1342)
 // is missing.
1341 if (getMCExprValue(CurrentProgramInfo.DynamicCallStack, Ctx))
1343
 // NOTE(review): the start of this call (listing lines 1344-1345) is missing;
 // only its last argument, the encoded maximum private element size, remains.
1346 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1347
 // One code property per user SGPR the function requested.
 // NOTE(review): the flag named on the right-hand side of each statement below
 // is missing.
1348 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1349 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1350 Out.code_properties |=
1352 }
1353
1354 if (UserSGPRInfo.hasDispatchPtr())
1356
 // The queue pointer is only considered for code object versions before 5.
1357 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1359
1360 if (UserSGPRInfo.hasKernargSegmentPtr())
1362
1363 if (UserSGPRInfo.hasDispatchID())
1365
1366 if (UserSGPRInfo.hasFlatScratchInit())
1368
 // NOTE(review): hasDispatchPtr() is tested a second time here - check
 // upstream whether this duplicate is intentional.
1369 if (UserSGPRInfo.hasDispatchPtr())
1371
1372 if (STM.isXNACKEnabled())
1374
 // getKernArgSegmentSize also reports the largest argument alignment through
 // MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1375 Align MaxKernArgAlign;
1376 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1377 Out.wavefront_sgpr_count = getMCExprValue(CurrentProgramInfo.NumSGPR, Ctx);
1378 Out.workitem_vgpr_count = getMCExprValue(CurrentProgramInfo.NumVGPR, Ctx);
 // NOTE(review): the left-hand side of this assignment (listing line 1379) is
 // missing; the value assigned is the scratch size.
1380 getMCExprValue(CurrentProgramInfo.ScratchSize, Ctx);
1381 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1382
1383 // kernarg_segment_alignment is specified as log of the alignment.
1384 // The minimum alignment is 16.
1385 // FIXME: The metadata treats the minimum as 4?
1386 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1387}
1388
// Print operand OpNo of the inline-asm instruction MI to O. Returns false when
// the operand was printed and true when it could not be handled.
//
// NOTE(review): this text comes from a numbered listing. Every line starts
// with its listing number, and the first line of the signature (listing line
// 1389) plus listing lines 1410-1411 and 1415 are absent. The function cannot
// build until they are restored from the upstream file.
1390 const char *ExtraCode, raw_ostream &O) {
1391 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1392 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1393 return false;
1394
 // Only the single-character modifier 'r' is accepted here; a longer or
 // different modifier is reported as an error.
1395 if (ExtraCode && ExtraCode[0]) {
1396 if (ExtraCode[1] != 0)
1397 return true; // Unknown modifier.
1398
1399 switch (ExtraCode[0]) {
1400 case 'r':
1401 break;
1402 default:
1403 return true;
1404 }
1405 }
1406
1407 // TODO: Should be able to support other operand types like globals.
1408 const MachineOperand &MO = MI->getOperand(OpNo);
1409 if (MO.isReg()) {
 // NOTE(review): the statement that prints the register (listing lines
 // 1410-1411) is missing.
1412 return false;
1413 } else if (MO.isImm()) {
1414 int64_t Val = MO.getImm();
 // NOTE(review): the `if` condition selecting the plain decimal form (listing
 // line 1415) is missing. The remaining cases print hex using the narrowest
 // of 16, 32 or 64 bits that holds the value.
1416 O << Val;
1417 } else if (isUInt<16>(Val)) {
1418 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1419 } else if (isUInt<32>(Val)) {
1420 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1421 } else {
1422 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1423 }
1424 return false;
1425 }
 // Operands that are neither a register nor an immediate are unsupported.
1426 return true;
1427}
1428
1433}
1434
// Emit the "kernel-resource-usage" analysis remarks for MF through ORE, one
// remark per resource: function name, SGPRs, VGPRs, AGPRs (only when
// hasMAIInsts is true), scratch size, dynamic stack, occupancy, SGPR/VGPR
// spills, and LDS size (only when isModuleEntryFunction is true).
// Does nothing when ORE is null.
//
// NOTE(review): this text comes from a numbered listing. Every line starts
// with its listing number, and listing lines 1445-1446 and 1460 are absent.
// The function cannot build until they are restored from the upstream file.
1435void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1436 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1437 bool isModuleEntryFunction, bool hasMAIInsts) {
1438 if (!ORE)
1439 return;
1440
1441 const char *Name = "kernel-resource-usage";
1442 const char *Indent = " ";
1443
1444 // If the remark is not specifically enabled, do not output to yaml
 // NOTE(review): the condition guarding this early return (listing lines
 // 1445-1446) is missing.
1447 return;
1448
 // Helper: emit a single remark whose text is "<label>: <value>".
1449 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1450 StringRef RemarkLabel, auto Argument) {
1451 // Add an indent for every line besides the line with the kernel name. This
1452 // makes it easier to tell which resource usage go with which kernel since
1453 // the kernel name will always be displayed first.
1454 std::string LabelStr = RemarkLabel.str() + ": ";
1455 if (RemarkName != "FunctionName")
1456 LabelStr = Indent + LabelStr;
1457
 // NOTE(review): one constructor argument (listing line 1460) is missing
 // between RemarkName and &MF.front().
1458 ORE->emit([&]() {
1459 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1461 &MF.front())
1462 << LabelStr << ore::NV(RemarkName, Argument);
1463 });
1464 };
1465
1466 // FIXME: Formatting here is pretty nasty because clang does not accept
1467 // newlines from diagnostics. This forces us to emit multiple diagnostic
1468 // remarks to simulate newlines. If and when clang does accept newlines, this
1469 // formatting should be aggregated into one remark with newlines to avoid
1470 // printing multiple diagnostic location and diag opts.
1471 MCContext &MCCtx = MF.getContext();
1472 EmitResourceUsageRemark("FunctionName", "Function Name",
1473 MF.getFunction().getName());
1474 EmitResourceUsageRemark("NumSGPR", "SGPRs",
1475 getMCExprValue(CurrentProgramInfo.NumSGPR, MCCtx));
 // The "VGPRs" remark reports NumArchVGPR; AGPRs are reported separately.
1476 EmitResourceUsageRemark(
1477 "NumVGPR", "VGPRs",
1478 getMCExprValue(CurrentProgramInfo.NumArchVGPR, MCCtx));
1479 if (hasMAIInsts) {
1480 EmitResourceUsageRemark(
1481 "NumAGPR", "AGPRs",
1482 getMCExprValue(CurrentProgramInfo.NumAccVGPR, MCCtx));
1483 }
1484 EmitResourceUsageRemark(
1485 "ScratchSize", "ScratchSize [bytes/lane]",
1486 getMCExprValue(CurrentProgramInfo.ScratchSize, MCCtx));
 // The dynamic-stack flag is reported as the text "True" or "False".
1487 StringRef DynamicStackStr =
1488 getMCExprValue(CurrentProgramInfo.DynamicCallStack, MCCtx) ? "True"
1489 : "False";
1490 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1491 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1492 getMCExprValue(CurrentProgramInfo.Occupancy, MCCtx));
1493 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1494 CurrentProgramInfo.SGPRSpill);
1495 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1496 CurrentProgramInfo.VGPRSpill);
1497 if (isModuleEntryFunction)
1498 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1499 CurrentProgramInfo.LDSSize);
1500}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
if(VerifyEach)
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1046
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1184
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:1080
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1166
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1158
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1117
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1179
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1069
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1068
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1077
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1116
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1055
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1177
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1119
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1198
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1165
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1176
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1060
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1199
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1054
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1079
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1053
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
static const AMDGPUVariadicMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:70
static const AMDGPUVariadicMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:85
static const AMDGPUVariadicMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUVariadicMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUVariadicMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:399
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:704
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:726
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:450
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:659
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:441
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:395
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:265
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:699
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1830
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:809
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:257
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:600
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:604
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:293
bool dumpCode() const
Definition: GCNSubtarget.h:504
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:592
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:936
Generation getGeneration() const
Definition: GCNSubtarget.h:308
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:312
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:247
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:286
unsigned getAddressSpace() const
Definition: GlobalValue.h:204
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:295
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:330
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:536
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:601
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:571
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:591
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:556
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:546
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:606
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Context object for machine code objects.
Definition: MCContext.h:81
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1073
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:35
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:466
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
Represents a location in source code.
Definition: SMLoc.h:23
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:845
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:690
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1063
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1336
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in byt...
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.