LLVM 19.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "GCNSubtarget.h"
27#include "R600AsmPrinter.h"
38#include "llvm/MC/MCAssembler.h"
39#include "llvm/MC/MCContext.h"
41#include "llvm/MC/MCStreamer.h"
47
48using namespace llvm;
49using namespace llvm::AMDGPU;
50
51// This should get the default rounding mode from the kernel. We just set the
52// default here, but this could change if the OpenCL rounding mode pragmas are
53// used.
54//
55// The denormal mode here should match what is reported by the OpenCL runtime
56// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
57// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
58//
59// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
60// precision, and leaves single precision to flush all and does not report
61// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
62// CL_FP_DENORM for both.
63//
64// FIXME: It seems some instructions do not support single precision denormals
65// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
66// and sin_f32, cos_f32 on most parts).
67
68// We want to use these instructions, and using fp32 denormals also causes
69// instructions to run at the double precision rate for the device so it's
70// probably best to just report no single precision denormals.
74 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
75 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
76}
77
78static AsmPrinter *
80 std::unique_ptr<MCStreamer> &&Streamer) {
81 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
82}
83
89}
90
92 std::unique_ptr<MCStreamer> Streamer)
93 : AsmPrinter(TM, std::move(Streamer)) {
94 assert(OutStreamer && "AsmPrinter constructed without streamer");
95}
96
98 return "AMDGPU Assembly Printer";
99}
100
102 return TM.getMCSubtargetInfo();
103}
104
106 if (!OutStreamer)
107 return nullptr;
108 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
109}
110
113}
114
// Prepares the target streamer for module M: makes sure a target ID exists
// (via initializeTargetID) and starts the HSA metadata stream with that ID.
// NOTE(review): several statements of this function are not visible in this
// listing (numbered lines 116, 123-124, 127, 129-130 and 135-136 are absent),
// including the condition guarding the early return and the opening of the
// block that closes at line 133 — consult the full file before editing.
115void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
117
118 // TODO: Which one is called first, emitStartOfAsmFile or
119 // emitFunctionBodyStart?
// Only initialize the target ID when a target streamer exists and it does not
// have one yet.
120 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
121 initializeTargetID(M);
122
125 return;
126
128
131 CodeObjectVersion);
// Begin metadata emission for the module using the streamer's target ID.
132 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
133 }
134
137}
138
140 // Init target streamer if it has not yet happened
142 initTargetStreamer(M);
143
146
147 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
148 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
150 HSAMetadataStream->end();
151 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
152 (void)Success;
153 assert(Success && "Malformed HSA Metadata");
154 }
155}
156
159 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
160 const Function &F = MF->getFunction();
161
162 // TODO: We're checking this late, would be nice to check it earlier.
163 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
165 STM.getCPU() + " is only available on code object version 6 or better",
166 /*gen_crash_diag*/ false);
167 }
168
169 // TODO: Which one is called first, emitStartOfAsmFile or
170 // emitFunctionBodyStart?
171 if (!getTargetStreamer()->getTargetID())
172 initializeTargetID(*F.getParent());
173
174 const auto &FunctionTargetID = STM.getTargetID();
175 // Make sure function's xnack settings are compatible with module's
176 // xnack settings.
177 if (FunctionTargetID.isXnackSupported() &&
178 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
179 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
180 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
181 "' function does not match module xnack setting");
182 return;
183 }
184 // Make sure function's sramecc settings are compatible with module's
185 // sramecc settings.
186 if (FunctionTargetID.isSramEccSupported() &&
187 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
188 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
189 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
190 "' function does not match module sramecc setting");
191 return;
192 }
193
194 if (!MFI.isEntryFunction())
195 return;
196
197 if (STM.isMesaKernel(F) &&
198 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
199 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
200 AMDGPUMCKernelCodeT KernelCode;
201 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
202 KernelCode.validate(&STM, MF->getContext());
204 }
205
206 if (STM.isAmdHsaOS())
207 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
208
209 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
212 STM.isAmdHsaOS());
213 }
214}
215
218 if (!MFI.isEntryFunction())
219 return;
220
222 return;
223
224 auto &Streamer = getTargetStreamer()->getStreamer();
225 auto &Context = Streamer.getContext();
226 auto &ObjectFileInfo = *Context.getObjectFileInfo();
227 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
228
229 Streamer.pushSection();
230 Streamer.switchSection(&ReadOnlySection);
231
232 // CP microcode requires the kernel descriptor to be allocated on 64 byte
233 // alignment.
234 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
235 ReadOnlySection.ensureMinAlignment(Align(64));
236
237 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
238
239 SmallString<128> KernelName;
240 getNameWithPrefix(KernelName, &MF->getFunction());
242 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
243 CurrentProgramInfo.NumVGPRsForWavesPerEU,
245 CurrentProgramInfo.NumSGPRsForWavesPerEU,
247 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
248 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
249 Context),
250 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
251
252 Streamer.popSection();
253}
254
256 Register RegNo = MI->getOperand(0).getReg();
257
260 OS << "implicit-def: "
261 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
262
263 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
264 OS << " : SGPR spill to VGPR lane";
265
266 OutStreamer->AddComment(OS.str());
267 OutStreamer->addBlankLine();
268}
269
273 return;
274 }
275
277 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
278 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
279 SmallString<128> SymbolName;
280 getNameWithPrefix(SymbolName, &MF->getFunction()),
282 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
283 }
284 if (DumpCodeInstEmitter) {
285 // Disassemble function name label to text.
286 DisasmLines.push_back(MF->getName().str() + ":");
287 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
288 HexLines.push_back("");
289 }
290
292}
293
295 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
296 // Write a line for the basic block label if it is not only fallthrough.
297 DisasmLines.push_back(
298 (Twine("BB") + Twine(getFunctionNumber())
299 + "_" + Twine(MBB.getNumber()) + ":").str());
300 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
301 HexLines.push_back("");
302 }
304}
305
308 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
310 Twine(GV->getName()) +
311 ": unsupported initializer for address space");
312 return;
313 }
314
315 // LDS variables aren't emitted in HSA or PAL yet.
317 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
318 return;
319
320 MCSymbol *GVSym = getSymbol(GV);
321
322 GVSym->redefineIfPossible();
323 if (GVSym->isDefined() || GVSym->isVariable())
324 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
325 "' is already defined");
326
327 const DataLayout &DL = GV->getDataLayout();
328 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
329 Align Alignment = GV->getAlign().value_or(Align(4));
330
331 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
332 emitLinkage(GV, GVSym);
333 auto TS = getTargetStreamer();
334 TS->emitAMDGPULDS(GVSym, Size, Alignment);
335 return;
336 }
337
339}
340
342 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
343
345 switch (CodeObjectVersion) {
347 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
348 break;
350 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
351 break;
353 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6());
354 break;
355 default:
356 report_fatal_error("Unexpected code object version");
357 }
358 }
360}
361
363 // Pad with s_code_end to help tools and guard against instruction prefetch
364 // causing stale data in caches. Arguably this should be done by the linker,
365 // which is why this isn't done for Mesa.
366 const MCSubtargetInfo &STI = *getGlobalSTI();
367 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
370 OutStreamer->switchSection(getObjFileLowering().getTextSection());
372 }
373
375}
376
377// Print comments that apply to both callable functions and entry points.
378void AMDGPUAsmPrinter::emitCommonFunctionComments(
379 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
380 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
381 const AMDGPUMachineFunction *MFI) {
382 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
383 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
384 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
385 if (NumAGPR) {
386 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
387 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
388 false);
389 }
390 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
391 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
392 false);
393}
394
// Renders an MCExpr as text. When the expression evaluates to an absolute
// value, that value is written as an unsigned 64-bit integer; otherwise the
// expression itself is printed using MAI. The resulting string is returned.
// NOTE(review): the declaration of `Str` is not visible in this listing
// (numbered line 396 is absent); presumably a local SmallString<128> matching
// the return type — confirm against the full file.
395SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
// OSS writes into Str.
397 raw_svector_ostream OSS(Str);
398 int64_t IVal;
399 if (Value->evaluateAsAbsolute(IVal)) {
// Resolved: print the value reinterpreted as unsigned.
400 OSS << static_cast<uint64_t>(IVal);
401 } else {
// Unresolved: fall back to printing the expression.
402 Value->print(OSS, MAI);
403 }
404 return Str;
405}
406
407void AMDGPUAsmPrinter::emitCommonFunctionComments(
408 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
409 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
410 const AMDGPUMachineFunction *MFI) {
411 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
412 OutStreamer->emitRawComment(" NumSgprs: " + getMCExprStr(NumSGPR), false);
413 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
414 if (NumAGPR && TotalNumVGPR) {
415 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
416 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
417 false);
418 }
419 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
420 false);
421 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
422 false);
423}
424
// Builds the kernel code properties value for MF as an MCExpr.
//
// Constant flag bits are accumulated in a 16-bit integer from the user SGPR
// usage info, then combined (bitwise OR) with
// CurrentProgramInfo.DynamicCallStack shifted left by
// KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT. The result is returned as an
// expression because DynamicCallStack may not be evaluatable yet.
//
// NOTE(review): two statements are not visible in this listing: the
// declaration of `MFI` (numbered line 427) and the `if` that guards the
// ENABLE_WAVEFRONT_SIZE32 bit (numbered line 460). Consult the full file
// before changing this function.
425const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
426 const MachineFunction &MF) const {
428 MCContext &Ctx = MF.getContext();
429 uint16_t KernelCodeProperties = 0;
430 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
431
// One flag bit per user SGPR input reported by UserSGPRInfo.
432 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
433 KernelCodeProperties |=
434 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
435 }
436 if (UserSGPRInfo.hasDispatchPtr()) {
437 KernelCodeProperties |=
438 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
439 }
// The queue pointer bit is only set for code object versions below V5.
440 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
441 KernelCodeProperties |=
442 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
443 }
444 if (UserSGPRInfo.hasKernargSegmentPtr()) {
445 KernelCodeProperties |=
446 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
447 }
448 if (UserSGPRInfo.hasDispatchID()) {
449 KernelCodeProperties |=
450 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
451 }
452 if (UserSGPRInfo.hasFlatScratchInit()) {
453 KernelCodeProperties |=
454 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
455 }
456 if (UserSGPRInfo.hasPrivateSegmentSize()) {
457 KernelCodeProperties |=
458 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
459 }
// NOTE(review): the condition for the wavefront-size-32 bit below is on a
// line that is missing from this listing.
461 KernelCodeProperties |=
462 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
463 }
464
465 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
466 // un-evaluatable at this point so it cannot be conditionally checked here.
467 // Instead, we'll directly shift the possibly unknown MCExpr into its place
468 // and bitwise-or it into KernelCodeProperties.
469 const MCExpr *KernelCodePropExpr =
470 MCConstantExpr::create(KernelCodeProperties, Ctx);
// OrValue first holds the shift amount, then the shifted DynamicCallStack.
471 const MCExpr *OrValue = MCConstantExpr::create(
472 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
473 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
474 OrValue, Ctx);
475 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
476
477 return KernelCodePropExpr;
478}
479
// Fills in and returns an MCKernelDescriptor for MF.
//
// Fields populated here: private_segment_fixed_size (PI.ScratchSize),
// kernarg_size (from STM.getKernArgSegmentSize), compute_pgm_rsrc1/2 (from
// PI), kernel_code_properties (getAmdhsaKernelCodeProperties),
// compute_pgm_rsrc3 and kernarg_preload.
//
// Note that compute_pgm_rsrc3 is read from the member CurrentProgramInfo
// rather than from the PI parameter.
//
// NOTE(review): the return-type line and the declarations of `STM` and `Info`
// are not visible in this listing (numbered lines 480, 483, 485 are absent),
// nor is the right-hand side of the group_segment_fixed_size assignment
// (numbered line 491). Consult the full file before editing.
481AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
482 const SIProgramInfo &PI) const {
484 const Function &F = MF.getFunction();
486 MCContext &Ctx = MF.getContext();
487
488 MCKernelDescriptor KernelDescriptor;
489
490 KernelDescriptor.group_segment_fixed_size =
492 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
493
// MaxKernArgAlign is only passed as an argument to getKernArgSegmentSize; it
// is not read afterwards in this function.
494 Align MaxKernArgAlign;
495 KernelDescriptor.kernarg_size = MCConstantExpr::create(
496 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
497
498 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
499 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
500 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
501
// Sanity check: on subtargets without GFX90A instructions, if the RSRC3
// expression can be evaluated, it must evaluate to zero. The (void) casts keep
// the variables referenced when the assert is compiled out.
502 int64_t PGRM_Rsrc3 = 1;
503 bool EvaluatableRsrc3 =
504 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
505 (void)PGRM_Rsrc3;
506 (void)EvaluatableRsrc3;
507 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
508 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
509 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
510
// The preloaded-SGPR count is recorded only when the subtarget has kernarg
// preload; otherwise the field is the constant 0.
511 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
512 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
513 Ctx);
514
515 return KernelDescriptor;
516}
517
519 // Init target streamer lazily on the first function so that previous passes
520 // can set metadata.
522 initTargetStreamer(*MF.getFunction().getParent());
523
524 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
525 CurrentProgramInfo.reset(MF);
526
528 MCContext &Ctx = MF.getContext();
529
530 // The starting address of all shader programs must be 256 bytes aligned.
531 // Regular functions just need the basic required instruction alignment.
532 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
533
535
538 // FIXME: This should be an explicit check for Mesa.
539 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
540 MCSectionELF *ConfigSection =
541 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
542 OutStreamer->switchSection(ConfigSection);
543 }
544
545 if (MFI->isModuleEntryFunction()) {
546 getSIProgramInfo(CurrentProgramInfo, MF);
547 }
548
549 if (STM.isAmdPalOS()) {
550 if (MFI->isEntryFunction())
551 EmitPALMetadata(MF, CurrentProgramInfo);
552 else if (MFI->isModuleEntryFunction())
553 emitPALFunctionMetadata(MF);
554 } else if (!STM.isAmdHsaOS()) {
555 EmitProgramInfoSI(MF, CurrentProgramInfo);
556 }
557
558 DumpCodeInstEmitter = nullptr;
559 if (STM.dumpCode()) {
560 // For -dumpcode, get the assembler out of the streamer. This only works
561 // with -filetype=obj.
562 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
563 if (Assembler)
564 DumpCodeInstEmitter = Assembler->getEmitterPtr();
565 }
566
567 DisasmLines.clear();
568 HexLines.clear();
570
572
573 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
574 STM.hasMAIInsts());
575
576 if (isVerbose()) {
577 MCSectionELF *CommentSection =
578 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
579 OutStreamer->switchSection(CommentSection);
580
581 if (!MFI->isEntryFunction()) {
582 OutStreamer->emitRawComment(" Function info:", false);
584 ResourceUsage->getResourceInfo(&MF.getFunction());
585 emitCommonFunctionComments(
586 Info.NumVGPR,
587 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
588 Info.getTotalNumVGPRs(STM),
589 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
590 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
591 return false;
592 }
593
594 OutStreamer->emitRawComment(" Kernel info:", false);
595 emitCommonFunctionComments(
596 CurrentProgramInfo.NumArchVGPR,
597 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
598 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
599 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
600
601 OutStreamer->emitRawComment(
602 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
603 OutStreamer->emitRawComment(
604 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
605 OutStreamer->emitRawComment(
606 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
607 " bytes/workgroup (compile time only)", false);
608
609 OutStreamer->emitRawComment(
610 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
611
612 OutStreamer->emitRawComment(
613 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
614
615 OutStreamer->emitRawComment(
616 " NumSGPRsForWavesPerEU: " +
617 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
618 false);
619 OutStreamer->emitRawComment(
620 " NumVGPRsForWavesPerEU: " +
621 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
622 false);
623
624 if (STM.hasGFX90AInsts()) {
625 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
626 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
627 AdjustedAccum = MCBinaryExpr::createMul(
628 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
629 OutStreamer->emitRawComment(
630 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
631 }
632
633 OutStreamer->emitRawComment(
634 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
635
636 OutStreamer->emitRawComment(
637 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
638
639 OutStreamer->emitRawComment(
640 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
641 getMCExprStr(CurrentProgramInfo.ScratchEnable),
642 false);
643 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
644 Twine(CurrentProgramInfo.UserSGPR),
645 false);
646 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
647 Twine(CurrentProgramInfo.TrapHandlerEnable),
648 false);
649 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
650 Twine(CurrentProgramInfo.TGIdXEnable),
651 false);
652 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
653 Twine(CurrentProgramInfo.TGIdYEnable),
654 false);
655 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
656 Twine(CurrentProgramInfo.TGIdZEnable),
657 false);
658 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
659 Twine(CurrentProgramInfo.TIdIGCompCount),
660 false);
661
662 [[maybe_unused]] int64_t PGMRSrc3;
663 assert(STM.hasGFX90AInsts() ||
664 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
665 PGMRSrc3) &&
666 static_cast<uint64_t>(PGMRSrc3) == 0));
667 if (STM.hasGFX90AInsts()) {
668 OutStreamer->emitRawComment(
669 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
670 getMCExprStr(MCKernelDescriptor::bits_get(
671 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
672 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
673 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
674 false);
675 OutStreamer->emitRawComment(
676 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
677 getMCExprStr(MCKernelDescriptor::bits_get(
678 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
679 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
680 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
681 false);
682 }
683 }
684
685 if (DumpCodeInstEmitter) {
686
687 OutStreamer->switchSection(
688 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
689
690 for (size_t i = 0; i < DisasmLines.size(); ++i) {
691 std::string Comment = "\n";
692 if (!HexLines[i].empty()) {
693 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
694 Comment += " ; " + HexLines[i] + "\n";
695 }
696
697 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
698 OutStreamer->emitBytes(StringRef(Comment));
699 }
700 }
701
702 return false;
703}
704
705// TODO: Fold this into emitFunctionBodyStart.
// Initializes the target streamer's target ID for module M, then refines its
// xnack and sramecc settings from the functions in the module: for each
// supported feature whose setting is still 'Any', the setting is taken from
// the function's subtarget target ID. Iteration stops as soon as every
// supported feature has an explicit On/Off setting.
//
// NOTE(review): two statements are not visible in this listing: the start of
// the call that ends at numbered line 710 (line 709) and the declaration of
// `STM` inside the loop (line 724). Consult the full file before editing.
706void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
707 // In the beginning all features are either 'Any' or 'NotSupported',
708 // depending on global target features. This will cover empty modules.
710 getGlobalSTI()->getFeatureString());
711
712 // If module is empty, we are done.
713 if (M.empty())
714 return;
715
716 // If module is not empty, need to find first 'Off' or 'On' feature
717 // setting per feature from functions in module.
718 for (auto &F : M) {
719 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Done once each feature is either unsupported or already On/Off.
720 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
721 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
722 break;
723
725 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only overwrite a setting that is still 'Any'; an earlier function's choice
// is kept.
726 if (TSTargetID->isXnackSupported())
727 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
728 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
729 if (TSTargetID->isSramEccSupported())
730 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
731 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
732 }
733}
734
// Returns the code size of MF in bytes, computed as the sum of
// TII->getInstSizeInBytes over every machine instruction in every basic
// block, skipping debug instructions.
// NOTE(review): the declaration of `STM` is not visible in this listing
// (numbered line 736 is absent) — consult the full file.
735uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
737 const SIInstrInfo *TII = STM.getInstrInfo();
738
739 uint64_t CodeSize = 0;
740
741 for (const MachineBasicBlock &MBB : MF) {
742 for (const MachineInstr &MI : MBB) {
743 // TODO: CodeSize should account for multiple functions.
744
745 // TODO: Should we count size of debug info?
// Debug instructions are excluded from the total.
746 if (MI.isDebugInstr())
747 continue;
748
749 CodeSize += TII->getInstSizeInBytes(MI);
750 }
751 }
752
753 return CodeSize;
754}
755
756void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
757 const MachineFunction &MF) {
759 ResourceUsage->getResourceInfo(&MF.getFunction());
761 MCContext &Ctx = MF.getContext();
762
763 auto CreateExpr = [&Ctx](int64_t Value) {
764 return MCConstantExpr::create(Value, Ctx);
765 };
766
767 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
768 int64_t Val;
769 if (Value->evaluateAsAbsolute(Val)) {
770 Res = Val;
771 return true;
772 }
773 return false;
774 };
775
776 ProgInfo.NumArchVGPR = CreateExpr(Info.NumVGPR);
777 ProgInfo.NumAccVGPR = CreateExpr(Info.NumAGPR);
778 ProgInfo.NumVGPR = CreateExpr(Info.getTotalNumVGPRs(STM));
779 ProgInfo.AccumOffset =
780 CreateExpr(alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1);
781 ProgInfo.TgSplit = STM.isTgSplitEnabled();
782 ProgInfo.NumSGPR = CreateExpr(Info.NumExplicitSGPR);
783 ProgInfo.ScratchSize = CreateExpr(Info.PrivateSegmentSize);
784 ProgInfo.VCCUsed = CreateExpr(Info.UsesVCC);
785 ProgInfo.FlatUsed = CreateExpr(Info.UsesFlatScratch);
786 ProgInfo.DynamicCallStack =
787 CreateExpr(Info.HasDynamicallySizedStack || Info.HasRecursion);
788
789 const uint64_t MaxScratchPerWorkitem =
791 uint64_t ScratchSize;
792 if (TryGetMCExprValue(ProgInfo.ScratchSize, ScratchSize) &&
793 ScratchSize > MaxScratchPerWorkitem) {
794 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(), ScratchSize,
795 MaxScratchPerWorkitem, DS_Error);
796 MF.getFunction().getContext().diagnose(DiagStackSize);
797 }
798
800
801 // The calculations related to SGPR/VGPR blocks are
802 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
803 // unified.
804 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
805 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
806 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
807
808 // Check the addressable register limit before we add ExtraSGPRs.
810 !STM.hasSGPRInitBug()) {
811 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
812 uint64_t NumSgpr;
813 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
814 NumSgpr > MaxAddressableNumSGPRs) {
815 // This can happen due to a compiler bug or when using inline asm.
818 MF.getFunction(), "addressable scalar registers", NumSgpr,
819 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
820 Ctx.diagnose(Diag);
821 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
822 }
823 }
824
825 // Account for extra SGPRs and VGPRs reserved for debugger use.
826 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
827
828 const Function &F = MF.getFunction();
829
830 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
831 // dispatch registers are function args.
832 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
833
834 if (isShader(F.getCallingConv())) {
835 bool IsPixelShader =
836 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
837
838 // Calculate the number of VGPR registers based on the SPI input registers
839 uint32_t InputEna = 0;
840 uint32_t InputAddr = 0;
841 unsigned LastEna = 0;
842
843 if (IsPixelShader) {
844 // Note for IsPixelShader:
845 // By this stage, all enabled inputs are tagged in InputAddr as well.
846 // We will use InputAddr to determine whether the input counts against the
847 // vgpr total and only use the InputEnable to determine the last input
848 // that is relevant - if extra arguments are used, then we have to honour
849 // the InputAddr for any intermediate non-enabled inputs.
850 InputEna = MFI->getPSInputEnable();
851 InputAddr = MFI->getPSInputAddr();
852
853 // We only need to consider input args up to the last used arg.
854 assert((InputEna || InputAddr) &&
855 "PSInputAddr and PSInputEnable should "
856 "never both be 0 for AMDGPU_PS shaders");
857 // There are some rare circumstances where InputAddr is non-zero and
858 // InputEna can be set to 0. In this case we default to setting LastEna
859 // to 1.
860 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
861 }
862
863 // FIXME: We should be using the number of registers determined during
864 // calling convention lowering to legalize the types.
865 const DataLayout &DL = F.getDataLayout();
866 unsigned PSArgCount = 0;
867 unsigned IntermediateVGPR = 0;
868 for (auto &Arg : F.args()) {
869 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
870 if (Arg.hasAttribute(Attribute::InReg)) {
871 WaveDispatchNumSGPR += NumRegs;
872 } else {
873 // If this is a PS shader and we're processing the PS Input args (first
874 // 16 VGPR), use the InputEna and InputAddr bits to define how many
875 // VGPRs are actually used.
876 // Any extra VGPR arguments are handled as normal arguments (and
877 // contribute to the VGPR count whether they're used or not).
878 if (IsPixelShader && PSArgCount < 16) {
879 if ((1 << PSArgCount) & InputAddr) {
880 if (PSArgCount < LastEna)
881 WaveDispatchNumVGPR += NumRegs;
882 else
883 IntermediateVGPR += NumRegs;
884 }
885 PSArgCount++;
886 } else {
887 // If there are extra arguments we have to include the allocation for
888 // the non-used (but enabled with InputAddr) input arguments
889 if (IntermediateVGPR) {
890 WaveDispatchNumVGPR += IntermediateVGPR;
891 IntermediateVGPR = 0;
892 }
893 WaveDispatchNumVGPR += NumRegs;
894 }
895 }
896 }
898 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
899
901 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
902
904 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
905 }
906
907 // Adjust number of registers used to meet default/requested minimum/maximum
908 // number of waves per execution unit request.
909 unsigned MaxWaves = MFI->getMaxWavesPerEU();
910 ProgInfo.NumSGPRsForWavesPerEU =
911 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
912 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
913 Ctx);
914 ProgInfo.NumVGPRsForWavesPerEU =
915 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
916 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
917 Ctx);
918
920 STM.hasSGPRInitBug()) {
921 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
922 uint64_t NumSgpr;
923 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
924 NumSgpr > MaxAddressableNumSGPRs) {
925 // This can happen due to a compiler bug or when using inline asm to use
926 // the registers which are usually reserved for vcc etc.
928 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
929 NumSgpr, MaxAddressableNumSGPRs,
931 Ctx.diagnose(Diag);
932 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
933 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
934 }
935 }
936
937 if (STM.hasSGPRInitBug()) {
938 ProgInfo.NumSGPR =
940 ProgInfo.NumSGPRsForWavesPerEU =
942 }
943
944 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
946 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
947 MFI->getNumUserSGPRs(),
949 Ctx.diagnose(Diag);
950 }
951
952 if (MFI->getLDSSize() >
953 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
956 MF.getFunction(), "local memory", MFI->getLDSSize(),
958 Ctx.diagnose(Diag);
959 }
960 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
961 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
962 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
963 unsigned Granule) {
964 const MCExpr *OneConst = CreateExpr(1ul);
965 const MCExpr *GranuleConst = CreateExpr(Granule);
966 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
967 const MCExpr *AlignToGPR =
968 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
969 const MCExpr *DivGPR =
970 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
971 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
972 return SubGPR;
973 };
974
975 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
977 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
979
980 const SIModeRegisterDefaults Mode = MFI->getMode();
981
982 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
983 // register.
984 ProgInfo.FloatMode = getFPMode(Mode);
985
986 ProgInfo.IEEEMode = Mode.IEEE;
987
988 // Make clamp modifier on NaN input returns 0.
989 ProgInfo.DX10Clamp = Mode.DX10Clamp;
990
991 unsigned LDSAlignShift;
993 // LDS is allocated in 64 dword blocks.
994 LDSAlignShift = 8;
995 } else {
996 // LDS is allocated in 128 dword blocks.
997 LDSAlignShift = 9;
998 }
999
1000 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1001 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1002
1003 ProgInfo.LDSSize = MFI->getLDSSize();
1004 ProgInfo.LDSBlocks =
1005 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1006
1007 // The MCExpr equivalent of divideCeil.
1008 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1009 const MCExpr *Ceil =
1010 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1011 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1012 };
1013
1014 // Scratch is allocated in 64-dword or 256-dword blocks.
1015 unsigned ScratchAlignShift =
1016 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1017 // We need to program the hardware with the amount of scratch memory that
1018 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1019 // scratch memory used per thread.
1020 ProgInfo.ScratchBlocks = DivideCeil(
1022 CreateExpr(STM.getWavefrontSize()), Ctx),
1023 CreateExpr(1ULL << ScratchAlignShift));
1024
1025 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1026 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1027 ProgInfo.MemOrdered = 1;
1028 }
1029
1030 // 0 = X, 1 = XY, 2 = XYZ
1031 unsigned TIDIGCompCnt = 0;
1032 if (MFI->hasWorkItemIDZ())
1033 TIDIGCompCnt = 2;
1034 else if (MFI->hasWorkItemIDY())
1035 TIDIGCompCnt = 1;
1036
1037 // The private segment wave byte offset is the last of the system SGPRs. We
1038 // initially assumed it was allocated, and may have used it. It shouldn't harm
1039 // anything to disable it if we know the stack isn't used here. We may still
1040 // have emitted code reading it to initialize scratch, but if that's unused
1041 // reading garbage should be OK.
1044 MCConstantExpr::create(0, Ctx), Ctx),
1045 ProgInfo.DynamicCallStack, Ctx);
1046
1047 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1048 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1049 ProgInfo.TrapHandlerEnable =
1050 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1051 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1052 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1053 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1054 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1055 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1056 ProgInfo.EXCPEnMSB = 0;
1057 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1058 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1059 ProgInfo.EXCPEnable = 0;
1060
1061 if (STM.hasGFX90AInsts()) {
1062 // return ((Dst & ~Mask) | (Value << Shift))
1063 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1064 uint32_t Shift) {
1065 auto Shft = MCConstantExpr::create(Shift, Ctx);
1066 auto Msk = MCConstantExpr::create(Mask, Ctx);
1067 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1069 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1070 return Dst;
1071 };
1072
1073 ProgInfo.ComputePGMRSrc3GFX90A =
1074 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1075 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1076 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1077 ProgInfo.ComputePGMRSrc3GFX90A =
1078 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1079 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1080 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1081 }
1082
1084 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1085 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1086
1087 const auto [MinWEU, MaxWEU] =
1088 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1089 uint64_t Occupancy;
1090 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1092 F, F.getSubprogram(),
1093 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1094 "'" +
1095 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1096 ", final occupancy is " + Twine(Occupancy));
1097 F.getContext().diagnose(Diag);
1098 }
1099}
1100
// Returns an unsigned value chosen by switching on the calling convention.
// NOTE(review): the case labels and return statements (listing lines
// 1104-1110) are missing from this extract; presumably each case returns the
// RSRC1 register identifier for one shader stage -- confirm against the full
// source file.
1101static unsigned getRsrcReg(CallingConv::ID CallConv) {
1102 switch (CallConv) {
// Calling conventions without their own case fall through into whichever case
// follows this label (not visible in this extract).
1103 default: [[fallthrough]];
1111 }
1112}
1113
// Writes the program-info values for MF to OutStreamer as 32-bit words:
// RSRC values taken from CurrentProgramInfo, a scratch-size word whose field
// width depends on the subtarget generation, and finally the spilled SGPR and
// VGPR counts, each preceded by its R_SPILLED_* identifier.
// NOTE(review): this extract drops several lines of the function (listing
// lines 1116, 1125, 1137-1138, 1143, 1146, 1174, 1193-1194, 1199 and 1201),
// presumably including the declaration of MFI and the conditions that open
// the two outer `if` statements. Consult the full source before editing.
1114void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1115 const SIProgramInfo &CurrentProgramInfo) {
1117 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1118 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1119 MCContext &Ctx = MF.getContext();
1120
// Builds an MCExpr for the masked-and-shifted field value below.
// NOTE(review): the first line of the return statement (listing line 1125) is
// missing from this extract.
1121 // (((Value) & Mask) << Shift)
1122 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1123 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1124 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1126 shft, Ctx);
1127 };
1128
// Emits Value as a Size-byte integer when it already evaluates to an absolute
// constant; otherwise emits the expression itself.
1129 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1130 int64_t Val;
1131 if (Value->evaluateAsAbsolute(Val))
1132 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1133 else
1134 OutStreamer->emitValue(Value, Size);
1135 };
1136
// First branch: emits the compute PGM_RSRC1 and PGM_RSRC2 values.
// NOTE(review): the `if` that opens this branch (listing lines 1137-1138) and
// the statements at listing lines 1143 and 1146 are missing from this extract.
1139
1140 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1141 /*Size=*/4);
1142
1144 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1145
1147
// ScratchBlocks is masked to 18 bits on GFX12 and later, 15 bits on GFX11 and
// 13 bits otherwise, then shifted left by 12.
1148 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1149 // appropriate generation.
1150 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1151 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1152 /*Mask=*/0x3FFFF, /*Shift=*/12),
1153 /*Size=*/4);
1154 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1155 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1156 /*Mask=*/0x7FFF, /*Shift=*/12),
1157 /*Size=*/4);
1158 } else {
1159 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1160 /*Mask=*/0x1FFF, /*Shift=*/12),
1161 /*Size=*/4);
1162 }
1163
1164 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1165 // 0" comment but I don't see a corresponding field in the register spec.
1166 } else {
// Second branch: the identifier returned by getRsrcReg() is emitted first,
// followed by a word that packs VGPRBlocks into bits [5:0] and SGPRBlocks
// (masked to 4 bits) into bits [9:6].
1167 OutStreamer->emitInt32(RsrcReg);
1168
1169 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1170 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1171 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1172 MF.getContext());
1173 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1175
// NOTE(review): this generation-dependent mask selection repeats the one in
// the first branch verbatim; a shared helper would remove the duplication.
1176 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1177 // appropriate generation.
1178 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1179 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1180 /*Mask=*/0x3FFFF, /*Shift=*/12),
1181 /*Size=*/4);
1182 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1183 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1184 /*Mask=*/0x7FFF, /*Shift=*/12),
1185 /*Size=*/4);
1186 } else {
1187 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1188 /*Mask=*/0x1FFF, /*Shift=*/12),
1189 /*Size=*/4);
1190 }
1191 }
1192
// NOTE(review): the condition guarding this section (listing lines 1193-1194)
// is missing from this extract. On GFX11 and later the LDS block count is
// halved, rounding up, before being encoded with S_00B02C_EXTRA_LDS_SIZE.
1195 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1196 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1197 : CurrentProgramInfo.LDSBlocks;
1198 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1200 OutStreamer->emitInt32(MFI->getPSInputEnable());
1202 OutStreamer->emitInt32(MFI->getPSInputAddr());
1203 }
1204
// The spill counts are emitted unconditionally, each after its identifier.
1205 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1206 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1207 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1208 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1209}
1210
1211// Helper function to add common PAL Metadata 3.0+
// Writes hardware-stage entries for calling convention CC into MD:
// ".ieee_mode" (only when ST.hasIEEEMode()), ".wgp_mode", ".mem_ordered",
// ".trap_present" and ".excp_en" (compute calling conventions only), and
// ".lds_size".
// NOTE(review): the first line of the signature (listing line 1212) is
// missing from this extract.
1213 const SIProgramInfo &CurrentProgramInfo,
1214 CallingConv::ID CC, const GCNSubtarget &ST) {
1215 if (ST.hasIEEEMode())
1216 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1217
1218 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1219 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1220
1221 if (AMDGPU::isCompute(CC)) {
1222 MD->setHwStage(CC, ".trap_present",
1223 (bool)CurrentProgramInfo.TrapHandlerEnable);
1224 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1225 }
1226
// LdsSize is scaled by the LDS dword granularity and by sizeof(uint32_t);
// presumably this converts an allocation-granule count to bytes -- confirm.
1227 MD->setHwStage(CC, ".lds_size",
1228 (unsigned)(CurrentProgramInfo.LdsSize *
1229 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1230}
1231
1232// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1233// is AMDPAL. It stores each compute/SPI register setting and other PAL
1234// metadata items into the PALMD::Metadata, combining with any provided by the
1235// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1236// is then written as a single block in the .note section.
// NOTE(review): this extract drops listing lines 1239, 1279 and 1286,
// presumably including the declaration of MFI and the condition that opens
// the ExtraLDSSize section near the end. Consult the full source before
// editing.
1237void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1238 const SIProgramInfo &CurrentProgramInfo) {
1240 auto CC = MF.getFunction().getCallingConv();
1241 auto MD = getTargetStreamer()->getPALMetadata();
1242 auto &Ctx = MF.getContext();
1243
1244 MD->setEntryPoint(CC, MF.getFunction().getName());
1245 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1246
1247 // Only set AGPRs for supported devices
1248 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1249 if (STM.hasMAIInsts()) {
1250 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1251 }
1252
1253 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata major versions below 3 receive RSRC1/RSRC2 register values;
// version 3 and later receives named hardware-stage fields instead.
1254 if (MD->getPALMajorVersion() < 3) {
1255 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1256 if (AMDGPU::isCompute(CC)) {
1257 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1258 } else {
// For non-compute calling conventions RSRC2 only carries the result of
// (ScratchBlocks > 0), placed in the field described by C_00B84C_SCRATCH_EN.
1259 const MCExpr *HasScratchBlocks =
1260 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1261 MCConstantExpr::create(0, Ctx), Ctx);
1262 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1263 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1264 }
1265 } else {
1266 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1267 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1268 CurrentProgramInfo.ScratchEnable);
1269 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1270 }
1271
1272 // ScratchSize is in bytes, 16 aligned.
1273 MD->setScratchSize(
1274 CC,
1275 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1276 MCConstantExpr::create(16, Ctx), Ctx),
1277 Ctx);
1278
// NOTE(review): the condition that opens this section (listing line 1279) is
// missing from this extract. On GFX11 and later the LDS block count is
// halved, rounding up.
1280 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1281 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1282 : CurrentProgramInfo.LDSBlocks;
1283 if (MD->getPALMajorVersion() < 3) {
// NOTE(review): the middle argument of this setRsrc2 call (listing line 1286)
// is missing from this extract.
1284 MD->setRsrc2(
1285 CC,
1287 Ctx);
1288 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1289 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1290 } else {
1291 // Graphics registers
// The extra-LDS granule is 256 dwords on GFX11 and later, 128 otherwise; the
// product with sizeof(uint32_t) is what gets stored.
1292 const unsigned ExtraLdsDwGranularity =
1293 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1294 MD->setGraphicsRegisters(
1295 ".ps_extra_lds_size",
1296 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1297
// The position of a name in PsInputFields is the bit index it reports: bit
// Idx of each input mask is stored as a boolean under the Idx-th field name.
1298 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1299 static StringLiteral const PsInputFields[] = {
1300 ".persp_sample_ena", ".persp_center_ena",
1301 ".persp_centroid_ena", ".persp_pull_model_ena",
1302 ".linear_sample_ena", ".linear_center_ena",
1303 ".linear_centroid_ena", ".line_stipple_tex_ena",
1304 ".pos_x_float_ena", ".pos_y_float_ena",
1305 ".pos_z_float_ena", ".pos_w_float_ena",
1306 ".front_face_ena", ".ancillary_ena",
1307 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1308 unsigned PSInputEna = MFI->getPSInputEnable();
1309 unsigned PSInputAddr = MFI->getPSInputAddr();
1310 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1311 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1312 (bool)((PSInputEna >> Idx) & 1));
1313 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1314 (bool)((PSInputAddr >> Idx) & 1));
1315 }
1316 }
1317 }
1318
1319 // For version 3 and above the wave front size is already set in the metadata
1320 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1321 MD->setWave32(MF.getFunction().getCallingConv());
1322}
1323
// Records per-function PAL metadata for MF, keyed by the function's name:
// its scratch size (the frame's stack size), then either RSRC1/RSRC2 values
// or the common hardware-stage fields, then LDS size and VGPR/SGPR counts.
// NOTE(review): listing lines 1329 and 1335 are missing from this extract,
// presumably including the declaration of ST. CurrentProgramInfo is not
// declared in the visible lines either; presumably it is a class member --
// confirm against the full source.
1324void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1325 auto *MD = getTargetStreamer()->getPALMetadata();
1326 const MachineFrameInfo &MFI = MF.getFrameInfo();
1327 StringRef FnName = MF.getFunction().getName();
1328 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1330 MCContext &Ctx = MF.getContext();
1331
// Both paths use CallingConv::AMDGPU_CS rather than MF's own calling
// convention.
1332 if (MD->getPALMajorVersion() < 3) {
1333 // Set compute registers
1334 MD->setRsrc1(
1336 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1337 MD->setRsrc2(CallingConv::AMDGPU_CS,
1338 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1339 } else {
1340 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1341 }
1342
1343 // Set optional info
1344 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1345 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1346 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1347}
1348
1349// This is supposed to be log2(Size)
// Maps a private element size of 4, 8 or 16 to the matching
// AMD_ELEMENT_*_BYTES enumerator; any other size reaches llvm_unreachable.
// NOTE(review): the signature line (listing line 1350) is missing from this
// extract.
1351 switch (Size) {
1352 case 4:
1353 return AMD_ELEMENT_4_BYTES;
1354 case 8:
1355 return AMD_ELEMENT_8_BYTES;
1356 case 16:
1357 return AMD_ELEMENT_16_BYTES;
1358 default:
1359 llvm_unreachable("invalid private_element_size");
1360 }
1361}
1362
// Fills Out for the kernel in MF: resets it with initDefault(), then copies
// resource values from CurrentProgramInfo and kernarg segment information
// from the subtarget. Asserts that the function uses the AMDGPU_KERNEL or
// SPIR_KERNEL calling convention.
// NOTE(review): this extract drops many lines of the function (listing lines
// 1370, 1376, 1378, 1380, 1384, 1389 and the statement guarded by each `if`
// below), so assignment targets and the code-property updates are not
// visible. Consult the full source before editing.
1363void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1364 const SIProgramInfo &CurrentProgramInfo,
1365 const MachineFunction &MF) const {
1366 const Function &F = MF.getFunction();
1367 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1368 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1369
1371 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1372 MCContext &Ctx = MF.getContext();
1373
1374 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1375
// NOTE(review): the left-hand sides of the next two expressions are missing
// from this extract.
1377 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1379 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1381
1382 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1383
1385 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1386
// One check per user SGPR feature follows; the guarded statements are not
// visible in this extract.
1387 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1388 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1390 }
1391
1392 if (UserSGPRInfo.hasDispatchPtr())
1394
// The queue pointer check is additionally limited to code object versions
// below AMDHSA_COV5.
1395 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1397
1398 if (UserSGPRInfo.hasKernargSegmentPtr())
1400
1401 if (UserSGPRInfo.hasDispatchID())
1403
1404 if (UserSGPRInfo.hasFlatScratchInit())
1406
1407 if (UserSGPRInfo.hasPrivateSegmentSize())
1409
// NOTE(review): same condition as the hasDispatchPtr() check a few lines
// above. The guarded statement is not visible here; confirm whether this
// second check is redundant.
1410 if (UserSGPRInfo.hasDispatchPtr())
1412
1413 if (STM.isXNACKEnabled())
1415
// getKernArgSegmentSize() also reports the maximum argument alignment through
// MaxKernArgAlign, which feeds kernarg_segment_alignment below.
1416 Align MaxKernArgAlign;
1417 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1418 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1419 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1420 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1421 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1422
1423 // kernarg_segment_alignment is specified as log of the alignment.
1424 // The minimum alignment is 16.
1425 // FIXME: The metadata treats the minimum as 4?
1426 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1427}
1428
1430 const char *ExtraCode, raw_ostream &O) {
// Prints operand OpNo of MI to O. Returns false once the operand has been
// printed and true when the modifier or the operand kind is not supported.
// NOTE(review): this extract drops listing lines 1429 (start of the
// signature), 1450-1451 (the register case body) and 1455 (the first
// immediate condition). Consult the full source before editing.
1431 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1432 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1433 return false;
1434
// Only single-character modifiers are considered, and of those only 'r' is
// accepted; it does not change how the operand is printed in the visible
// code.
1435 if (ExtraCode && ExtraCode[0]) {
1436 if (ExtraCode[1] != 0)
1437 return true; // Unknown modifier.
1438
1439 switch (ExtraCode[0]) {
1440 case 'r':
1441 break;
1442 default:
1443 return true;
1444 }
1445 }
1446
1447 // TODO: Should be able to support other operand types like globals.
1448 const MachineOperand &MO = MI->getOperand(OpNo);
1449 if (MO.isReg()) {
1452 return false;
1453 } else if (MO.isImm()) {
// Immediates are printed in decimal under the first (missing) condition;
// otherwise in hexadecimal, using the narrowest of 16, 32 or 64 bits that
// holds the value as an unsigned integer.
1454 int64_t Val = MO.getImm();
1456 O << Val;
1457 } else if (isUInt<16>(Val)) {
1458 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1459 } else if (isUInt<32>(Val)) {
1460 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1461 } else {
1462 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1463 }
1464 return false;
1465 }
// Operands that are neither registers nor immediates are reported as failures.
1466 return true;
1467}
1468
1473}
1474
// Emits one analysis remark per resource statistic of MF under the pass name
// "kernel-resource-usage", starting with the function name. Does nothing when
// no remark emitter (ORE) is available. The AGPR remark is only emitted when
// hasMAIInsts is true, and the LDS remark only when isModuleEntryFunction is
// true.
// NOTE(review): listing lines 1485-1486 (the condition of the second early
// return) and 1500 (an argument of the remark constructor) are missing from
// this extract.
1475void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1476 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1477 bool isModuleEntryFunction, bool hasMAIInsts) {
1478 if (!ORE)
1479 return;
1480
1481 const char *Name = "kernel-resource-usage";
1482 const char *Indent = " ";
1483
1484 // If the remark is not specifically enabled, do not output to yaml
1487 return;
1488
// Emits a single remark whose message is "<label>: <argument>"; Argument is
// attached as a named value keyed by RemarkName.
1489 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1490 StringRef RemarkLabel, auto Argument) {
1491 // Add an indent for every line besides the line with the kernel name. This
1492 // makes it easier to tell which resource usage go with which kernel since
1493 // the kernel name will always be displayed first.
1494 std::string LabelStr = RemarkLabel.str() + ": ";
1495 if (RemarkName != "FunctionName")
1496 LabelStr = Indent + LabelStr;
1497
1498 ORE->emit([&]() {
1499 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1501 &MF.front())
1502 << LabelStr << ore::NV(RemarkName, Argument);
1503 });
1504 };
1505
1506 // FIXME: Formatting here is pretty nasty because clang does not accept
1507 // newlines from diagnostics. This forces us to emit multiple diagnostic
1508 // remarks to simulate newlines. If and when clang does accept newlines, this
1509 // formatting should be aggregated into one remark with newlines to avoid
1510 // printing multiple diagnostic location and diag opts.
1511 EmitResourceUsageRemark("FunctionName", "Function Name",
1512 MF.getFunction().getName());
1513 EmitResourceUsageRemark("NumSGPR", "SGPRs",
1514 getMCExprStr(CurrentProgramInfo.NumSGPR));
// The "NumVGPR" remark reports NumArchVGPR; accumulation registers get their
// own "NumAGPR" remark below.
1515 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1516 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1517 if (hasMAIInsts) {
1518 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1519 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1520 }
1521 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1522 getMCExprStr(CurrentProgramInfo.ScratchSize));
// "True" is reported only when the expression evaluates to a non-zero
// constant; an expression that cannot be evaluated is reported as "False".
// DynStack is read only after a successful evaluation.
1523 int64_t DynStack;
1524 bool DynStackEvaluatable =
1525 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1526 StringRef DynamicStackStr =
1527 DynStackEvaluatable && DynStack ? "True" : "False";
1528 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1529 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1530 getMCExprStr(CurrentProgramInfo.Occupancy));
1531 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1532 CurrentProgramInfo.SGPRSpill);
1533 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1534 CurrentProgramInfo.VGPRSpill);
1535 if (isModuleEntryFunction)
1536 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1537 CurrentProgramInfo.LDSSize);
1538}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*----===//
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1046
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1184
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1166
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1082
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1158
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1117
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1179
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1069
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1068
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1077
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1116
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1055
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1177
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1119
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1198
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1165
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1176
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1060
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1199
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1054
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1079
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1053
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:69
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:83
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:85
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:399
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:706
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:728
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:88
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:91
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:103
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:450
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:661
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:441
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:395
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:115
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:95
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:100
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:266
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:701
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1830
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:814
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:262
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:605
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:609
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:298
bool dumpCode() const
Definition: GCNSubtarget.h:509
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:597
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:961
Generation getGeneration() const
Definition: GCNSubtarget.h:313
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:317
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:290
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:124
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:326
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:536
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:601
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:571
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:591
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:556
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:546
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:606
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Context object for machine code objects.
Definition: MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:416
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1070
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:35
MCSection * getReadOnlySection() const
MCContext & getContext() const
This represents a section on Linux, lots of Unix variants and some bare metal systems.
Definition: MCSectionELF.h:27
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:162
MCContext & getContext() const
Definition: MCStreamer.h:304
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:102
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:466
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:846
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:223
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
Definition: AsmWriter.cpp:5022
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1067
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1340
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:431
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1849
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.