LLVM 19.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
26#include "R600AsmPrinter.h"
35#include "llvm/MC/MCAssembler.h"
36#include "llvm/MC/MCContext.h"
38#include "llvm/MC/MCStreamer.h"
44
45using namespace llvm;
46using namespace llvm::AMDGPU;
47
48// This should get the default rounding mode from the kernel. We just set the
49// default here, but this could change if the OpenCL rounding mode pragmas are
50// used.
51//
52// The denormal mode here should match what is reported by the OpenCL runtime
53// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55//
56// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57// precision, and leaves single precision to flush all and does not report
58// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59// CL_FP_DENORM for both.
60//
61// FIXME: It seems some instructions do not support single precision denormals
62// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63// and sin_f32, cos_f32 on most parts).
64
65// We want to use these instructions, and using fp32 denormals also causes
66// instructions to run at the double precision rate for the device so it's
67// probably best to just report no single precision denormals.
71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73}
74
75static AsmPrinter *
77 std::unique_ptr<MCStreamer> &&Streamer) {
78 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79}
80
86}
87
89 std::unique_ptr<MCStreamer> Streamer)
90 : AsmPrinter(TM, std::move(Streamer)) {
91 assert(OutStreamer && "AsmPrinter constructed without streamer");
92}
93
95 return "AMDGPU Assembly Printer";
96}
97
99 return TM.getMCSubtargetInfo();
100}
101
103 if (!OutStreamer)
104 return nullptr;
105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106}
107
110}
111
112void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
114
115 // TODO: Which one is called first, emitStartOfAsmFile or
116 // emitFunctionBodyStart?
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118 initializeTargetID(M);
119
122 return;
123
125
128 CodeObjectVersion);
129 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
130 }
131
134}
135
137 // Init target streamer if it has not yet happened
139 initTargetStreamer(M);
140
143
144 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
145 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
147 HSAMetadataStream->end();
148 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
149 (void)Success;
150 assert(Success && "Malformed HSA Metadata");
151 }
152}
153
156 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
157 const Function &F = MF->getFunction();
158
159 // TODO: We're checking this late, would be nice to check it earlier.
160 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
162 STM.getCPU() + " is only available on code object version 6 or better",
163 /*gen_crash_diag*/ false);
164 }
165
166 // TODO: Which one is called first, emitStartOfAsmFile or
167 // emitFunctionBodyStart?
168 if (!getTargetStreamer()->getTargetID())
169 initializeTargetID(*F.getParent());
170
171 const auto &FunctionTargetID = STM.getTargetID();
172 // Make sure function's xnack settings are compatible with module's
173 // xnack settings.
174 if (FunctionTargetID.isXnackSupported() &&
175 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
176 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
177 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
178 "' function does not match module xnack setting");
179 return;
180 }
181 // Make sure function's sramecc settings are compatible with module's
182 // sramecc settings.
183 if (FunctionTargetID.isSramEccSupported() &&
184 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
185 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
186 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
187 "' function does not match module sramecc setting");
188 return;
189 }
190
191 if (!MFI.isEntryFunction())
192 return;
193
194 if (STM.isMesaKernel(F) &&
195 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
196 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
197 amd_kernel_code_t KernelCode;
198 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
200 }
201
202 if (STM.isAmdHsaOS())
203 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
204
205 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
208 STM.isAmdHsaOS());
209 }
210}
211
214 if (!MFI.isEntryFunction())
215 return;
216
218 return;
219
220 auto &Streamer = getTargetStreamer()->getStreamer();
221 auto &Context = Streamer.getContext();
222 auto &ObjectFileInfo = *Context.getObjectFileInfo();
223 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
224
225 Streamer.pushSection();
226 Streamer.switchSection(&ReadOnlySection);
227
228 // CP microcode requires the kernel descriptor to be allocated on 64 byte
229 // alignment.
230 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
231 ReadOnlySection.ensureMinAlignment(Align(64));
232
233 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
234
235 SmallString<128> KernelName;
236 getNameWithPrefix(KernelName, &MF->getFunction());
238 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
239 CurrentProgramInfo.NumVGPRsForWavesPerEU,
240 CurrentProgramInfo.NumSGPRsForWavesPerEU -
242 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
243 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
244 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
245
246 Streamer.popSection();
247}
248
250 Register RegNo = MI->getOperand(0).getReg();
251
254 OS << "implicit-def: "
255 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
256
257 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
258 OS << " : SGPR spill to VGPR lane";
259
260 OutStreamer->AddComment(OS.str());
261 OutStreamer->addBlankLine();
262}
263
267 return;
268 }
269
271 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
272 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
273 SmallString<128> SymbolName;
274 getNameWithPrefix(SymbolName, &MF->getFunction()),
276 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
277 }
278 if (DumpCodeInstEmitter) {
279 // Disassemble function name label to text.
280 DisasmLines.push_back(MF->getName().str() + ":");
281 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
282 HexLines.push_back("");
283 }
284
286}
287
289 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
290 // Write a line for the basic block label if it is not only fallthrough.
291 DisasmLines.push_back(
292 (Twine("BB") + Twine(getFunctionNumber())
293 + "_" + Twine(MBB.getNumber()) + ":").str());
294 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
295 HexLines.push_back("");
296 }
298}
299
302 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
304 Twine(GV->getName()) +
305 ": unsupported initializer for address space");
306 return;
307 }
308
309 // LDS variables aren't emitted in HSA or PAL yet.
311 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
312 return;
313
314 MCSymbol *GVSym = getSymbol(GV);
315
316 GVSym->redefineIfPossible();
317 if (GVSym->isDefined() || GVSym->isVariable())
318 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
319 "' is already defined");
320
321 const DataLayout &DL = GV->getParent()->getDataLayout();
322 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
323 Align Alignment = GV->getAlign().value_or(Align(4));
324
325 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
326 emitLinkage(GV, GVSym);
327 auto TS = getTargetStreamer();
328 TS->emitAMDGPULDS(GVSym, Size, Alignment);
329 return;
330 }
331
333}
334
336 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
337
339 switch (CodeObjectVersion) {
341 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
342 break;
344 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
345 break;
347 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6());
348 break;
349 default:
350 report_fatal_error("Unexpected code object version");
351 }
352 }
354}
355
357 // Pad with s_code_end to help tools and guard against instruction prefetch
358 // causing stale data in caches. Arguably this should be done by the linker,
359 // which is why this isn't done for Mesa.
360 const MCSubtargetInfo &STI = *getGlobalSTI();
361 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
364 OutStreamer->switchSection(getObjFileLowering().getTextSection());
366 }
367
369}
370
371// Print comments that apply to both callable functions and entry points.
372void AMDGPUAsmPrinter::emitCommonFunctionComments(
373 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
374 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
375 const AMDGPUMachineFunction *MFI) {
376 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
377 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
378 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
379 if (NumAGPR) {
380 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
381 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
382 false);
383 }
384 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
385 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
386 false);
387}
388
/// Build the kernel code properties bit mask for \p MF.
///
/// Starts from 0 and ORs in one amdhsa::KERNEL_CODE_PROPERTY_* flag for each
/// condition below that holds.
///
/// \returns the accumulated 16-bit mask.
389uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
390 const MachineFunction &MF) const {
392 uint16_t KernelCodeProperties = 0;
393 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
394
  // Each user SGPR that UserSGPRInfo reports as present sets the matching
  // ENABLE_SGPR_* flag.
395 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
396 KernelCodeProperties |=
397 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
398 }
399 if (UserSGPRInfo.hasDispatchPtr()) {
400 KernelCodeProperties |=
401 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
402 }
  // The queue pointer flag is additionally restricted to code object versions
  // below AMDHSA_COV5.
403 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
404 KernelCodeProperties |=
405 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
406 }
407 if (UserSGPRInfo.hasKernargSegmentPtr()) {
408 KernelCodeProperties |=
409 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
410 }
411 if (UserSGPRInfo.hasDispatchID()) {
412 KernelCodeProperties |=
413 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
414 }
415 if (UserSGPRInfo.hasFlatScratchInit()) {
416 KernelCodeProperties |=
417 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
418 }
420 KernelCodeProperties |=
421 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
422 }
423
  // The dynamic-stack flag is only set from AMDHSA_COV5 onwards, and only
  // when the current program info records a dynamic call stack.
424 if (CurrentProgramInfo.DynamicCallStack &&
425 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
426 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
427
428 return KernelCodeProperties;
429}
430
/// Build the amdhsa::kernel_descriptor_t for \p MF from the program info
/// \p PI.
///
/// The descriptor is zero-filled first. The segment sizes, kernarg size,
/// compute_pgm_rsrc1/2 and kernel_code_properties are then assigned, followed
/// by compute_pgm_rsrc3 (GFX90A instructions only) and kernarg_preload.
///
/// \returns the populated descriptor by value.
431amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
432 const MachineFunction &MF,
433 const SIProgramInfo &PI) const {
435 const Function &F = MF.getFunction();
437
  // Start from an all-zero descriptor so that any field not assigned below
  // stays 0.
438 amdhsa::kernel_descriptor_t KernelDescriptor;
439 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
440
  // Sanity-check that these values fit in 32 bits before storing them.
441 assert(isUInt<32>(PI.ScratchSize));
442 assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
443 assert(isUInt<32>(PI.getComputePGMRSrc2()));
444
445 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
446 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
447
  // MaxKernArgAlign is only handed to getKernArgSegmentSize; it is not read
  // again in this function.
448 Align MaxKernArgAlign;
449 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
450
451 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM);
452 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
453 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
454
  // compute_pgm_rsrc3 is written only when the subtarget has GFX90A
  // instructions; otherwise the assert requires the stored value to be 0.
  // NOTE(review): this reads the CurrentProgramInfo member rather than the PI
  // parameter used everywhere else here - confirm both always refer to the
  // same program info.
455 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
456 if (STM.hasGFX90AInsts())
457 KernelDescriptor.compute_pgm_rsrc3 =
458 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
459
  // The preloaded kernarg SGPR count is narrowed to 16 bits for the
  // descriptor field.
461 KernelDescriptor.kernarg_preload =
462 static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
463
464 return KernelDescriptor;
465}
466
468 // Init target streamer lazily on the first function so that previous passes
469 // can set metadata.
471 initTargetStreamer(*MF.getFunction().getParent());
472
473 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
474 CurrentProgramInfo = SIProgramInfo();
475
477
478 // The starting address of all shader programs must be 256 bytes aligned.
479 // Regular functions just need the basic required instruction alignment.
480 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
481
483
486 // FIXME: This should be an explicit check for Mesa.
487 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
488 MCSectionELF *ConfigSection =
489 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
490 OutStreamer->switchSection(ConfigSection);
491 }
492
493 if (MFI->isModuleEntryFunction()) {
494 getSIProgramInfo(CurrentProgramInfo, MF);
495 }
496
497 if (STM.isAmdPalOS()) {
498 if (MFI->isEntryFunction())
499 EmitPALMetadata(MF, CurrentProgramInfo);
500 else if (MFI->isModuleEntryFunction())
501 emitPALFunctionMetadata(MF);
502 } else if (!STM.isAmdHsaOS()) {
503 EmitProgramInfoSI(MF, CurrentProgramInfo);
504 }
505
506 DumpCodeInstEmitter = nullptr;
507 if (STM.dumpCode()) {
508 // For -dumpcode, get the assembler out of the streamer, even if it does
509 // not really want to let us have it. This only works with -filetype=obj.
510 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
511 OutStreamer->setUseAssemblerInfoForParsing(true);
512 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
513 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
514 if (Assembler)
515 DumpCodeInstEmitter = Assembler->getEmitterPtr();
516 }
517
518 DisasmLines.clear();
519 HexLines.clear();
521
523
524 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
525 STM.hasMAIInsts());
526
527 if (isVerbose()) {
528 MCSectionELF *CommentSection =
529 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
530 OutStreamer->switchSection(CommentSection);
531
532 if (!MFI->isEntryFunction()) {
533 OutStreamer->emitRawComment(" Function info:", false);
535 ResourceUsage->getResourceInfo(&MF.getFunction());
536 emitCommonFunctionComments(
537 Info.NumVGPR,
538 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
539 Info.getTotalNumVGPRs(STM),
540 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
541 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
542 return false;
543 }
544
545 OutStreamer->emitRawComment(" Kernel info:", false);
546 emitCommonFunctionComments(
547 CurrentProgramInfo.NumArchVGPR,
548 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
549 : std::optional<uint32_t>(),
550 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
551 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
552
553 OutStreamer->emitRawComment(
554 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
555 OutStreamer->emitRawComment(
556 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
557 OutStreamer->emitRawComment(
558 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
559 " bytes/workgroup (compile time only)", false);
560
561 OutStreamer->emitRawComment(
562 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
563 OutStreamer->emitRawComment(
564 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
565
566 OutStreamer->emitRawComment(
567 " NumSGPRsForWavesPerEU: " +
568 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
569 OutStreamer->emitRawComment(
570 " NumVGPRsForWavesPerEU: " +
571 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
572
573 if (STM.hasGFX90AInsts())
574 OutStreamer->emitRawComment(
575 " AccumOffset: " +
576 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
577
578 OutStreamer->emitRawComment(
579 " Occupancy: " +
580 Twine(CurrentProgramInfo.Occupancy), false);
581
582 OutStreamer->emitRawComment(
583 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
584
585 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
586 Twine(CurrentProgramInfo.ScratchEnable),
587 false);
588 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
589 Twine(CurrentProgramInfo.UserSGPR),
590 false);
591 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
592 Twine(CurrentProgramInfo.TrapHandlerEnable),
593 false);
594 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
595 Twine(CurrentProgramInfo.TGIdXEnable),
596 false);
597 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
598 Twine(CurrentProgramInfo.TGIdYEnable),
599 false);
600 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
601 Twine(CurrentProgramInfo.TGIdZEnable),
602 false);
603 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
604 Twine(CurrentProgramInfo.TIdIGCompCount),
605 false);
606
607 assert(STM.hasGFX90AInsts() ||
608 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
609 if (STM.hasGFX90AInsts()) {
610 OutStreamer->emitRawComment(
611 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
612 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
613 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
614 false);
615 OutStreamer->emitRawComment(
616 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
617 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
618 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
619 false);
620 }
621 }
622
623 if (DumpCodeInstEmitter) {
624
625 OutStreamer->switchSection(
626 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
627
628 for (size_t i = 0; i < DisasmLines.size(); ++i) {
629 std::string Comment = "\n";
630 if (!HexLines[i].empty()) {
631 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
632 Comment += " ; " + HexLines[i] + "\n";
633 }
634
635 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
636 OutStreamer->emitBytes(StringRef(Comment));
637 }
638 }
639
640 return false;
641}
642
643// TODO: Fold this into emitFunctionBodyStart.
/// Resolve the target streamer's xnack and sramecc settings for module \p M.
///
/// For a non-empty module, the functions are scanned in order. A supported
/// feature whose setting is still 'Any' takes the setting from STMTargetID.
/// Scanning stops once each of the two features is either unsupported or
/// already On/Off.
644void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
645 // In the beginning all features are either 'Any' or 'NotSupported',
646 // depending on global target features. This will cover empty modules.
648 getGlobalSTI()->getFeatureString());
649
650 // If module is empty, we are done.
651 if (M.empty())
652 return;
653
654 // If module is not empty, need to find first 'Off' or 'On' feature
655 // setting per feature from functions in module.
656 for (auto &F : M) {
657 auto &TSTargetID = getTargetStreamer()->getTargetID();
  // Nothing left to resolve: xnack and sramecc are each either unsupported or
  // already fixed to On/Off, so the remaining functions need not be visited.
658 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
659 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
660 break;
661
  // NOTE(review): STM is presumably the subtarget of F - confirm.
663 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
  // Only a setting that is still 'Any' is overwritten, so the first function
  // that supplies a setting for a feature decides it.
664 if (TSTargetID->isXnackSupported())
665 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
666 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
667 if (TSTargetID->isSramEccSupported())
668 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
669 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
670 }
671}
672
/// Compute the code size of \p MF in bytes.
///
/// Sums TII->getInstSizeInBytes(MI) over every instruction of every basic
/// block in \p MF, skipping debug instructions.
///
/// \returns the accumulated size.
673uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
675 const SIInstrInfo *TII = STM.getInstrInfo();
676
677 uint64_t CodeSize = 0;
678
679 for (const MachineBasicBlock &MBB : MF) {
680 for (const MachineInstr &MI : MBB) {
681 // TODO: CodeSize should account for multiple functions.
682
683 // TODO: Should we count size of debug info?
  // Debug instructions contribute nothing to the total.
684 if (MI.isDebugInstr())
685 continue;
686
687 CodeSize += TII->getInstSizeInBytes(MI);
688 }
689 }
690
691 return CodeSize;
692}
693
694void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
695 const MachineFunction &MF) {
697 ResourceUsage->getResourceInfo(&MF.getFunction());
699
700 ProgInfo.NumArchVGPR = Info.NumVGPR;
701 ProgInfo.NumAccVGPR = Info.NumAGPR;
702 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
703 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
704 ProgInfo.TgSplit = STM.isTgSplitEnabled();
705 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
706 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
707 ProgInfo.VCCUsed = Info.UsesVCC;
708 ProgInfo.FlatUsed = Info.UsesFlatScratch;
709 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
710
711 const uint64_t MaxScratchPerWorkitem =
713 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
714 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
715 ProgInfo.ScratchSize,
716 MaxScratchPerWorkitem, DS_Error);
717 MF.getFunction().getContext().diagnose(DiagStackSize);
718 }
719
721
722 // The calculations related to SGPR/VGPR blocks are
723 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
724 // unified.
725 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
726 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
727 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
728
729 // Check the addressable register limit before we add ExtraSGPRs.
731 !STM.hasSGPRInitBug()) {
732 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
733 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
734 // This can happen due to a compiler bug or when using inline asm.
737 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
738 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
739 Ctx.diagnose(Diag);
740 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
741 }
742 }
743
744 // Account for extra SGPRs and VGPRs reserved for debugger use.
745 ProgInfo.NumSGPR += ExtraSGPRs;
746
747 const Function &F = MF.getFunction();
748
749 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
750 // dispatch registers are function args.
751 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
752
753 if (isShader(F.getCallingConv())) {
754 bool IsPixelShader =
755 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
756
757 // Calculate the number of VGPR registers based on the SPI input registers
758 uint32_t InputEna = 0;
759 uint32_t InputAddr = 0;
760 unsigned LastEna = 0;
761
762 if (IsPixelShader) {
763 // Note for IsPixelShader:
764 // By this stage, all enabled inputs are tagged in InputAddr as well.
765 // We will use InputAddr to determine whether the input counts against the
766 // vgpr total and only use the InputEnable to determine the last input
767 // that is relevant - if extra arguments are used, then we have to honour
768 // the InputAddr for any intermediate non-enabled inputs.
769 InputEna = MFI->getPSInputEnable();
770 InputAddr = MFI->getPSInputAddr();
771
772 // We only need to consider input args up to the last used arg.
773 assert((InputEna || InputAddr) &&
774 "PSInputAddr and PSInputEnable should "
775 "never both be 0 for AMDGPU_PS shaders");
776 // There are some rare circumstances where InputAddr is non-zero and
777 // InputEna can be set to 0. In this case we default to setting LastEna
778 // to 1.
779 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
780 }
781
782 // FIXME: We should be using the number of registers determined during
783 // calling convention lowering to legalize the types.
784 const DataLayout &DL = F.getParent()->getDataLayout();
785 unsigned PSArgCount = 0;
786 unsigned IntermediateVGPR = 0;
787 for (auto &Arg : F.args()) {
788 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
789 if (Arg.hasAttribute(Attribute::InReg)) {
790 WaveDispatchNumSGPR += NumRegs;
791 } else {
792 // If this is a PS shader and we're processing the PS Input args (first
793 // 16 VGPR), use the InputEna and InputAddr bits to define how many
794 // VGPRs are actually used.
795 // Any extra VGPR arguments are handled as normal arguments (and
796 // contribute to the VGPR count whether they're used or not).
797 if (IsPixelShader && PSArgCount < 16) {
798 if ((1 << PSArgCount) & InputAddr) {
799 if (PSArgCount < LastEna)
800 WaveDispatchNumVGPR += NumRegs;
801 else
802 IntermediateVGPR += NumRegs;
803 }
804 PSArgCount++;
805 } else {
806 // If there are extra arguments we have to include the allocation for
807 // the non-used (but enabled with InputAddr) input arguments
808 if (IntermediateVGPR) {
809 WaveDispatchNumVGPR += IntermediateVGPR;
810 IntermediateVGPR = 0;
811 }
812 WaveDispatchNumVGPR += NumRegs;
813 }
814 }
815 }
816 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
817 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
818 ProgInfo.NumVGPR =
819 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
820 }
821
822 // Adjust number of registers used to meet default/requested minimum/maximum
823 // number of waves per execution unit request.
824 ProgInfo.NumSGPRsForWavesPerEU = std::max(
825 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
826 ProgInfo.NumVGPRsForWavesPerEU = std::max(
827 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
828
830 STM.hasSGPRInitBug()) {
831 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
832 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
833 // This can happen due to a compiler bug or when using inline asm to use
834 // the registers which are usually reserved for vcc etc.
836 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
837 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
839 Ctx.diagnose(Diag);
840 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
841 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
842 }
843 }
844
845 if (STM.hasSGPRInitBug()) {
846 ProgInfo.NumSGPR =
848 ProgInfo.NumSGPRsForWavesPerEU =
850 }
851
852 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
854 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
855 MFI->getNumUserSGPRs(),
857 Ctx.diagnose(Diag);
858 }
859
860 if (MFI->getLDSSize() >
861 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
864 MF.getFunction(), "local memory", MFI->getLDSSize(),
866 Ctx.diagnose(Diag);
867 }
868
870 &STM, ProgInfo.NumSGPRsForWavesPerEU);
871 ProgInfo.VGPRBlocks =
873
874 const SIModeRegisterDefaults Mode = MFI->getMode();
875
876 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
877 // register.
878 ProgInfo.FloatMode = getFPMode(Mode);
879
880 ProgInfo.IEEEMode = Mode.IEEE;
881
882 // Make the clamp modifier return 0 on NaN input.
883 ProgInfo.DX10Clamp = Mode.DX10Clamp;
884
885 unsigned LDSAlignShift;
887 // LDS is allocated in 64 dword blocks.
888 LDSAlignShift = 8;
889 } else {
890 // LDS is allocated in 128 dword blocks.
891 LDSAlignShift = 9;
892 }
893
894 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
895 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
896
897 ProgInfo.LDSSize = MFI->getLDSSize();
898 ProgInfo.LDSBlocks =
899 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
900
901 // Scratch is allocated in 64-dword or 256-dword blocks.
902 unsigned ScratchAlignShift =
903 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
904 // We need to program the hardware with the amount of scratch memory that
905 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
906 // scratch memory used per thread.
907 ProgInfo.ScratchBlocks = divideCeil(
908 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
909
910 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
911 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
912 ProgInfo.MemOrdered = 1;
913 }
914
915 // 0 = X, 1 = XY, 2 = XYZ
916 unsigned TIDIGCompCnt = 0;
917 if (MFI->hasWorkItemIDZ())
918 TIDIGCompCnt = 2;
919 else if (MFI->hasWorkItemIDY())
920 TIDIGCompCnt = 1;
921
922 // The private segment wave byte offset is the last of the system SGPRs. We
923 // initially assumed it was allocated, and may have used it. It shouldn't harm
924 // anything to disable it if we know the stack isn't used here. We may still
925 // have emitted code reading it to initialize scratch, but if that's unused
926 // reading garbage should be OK.
927 ProgInfo.ScratchEnable =
928 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
929 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
930 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
931 ProgInfo.TrapHandlerEnable =
932 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
933 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
934 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
935 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
936 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
937 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
938 ProgInfo.EXCPEnMSB = 0;
939 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
940 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
941 ProgInfo.EXCPEnable = 0;
942
943 if (STM.hasGFX90AInsts()) {
945 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
946 ProgInfo.AccumOffset);
948 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
949 ProgInfo.TgSplit);
950 }
951
952 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
953 ProgInfo.NumSGPRsForWavesPerEU,
954 ProgInfo.NumVGPRsForWavesPerEU);
955 const auto [MinWEU, MaxWEU] =
956 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
957 if (ProgInfo.Occupancy < MinWEU) {
959 F, F.getSubprogram(),
960 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
961 "'" +
962 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
963 ", final occupancy is " + Twine(ProgInfo.Occupancy));
964 F.getContext().diagnose(Diag);
965 }
966}
967
968static unsigned getRsrcReg(CallingConv::ID CallConv) {
969 switch (CallConv) {
970 default: [[fallthrough]];
978 }
979}
980
// Writes the program-info values for MF to OutStreamer as a sequence of
// 32-bit integers, taken from CurrentProgramInfo and the machine function
// info.
// NOTE(review): this listing omits several numbered lines (983-984, 987-988,
// 992, 995, 1009, 1018-1019, 1024, 1026), so the declarations of STM and MFI,
// the branch conditions and some emitted values are not visible here. Confirm
// against the full source.
981void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
982 const SIProgramInfo &CurrentProgramInfo) {
985 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
986
989
990 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
991
993 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
994
// The wave-size encoding macro depends on the subtarget generation: GFX12 and
// later, exactly GFX11, or anything earlier.
996 OutStreamer->emitInt32(
997 STM.getGeneration() >= AMDGPUSubtarget::GFX12
998 ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
999 : STM.getGeneration() == AMDGPUSubtarget::GFX11
1000 ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
1001 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1002
1003 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1004 // 0" comment but I don't see a corresponding field in the register spec.
1005 } else {
// Other branch: emit the register chosen by getRsrcReg, then the VGPR and
// SGPR block counts packed into one 4-byte value.
1006 OutStreamer->emitInt32(RsrcReg);
1007 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
1008 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
1010 OutStreamer->emitInt32(
1011 STM.getGeneration() >= AMDGPUSubtarget::GFX12
1012 ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
1013 : STM.getGeneration() == AMDGPUSubtarget::GFX11
1014 ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
1015 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1016 }
1017
// On GFX11 and later the LDS block count is halved, rounding up, before it is
// encoded as the extra LDS size.
1020 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1021 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1022 : CurrentProgramInfo.LDSBlocks;
1023 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1025 OutStreamer->emitInt32(MFI->getPSInputEnable());
1027 OutStreamer->emitInt32(MFI->getPSInputAddr());
1028 }
1029
// Spill statistics: each count is preceded by its R_SPILLED_* tag.
1030 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1031 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1032 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1033 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1034}
1035
1036// Helper function to add the hardware-stage fields common to PAL Metadata 3.0+
// Writes the following fields for calling convention CC through
// MD->setHwStage, reading the values from CurrentProgramInfo:
//   .ieee_mode (only if the subtarget has IEEE mode), .wgp_mode, .mem_ordered,
//   and for compute calling conventions also .trap_present, .excp_en and
//   .lds_size.
// NOTE(review): the first line of the signature (listing line 1037) is not
// present in this listing.
1038 const SIProgramInfo &CurrentProgramInfo,
1039 CallingConv::ID CC, const GCNSubtarget &ST) {
1040 if (ST.hasIEEEMode())
1041 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1042
1043 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1044 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1045
1046 if (AMDGPU::isCompute(CC)) {
1047 MD->setHwStage(CC, ".trap_present",
1048 (bool)CurrentProgramInfo.TrapHandlerEnable);
1049 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1050
// LdsSize is scaled by the LDS dword granularity and by sizeof(uint32_t);
// presumably this converts granules to bytes (confirm the unit of LdsSize).
1051 MD->setHwStage(CC, ".lds_size",
1052 (unsigned)(CurrentProgramInfo.LdsSize *
1053 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1054 }
1055}
1056
1057// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1058// is AMDPAL. It stores each compute/SPI register setting and other PAL
1059// metadata items into the PALMD::Metadata, combining with any provided by the
1060// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1061// is then written as a single block in the .note section.
// Metadata older than PAL major version 3 is written as raw register values
// (setRsrc1, setRsrc2, ...); version 3 and later is written as named fields
// (setHwStage, setGraphicsRegisters).
// NOTE(review): listing lines 1064, 1068 and 1094 are not present here, so
// the declaration of MFI and the condition guarding the block that closes at
// line 1129 are not visible. Confirm against the full source.
1062void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1063 const SIProgramInfo &CurrentProgramInfo) {
1065 auto CC = MF.getFunction().getCallingConv();
1066 auto MD = getTargetStreamer()->getPALMetadata();
1067
1069 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1070
1071 // Only set AGPRs for supported devices
1072 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1073 if (STM.hasMAIInsts()) {
1074 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1075 }
1076
1077 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1078 if (MD->getPALMajorVersion() < 3) {
1079 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
1080 if (AMDGPU::isCompute(CC)) {
1081 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1082 } else {
// Non-compute stages only get the scratch-enable bit, and only when scratch
// blocks are in use.
1083 if (CurrentProgramInfo.ScratchBlocks > 0)
1084 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1085 }
1086 } else {
1087 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1088 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1089 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1090 }
1091
1092 // ScratchSize is in bytes, 16 aligned.
1093 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
// On GFX11 and later the LDS block count is halved, rounding up.
1095 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1096 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1097 : CurrentProgramInfo.LDSBlocks;
1098 if (MD->getPALMajorVersion() < 3) {
1099 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1100 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1101 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1102 } else {
1103 // Graphics registers
// The granularity is 256 on GFX11 and later, 128 otherwise; the product with
// sizeof(uint32_t) is what gets stored as .ps_extra_lds_size.
1104 const unsigned ExtraLdsDwGranularity =
1105 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1106 MD->setGraphicsRegisters(
1107 ".ps_extra_lds_size",
1108 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1109
1110 // Set each bit of PsInputEna and PsInputAddr as a named field of .spi_ps_input_ena and .spi_ps_input_addr
// Bit Idx of each mask is stored under the name PsInputFields[Idx].
1111 static StringLiteral const PsInputFields[] = {
1112 ".persp_sample_ena", ".persp_center_ena",
1113 ".persp_centroid_ena", ".persp_pull_model_ena",
1114 ".linear_sample_ena", ".linear_center_ena",
1115 ".linear_centroid_ena", ".line_stipple_tex_ena",
1116 ".pos_x_float_ena", ".pos_y_float_ena",
1117 ".pos_z_float_ena", ".pos_w_float_ena",
1118 ".front_face_ena", ".ancillary_ena",
1119 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1120 unsigned PSInputEna = MFI->getPSInputEnable();
1121 unsigned PSInputAddr = MFI->getPSInputAddr();
1122 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1123 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1124 (bool)((PSInputEna >> Idx) & 1));
1125 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1126 (bool)((PSInputAddr >> Idx) & 1));
1127 }
1128 }
1129 }
1130
1131 // For version 3 and above the wave front size is already set in the metadata
1132 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1133 MD->setWave32(MF.getFunction().getCallingConv());
1134}
1135
// Records per-function PAL metadata for MF, keyed by the function name:
// the scratch size (taken from the frame info stack size), the compute
// settings for the AMDGPU_CS calling convention, and finally the LDS size and
// the VGPR and SGPR counts.
// NOTE(review): listing line 1141 is not present here, so the declaration of
// ST is not visible. CurrentProgramInfo is not a parameter of this function;
// presumably it is a class member (confirm against the class definition).
1136void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1137 auto *MD = getTargetStreamer()->getPALMetadata();
1138 const MachineFrameInfo &MFI = MF.getFrameInfo();
1139 StringRef FnName = MF.getFunction().getName();
1140 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1142
// Before PAL major version 3 the settings are stored as raw RSRC1 and RSRC2
// values; from version 3 on they are stored as named hardware-stage fields.
1143 if (MD->getPALMajorVersion() < 3) {
1144 // Set compute registers
1145 MD->setRsrc1(CallingConv::AMDGPU_CS,
1146 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
1147 MD->setRsrc2(CallingConv::AMDGPU_CS,
1148 CurrentProgramInfo.getComputePGMRSrc2());
1149 } else {
1150 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1151 }
1152
1153 // Set optional info
1154 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1155 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1156 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1157}
1158
1159// This is supposed to be log2(Size)
// Maps a Size of 4, 8 or 16 to AMD_ELEMENT_4_BYTES, AMD_ELEMENT_8_BYTES or
// AMD_ELEMENT_16_BYTES respectively. Any other Size reaches llvm_unreachable.
// NOTE(review): the signature line (listing line 1160) is not present in this
// listing.
1161 switch (Size) {
1162 case 4:
1163 return AMD_ELEMENT_4_BYTES;
1164 case 8:
1165 return AMD_ELEMENT_8_BYTES;
1166 case 16:
1167 return AMD_ELEMENT_16_BYTES;
1168 default:
1169 llvm_unreachable("invalid private_element_size");
1170 }
1171}
1172
// Fills the amd_kernel_code_t header Out for the kernel in MF from
// CurrentProgramInfo, the subtarget and the user SGPR usage info.
// The function asserts that MF uses the AMDGPU_KERNEL or SPIR_KERNEL calling
// convention.
// NOTE(review): this listing omits many numbered lines (1180, 1183, 1185,
// 1188, 1191, 1193-1194, 1200 and the statement after each single-line if
// below), so the declaration of MFI and the code property set by each guard
// are not visible here. Confirm against the full source.
1173void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1174 const SIProgramInfo &CurrentProgramInfo,
1175 const MachineFunction &MF) const {
1176 const Function &F = MF.getFunction();
1177 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1178 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1179
1181 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1182
1184
// RSRC1 occupies the low 32 bits and RSRC2 is shifted into the high 32 bits
// of a single value; the assignment target is on a line missing from this
// listing.
1186 CurrentProgramInfo.getComputePGMRSrc1(STM) |
1187 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1189
1190 if (CurrentProgramInfo.DynamicCallStack)
1192
1195 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1196
1197 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1198 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1199 Out.code_properties |=
1201 }
1202
1203 if (UserSGPRInfo.hasDispatchPtr())
1205
1206 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1208
1209 if (UserSGPRInfo.hasKernargSegmentPtr())
1211
1212 if (UserSGPRInfo.hasDispatchID())
1214
1215 if (UserSGPRInfo.hasFlatScratchInit())
1217
// NOTE(review): hasDispatchPtr() is already tested at listing line 1203. The
// guarded statements are not visible here, so verify whether this second
// check is a redundant duplicate or sets a different property.
1218 if (UserSGPRInfo.hasDispatchPtr())
1220
1221 if (STM.isXNACKEnabled())
1223
// getKernArgSegmentSize returns the segment size and also writes the maximum
// argument alignment into MaxKernArgAlign, which is used below.
1224 Align MaxKernArgAlign;
1225 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1226 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1227 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1228 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1229 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1230
1231 // kernarg_segment_alignment is specified as log of the alignment.
1232 // The minimum alignment is 16.
1233 // FIXME: The metadata treats the minimum as 4?
1234 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1235}
1236
// Prints operand OpNo of the inline-asm instruction MI to O.
// Returns false once the operand has been printed (by the generic AsmPrinter
// code or by the code below). Returns true for an unknown modifier in
// ExtraCode, or for an operand that is neither a register nor an immediate.
// NOTE(review): the first line of the signature (listing line 1237) and
// listing lines 1258-1259 and 1263 are not present here, so the register
// printing call and the first immediate condition are not visible.
1238 const char *ExtraCode, raw_ostream &O) {
1239 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1240 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1241 return false;
1242
// Only the single-character modifier r is accepted here; anything longer or
// different is rejected.
1243 if (ExtraCode && ExtraCode[0]) {
1244 if (ExtraCode[1] != 0)
1245 return true; // Unknown modifier.
1246
1247 switch (ExtraCode[0]) {
1248 case 'r':
1249 break;
1250 default:
1251 return true;
1252 }
1253 }
1254
1255 // TODO: Should be able to support other operand types like globals.
1256 const MachineOperand &MO = MI->getOperand(OpNo);
1257 if (MO.isReg()) {
1260 return false;
1261 } else if (MO.isImm()) {
1262 int64_t Val = MO.getImm();
// Immediates that do not take the first (not visible) branch are printed in
// hexadecimal using the narrowest of 16, 32 or 64 bits that holds the value.
1264 O << Val;
1265 } else if (isUInt<16>(Val)) {
1266 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1267 } else if (isUInt<32>(Val)) {
1268 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1269 } else {
1270 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1271 }
1272 return false;
1273 }
1274 return true;
1275}
1276
1281}
1282
// Emits one optimization analysis remark per resource figure of MF through
// ORE, under the remark name kernel-resource-usage. Does nothing when ORE is
// null.
// The AGPR remark is emitted only when hasMAIInsts is true, and the LDS size
// remark only when isModuleEntryFunction is true.
// NOTE(review): listing lines 1293-1294 and 1308 are not present here, so the
// condition of the second early return and one argument of the remark
// constructor are not visible.
1283void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1284 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1285 bool isModuleEntryFunction, bool hasMAIInsts) {
1286 if (!ORE)
1287 return;
1288
1289 const char *Name = "kernel-resource-usage";
1290 const char *Indent = " ";
1291
1292 // If the remark is not specifically enabled, do not output to yaml
1295 return;
1296
// Helper lambda: builds the label text and emits a single remark whose value
// is Argument.
1297 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1298 StringRef RemarkLabel, auto Argument) {
1299 // Add an indent for every line besides the line with the kernel name. This
1300 // makes it easier to tell which resource usages go with which kernel since
1301 // the kernel name will always be displayed first.
1302 std::string LabelStr = RemarkLabel.str() + ": ";
1303 if (!RemarkName.equals("FunctionName"))
1304 LabelStr = Indent + LabelStr;
1305
1306 ORE->emit([&]() {
1307 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1309 &MF.front())
1310 << LabelStr << ore::NV(RemarkName, Argument);
1311 });
1312 };
1313
1314 // FIXME: Formatting here is pretty nasty because clang does not accept
1315 // newlines from diagnostics. This forces us to emit multiple diagnostic
1316 // remarks to simulate newlines. If and when clang does accept newlines, this
1317 // formatting should be aggregated into one remark with newlines to avoid
1318 // printing multiple diagnostic locations and diag opts.
1319 EmitResourceUsageRemark("FunctionName", "Function Name",
1320 MF.getFunction().getName());
1321 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
1322 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1323 if (hasMAIInsts)
1324 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1325 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1326 CurrentProgramInfo.ScratchSize);
1327 StringRef DynamicStackStr =
1328 CurrentProgramInfo.DynamicCallStack ? "True" : "False";
1329 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1330 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1331 CurrentProgramInfo.Occupancy);
1332 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1333 CurrentProgramInfo.SGPRSpill);
1334 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1335 CurrentProgramInfo.VGPRSpill);
1336 if (isModuleEntryFunction)
1337 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1338 CurrentProgramInfo.LDSSize);
1339}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1049
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1188
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1187
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:1083
#define S_0286E8_WAVESIZE_GFX11(x)
Definition: SIDefines.h:1189
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1169
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1161
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1120
#define S_00B860_WAVESIZE_GFX12Plus(x)
Definition: SIDefines.h:1185
#define S_0286E8_WAVESIZE_GFX12Plus(x)
Definition: SIDefines.h:1190
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1182
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1072
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1071
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:1051
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1080
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1119
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1183
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:1050
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1058
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1180
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1122
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1201
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1168
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1179
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1063
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1202
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1057
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1082
#define S_00B860_WAVESIZE_GFX11(x)
Definition: SIDefines.h:1184
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1056
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:398
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:700
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:722
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:449
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:655
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:440
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:394
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:265
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:695
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1828
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:342
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:801
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:251
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:594
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:598
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:287
bool dumpCode() const
Definition: GCNSubtarget.h:498
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:586
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:926
Generation getGeneration() const
Definition: GCNSubtarget.h:302
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:306
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:274
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:330
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1064
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:849
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:370
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1063
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1334
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2415
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1858
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in byt...
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:27
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:73
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const
Compute the value of the ComputePGMRsrc1 register.
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:49
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:76
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:59
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.