LLVM 17.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
26#include "R600AsmPrinter.h"
35#include "llvm/MC/MCAssembler.h"
36#include "llvm/MC/MCContext.h"
38#include "llvm/MC/MCStreamer.h"
44
45using namespace llvm;
46using namespace llvm::AMDGPU;
47
48// This should get the default rounding mode from the kernel. We just set the
49// default here, but this could change if the OpenCL rounding mode pragmas are
50// used.
51//
52// The denormal mode here should match what is reported by the OpenCL runtime
53// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55//
56// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57// precision, and leaves single precision to flush all and does not report
58// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59// CL_FP_DENORM for both.
60//
61// FIXME: It seems some instructions do not support single precision denormals
62// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63// and sin_f32, cos_f32 on most parts).
64
65// We want to use these instructions, and using fp32 denormals also causes
66// instructions to run at the double precision rate for the device so it's
67// probably best to just report no single precision denormals.
71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73}
74
75static AsmPrinter *
77 std::unique_ptr<MCStreamer> &&Streamer) {
78 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79}
80
86}
87
89 std::unique_ptr<MCStreamer> Streamer)
90 : AsmPrinter(TM, std::move(Streamer)) {
91 assert(OutStreamer && "AsmPrinter constructed without streamer");
92}
93
95 return "AMDGPU Assembly Printer";
96}
97
99 return TM.getMCSubtargetInfo();
100}
101
103 if (!OutStreamer)
104 return nullptr;
105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106}
107
110}
111
112void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
114
115 // TODO: Which one is called first, emitStartOfAsmFile or
116 // emitFunctionBodyStart?
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118 initializeTargetID(M);
119
122 return;
123
124 if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
126
128 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
129
132
133 if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
134 return;
135
136 // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
139
140 // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
143 Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
144}
145
147 // Init target streamer if it has not yet happened
149 initTargetStreamer(M);
150
152 CodeObjectVersion == AMDGPU::AMDHSA_COV2)
154
155 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
156 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
158 HSAMetadataStream->end();
159 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
160 (void)Success;
161 assert(Success && "Malformed HSA Metadata");
162 }
163}
164
166 const MachineBasicBlock *MBB) const {
168 return false;
169
170 if (MBB->empty())
171 return true;
172
173 // If this is a block implementing a long branch, an expression relative to
174 // the start of the block is needed.
175 // XXX - Is there a smarter way to check this?
176 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
177}
178
181 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
182 const Function &F = MF->getFunction();
183
184 // TODO: Which one is called first, emitStartOfAsmFile or
185 // emitFunctionBodyStart?
187 initializeTargetID(*F.getParent());
188
189 const auto &FunctionTargetID = STM.getTargetID();
190 // Make sure function's xnack settings are compatible with module's
191 // xnack settings.
192 if (FunctionTargetID.isXnackSupported() &&
193 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
194 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
195 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
196 "' function does not match module xnack setting");
197 return;
198 }
199 // Make sure function's sramecc settings are compatible with module's
200 // sramecc settings.
201 if (FunctionTargetID.isSramEccSupported() &&
202 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
203 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
204 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
205 "' function does not match module sramecc setting");
206 return;
207 }
208
209 if (!MFI.isEntryFunction())
210 return;
211
212 if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) &&
213 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
214 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
215 amd_kernel_code_t KernelCode;
216 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
218 }
219
220 if (STM.isAmdHsaOS())
221 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
222}
223
226 if (!MFI.isEntryFunction())
227 return;
228
230 CodeObjectVersion == AMDGPU::AMDHSA_COV2)
231 return;
232
233 auto &Streamer = getTargetStreamer()->getStreamer();
234 auto &Context = Streamer.getContext();
235 auto &ObjectFileInfo = *Context.getObjectFileInfo();
236 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
237
238 Streamer.pushSection();
239 Streamer.switchSection(&ReadOnlySection);
240
241 // CP microcode requires the kernel descriptor to be allocated on 64 byte
242 // alignment.
243 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
244 ReadOnlySection.ensureMinAlignment(Align(64));
245
246 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
247
248 SmallString<128> KernelName;
249 getNameWithPrefix(KernelName, &MF->getFunction());
251 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
252 CurrentProgramInfo.NumVGPRsForWavesPerEU,
253 CurrentProgramInfo.NumSGPRsForWavesPerEU -
255 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
256 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
257 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
258 CodeObjectVersion);
259
260 Streamer.popSection();
261}
262
265 CodeObjectVersion >= AMDGPU::AMDHSA_COV3) {
267 return;
268 }
269
271 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
272 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
273 SmallString<128> SymbolName;
274 getNameWithPrefix(SymbolName, &MF->getFunction()),
276 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
277 }
278 if (DumpCodeInstEmitter) {
279 // Disassemble function name label to text.
280 DisasmLines.push_back(MF->getName().str() + ":");
281 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
282 HexLines.push_back("");
283 }
284
286}
287
289 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
290 // Write a line for the basic block label if it is not only fallthrough.
291 DisasmLines.push_back(
292 (Twine("BB") + Twine(getFunctionNumber())
293 + "_" + Twine(MBB.getNumber()) + ":").str());
294 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
295 HexLines.push_back("");
296 }
298}
299
302 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
304 Twine(GV->getName()) +
305 ": unsupported initializer for address space");
306 return;
307 }
308
309 // LDS variables aren't emitted in HSA or PAL yet.
311 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
312 return;
313
314 MCSymbol *GVSym = getSymbol(GV);
315
316 GVSym->redefineIfPossible();
317 if (GVSym->isDefined() || GVSym->isVariable())
318 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
319 "' is already defined");
320
321 const DataLayout &DL = GV->getParent()->getDataLayout();
322 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
323 Align Alignment = GV->getAlign().value_or(Align(4));
324
325 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
326 emitLinkage(GV, GVSym);
327 auto TS = getTargetStreamer();
328 TS->emitAMDGPULDS(GVSym, Size, Alignment);
329 return;
330 }
331
333}
334
336 CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
337
339 switch (CodeObjectVersion) {
341 HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
342 break;
344 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
345 break;
347 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
348 break;
350 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
351 break;
352 default:
353 report_fatal_error("Unexpected code object version");
354 }
355 }
357}
358
360 // Pad with s_code_end to help tools and guard against instruction prefetch
361 // causing stale data in caches. Arguably this should be done by the linker,
362 // which is why this isn't done for Mesa.
363 const MCSubtargetInfo &STI = *getGlobalSTI();
364 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
367 OutStreamer->switchSection(getObjFileLowering().getTextSection());
369 }
370
372}
373
374// Print comments that apply to both callable functions and entry points.
// Each value is written as its own raw comment through OutStreamer, in the
// fixed order shown below. The NumAgprs and TotalNumVgprs lines are emitted
// only when NumAGPR holds a value; every other line is emitted
// unconditionally.
375void AMDGPUAsmPrinter::emitCommonFunctionComments(
376 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
377 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
378 const AMDGPUMachineFunction *MFI) {
379 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
380 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
381 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
382 if (NumAGPR) { // an empty optional suppresses both AGPR-related lines
383 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
384 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
385 false);
386 }
387 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
 // MFI is dereferenced without a null check, so callers must pass a valid
 // pointer.
388 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
389 false);
390}
391
// Builds the kernel_code_properties bitmask for MF. The result starts at 0
// and one amdhsa::KERNEL_CODE_PROPERTY_* flag is OR-ed in for each of the
// conditions below that holds; the accumulated mask is returned.
// NOTE(review): the declaration of MFI and the condition guarding the
// ENABLE_WAVEFRONT_SIZE32 flag are not visible in this listing -- confirm
// against the full source.
392uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
393 const MachineFunction &MF) const {
395 uint16_t KernelCodeProperties = 0;
396
397 if (MFI.hasPrivateSegmentBuffer()) {
398 KernelCodeProperties |=
399 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
400 }
401 if (MFI.hasDispatchPtr()) {
402 KernelCodeProperties |=
403 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
404 }
 // The queue-pointer flag is only set for code object versions below COV5.
405 if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
406 KernelCodeProperties |=
407 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
408 }
409 if (MFI.hasKernargSegmentPtr()) {
410 KernelCodeProperties |=
411 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
412 }
413 if (MFI.hasDispatchID()) {
414 KernelCodeProperties |=
415 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
416 }
417 if (MFI.hasFlatScratchInit()) {
418 KernelCodeProperties |=
419 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
420 }
422 KernelCodeProperties |=
423 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
424 }
425
 // The dynamic-stack flag is reported only from COV5 onwards. It is read from
 // the CurrentProgramInfo member, so that member must already describe MF
 // when this function is called.
426 if (CurrentProgramInfo.DynamicCallStack &&
427 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
428 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
429
430 return KernelCodeProperties;
431}
432
// Builds and returns (by value) an amdhsa::kernel_descriptor_t for MF.
// The descriptor is zero-filled first, so every field that is not assigned
// below stays 0. Segment sizes and the two compute_pgm_rsrc words come from
// PI; kernel_code_properties comes from getAmdhsaKernelCodeProperties(MF).
// NOTE(review): the declaration of STM is not visible in this listing.
433amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
434 const MachineFunction &MF,
435 const SIProgramInfo &PI) const {
437 const Function &F = MF.getFunction();
438
439 amdhsa::kernel_descriptor_t KernelDescriptor;
440 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
441
 // Check that the values copied from PI below fit in 32 bits.
442 assert(isUInt<32>(PI.ScratchSize));
443 assert(isUInt<32>(PI.getComputePGMRSrc1()));
444 assert(isUInt<32>(PI.getComputePGMRSrc2()));
445
446 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
447 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
448
 // MaxKernArgAlign is only handed to getKernArgSegmentSize; it is not read
 // again in this function.
449 Align MaxKernArgAlign;
450 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
451
452 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
453 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
454 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
455
 // compute_pgm_rsrc3 is written only when STM.hasGFX90AInsts(); otherwise it
 // keeps the 0 from the memset above.
 // NOTE(review): this reads the CurrentProgramInfo member rather than the PI
 // parameter used everywhere else in this function -- confirm the two always
 // refer to the same program info.
456 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
457 if (STM.hasGFX90AInsts())
458 KernelDescriptor.compute_pgm_rsrc3 =
459 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
460
461 return KernelDescriptor;
462}
463
465 // Init target streamer lazily on the first function so that previous passes
466 // can set metadata.
468 initTargetStreamer(*MF.getFunction().getParent());
469
470 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
471 CurrentProgramInfo = SIProgramInfo();
472
474
475 // The starting address of all shader programs must be 256 bytes aligned.
476 // Regular functions just need the basic required instruction alignment.
477 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
478
480
483 // FIXME: This should be an explicit check for Mesa.
484 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
485 MCSectionELF *ConfigSection =
486 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
487 OutStreamer->switchSection(ConfigSection);
488 }
489
490 if (MFI->isModuleEntryFunction()) {
491 getSIProgramInfo(CurrentProgramInfo, MF);
492 }
493
494 if (STM.isAmdPalOS()) {
495 if (MFI->isEntryFunction())
496 EmitPALMetadata(MF, CurrentProgramInfo);
497 else if (MFI->isModuleEntryFunction())
498 emitPALFunctionMetadata(MF);
499 } else if (!STM.isAmdHsaOS()) {
500 EmitProgramInfoSI(MF, CurrentProgramInfo);
501 }
502
503 DumpCodeInstEmitter = nullptr;
504 if (STM.dumpCode()) {
505 // For -dumpcode, get the assembler out of the streamer, even if it does
506 // not really want to let us have it. This only works with -filetype=obj.
507 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
508 OutStreamer->setUseAssemblerInfoForParsing(true);
509 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
510 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
511 if (Assembler)
512 DumpCodeInstEmitter = Assembler->getEmitterPtr();
513 }
514
515 DisasmLines.clear();
516 HexLines.clear();
518
520
521 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
522 STM.hasMAIInsts());
523
524 if (isVerbose()) {
525 MCSectionELF *CommentSection =
526 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
527 OutStreamer->switchSection(CommentSection);
528
529 if (!MFI->isEntryFunction()) {
530 OutStreamer->emitRawComment(" Function info:", false);
532 ResourceUsage->getResourceInfo(&MF.getFunction());
533 emitCommonFunctionComments(
534 Info.NumVGPR,
535 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
536 Info.getTotalNumVGPRs(STM),
537 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
538 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
539 return false;
540 }
541
542 OutStreamer->emitRawComment(" Kernel info:", false);
543 emitCommonFunctionComments(
544 CurrentProgramInfo.NumArchVGPR,
545 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
546 : std::optional<uint32_t>(),
547 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
548 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
549
550 OutStreamer->emitRawComment(
551 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
552 OutStreamer->emitRawComment(
553 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
554 OutStreamer->emitRawComment(
555 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
556 " bytes/workgroup (compile time only)", false);
557
558 OutStreamer->emitRawComment(
559 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
560 OutStreamer->emitRawComment(
561 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
562
563 OutStreamer->emitRawComment(
564 " NumSGPRsForWavesPerEU: " +
565 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
566 OutStreamer->emitRawComment(
567 " NumVGPRsForWavesPerEU: " +
568 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
569
570 if (STM.hasGFX90AInsts())
571 OutStreamer->emitRawComment(
572 " AccumOffset: " +
573 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
574
575 OutStreamer->emitRawComment(
576 " Occupancy: " +
577 Twine(CurrentProgramInfo.Occupancy), false);
578
579 OutStreamer->emitRawComment(
580 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
581
582 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
583 Twine(CurrentProgramInfo.ScratchEnable),
584 false);
585 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
586 Twine(CurrentProgramInfo.UserSGPR),
587 false);
588 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
589 Twine(CurrentProgramInfo.TrapHandlerEnable),
590 false);
591 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
592 Twine(CurrentProgramInfo.TGIdXEnable),
593 false);
594 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
595 Twine(CurrentProgramInfo.TGIdYEnable),
596 false);
597 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
598 Twine(CurrentProgramInfo.TGIdZEnable),
599 false);
600 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
601 Twine(CurrentProgramInfo.TIdIGCompCount),
602 false);
603
604 assert(STM.hasGFX90AInsts() ||
605 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
606 if (STM.hasGFX90AInsts()) {
607 OutStreamer->emitRawComment(
608 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
609 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
610 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
611 false);
612 OutStreamer->emitRawComment(
613 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
614 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
615 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
616 false);
617 }
618 }
619
620 if (DumpCodeInstEmitter) {
621
622 OutStreamer->switchSection(
623 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
624
625 for (size_t i = 0; i < DisasmLines.size(); ++i) {
626 std::string Comment = "\n";
627 if (!HexLines[i].empty()) {
628 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
629 Comment += " ; " + HexLines[i] + "\n";
630 }
631
632 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
633 OutStreamer->emitBytes(StringRef(Comment));
634 }
635 }
636
637 return false;
638}
639
640// TODO: Fold this into emitFunctionBodyStart.
// Resolves the xnack and sramecc settings of the target streamer's target ID
// for module M. Settings that are still 'Any' are filled in from the target
// IDs of the functions in M, visited in module order.
// NOTE(review): the callee of the first call below (its arguments are on the
// line numbered 645) and the declaration of STM inside the loop are not
// visible in this listing -- confirm against the full source.
641void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
642 // In the beginning all features are either 'Any' or 'NotSupported',
643 // depending on global target features. This will cover empty modules.
645 *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
646
647 // If module is empty, we are done.
648 if (M.empty())
649 return;
650
651 // If module is not empty, need to find first 'Off' or 'On' feature
652 // setting per feature from functions in module.
653 for (auto &F : M) {
654 auto &TSTargetID = getTargetStreamer()->getTargetID();
 // Stop scanning once xnack and sramecc are each either unsupported or
 // reported as On/Off by the streamer's target ID.
655 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
656 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
657 break;
658
 // For each supported feature, a setting that is still 'Any' is replaced by
 // the setting from STM's target ID; a setting that is no longer 'Any' is
 // never overwritten.
660 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
661 if (TSTargetID->isXnackSupported())
662 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
663 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
664 if (TSTargetID->isSramEccSupported())
665 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
666 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
667 }
668}
669
// Returns the sum of TII->getInstSizeInBytes(MI) over every instruction in
// every basic block of MF, skipping instructions for which MI.isDebugInstr()
// is true. An MF with no instructions yields 0.
// NOTE(review): the declaration of STM is not visible in this listing.
670uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
672 const SIInstrInfo *TII = STM.getInstrInfo();
673
674 uint64_t CodeSize = 0;
675
676 for (const MachineBasicBlock &MBB : MF) {
677 for (const MachineInstr &MI : MBB) {
678 // TODO: CodeSize should account for multiple functions.
679
680 // TODO: Should we count size of debug info?
 // For now debug instructions are excluded from the total.
681 if (MI.isDebugInstr())
682 continue;
683
684 CodeSize += TII->getInstSizeInBytes(MI);
685 }
686 }
687
688 return CodeSize;
689}
690
691void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
692 const MachineFunction &MF) {
694 ResourceUsage->getResourceInfo(&MF.getFunction());
696
697 ProgInfo.NumArchVGPR = Info.NumVGPR;
698 ProgInfo.NumAccVGPR = Info.NumAGPR;
699 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
700 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
701 ProgInfo.TgSplit = STM.isTgSplitEnabled();
702 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
703 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
704 ProgInfo.VCCUsed = Info.UsesVCC;
705 ProgInfo.FlatUsed = Info.UsesFlatScratch;
706 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
707
708 const uint64_t MaxScratchPerWorkitem =
710 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
711 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
712 ProgInfo.ScratchSize,
713 MaxScratchPerWorkitem, DS_Error);
714 MF.getFunction().getContext().diagnose(DiagStackSize);
715 }
716
718
719 // The calculations related to SGPR/VGPR blocks are
720 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
721 // unified.
722 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
723 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
724 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
725
726 // Check the addressable register limit before we add ExtraSGPRs.
728 !STM.hasSGPRInitBug()) {
729 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
730 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
731 // This can happen due to a compiler bug or when using inline asm.
734 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
735 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
736 Ctx.diagnose(Diag);
737 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
738 }
739 }
740
741 // Account for extra SGPRs and VGPRs reserved for debugger use.
742 ProgInfo.NumSGPR += ExtraSGPRs;
743
744 const Function &F = MF.getFunction();
745
746 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
747 // dispatch registers are function args.
748 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
749
750 if (isShader(F.getCallingConv())) {
751 bool IsPixelShader =
752 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
753
754 // Calculate the number of VGPR registers based on the SPI input registers
755 uint32_t InputEna = 0;
756 uint32_t InputAddr = 0;
757 unsigned LastEna = 0;
758
759 if (IsPixelShader) {
760 // Note for IsPixelShader:
761 // By this stage, all enabled inputs are tagged in InputAddr as well.
762 // We will use InputAddr to determine whether the input counts against the
763 // vgpr total and only use the InputEnable to determine the last input
764 // that is relevant - if extra arguments are used, then we have to honour
765 // the InputAddr for any intermediate non-enabled inputs.
766 InputEna = MFI->getPSInputEnable();
767 InputAddr = MFI->getPSInputAddr();
768
769 // We only need to consider input args up to the last used arg.
770 assert((InputEna || InputAddr) &&
771 "PSInputAddr and PSInputEnable should "
772 "never both be 0 for AMDGPU_PS shaders");
773 // There are some rare circumstances where InputAddr is non-zero and
774 // InputEna can be set to 0. In this case we default to setting LastEna
775 // to 1.
776 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
777 }
778
779 // FIXME: We should be using the number of registers determined during
780 // calling convention lowering to legalize the types.
781 const DataLayout &DL = F.getParent()->getDataLayout();
782 unsigned PSArgCount = 0;
783 unsigned IntermediateVGPR = 0;
784 for (auto &Arg : F.args()) {
785 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
786 if (Arg.hasAttribute(Attribute::InReg)) {
787 WaveDispatchNumSGPR += NumRegs;
788 } else {
789 // If this is a PS shader and we're processing the PS Input args (first
790 // 16 VGPR), use the InputEna and InputAddr bits to define how many
791 // VGPRs are actually used.
792 // Any extra VGPR arguments are handled as normal arguments (and
793 // contribute to the VGPR count whether they're used or not).
794 if (IsPixelShader && PSArgCount < 16) {
795 if ((1 << PSArgCount) & InputAddr) {
796 if (PSArgCount < LastEna)
797 WaveDispatchNumVGPR += NumRegs;
798 else
799 IntermediateVGPR += NumRegs;
800 }
801 PSArgCount++;
802 } else {
803 // If there are extra arguments we have to include the allocation for
804 // the non-used (but enabled with InputAddr) input arguments
805 if (IntermediateVGPR) {
806 WaveDispatchNumVGPR += IntermediateVGPR;
807 IntermediateVGPR = 0;
808 }
809 WaveDispatchNumVGPR += NumRegs;
810 }
811 }
812 }
813 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
814 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
815 ProgInfo.NumVGPR =
816 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
817 }
818
819 // Adjust number of registers used to meet default/requested minimum/maximum
820 // number of waves per execution unit request.
821 ProgInfo.NumSGPRsForWavesPerEU = std::max(
822 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
823 ProgInfo.NumVGPRsForWavesPerEU = std::max(
824 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
825
827 STM.hasSGPRInitBug()) {
828 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
829 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
830 // This can happen due to a compiler bug or when using inline asm to use
831 // the registers which are usually reserved for vcc etc.
833 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
834 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
836 Ctx.diagnose(Diag);
837 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
838 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
839 }
840 }
841
842 if (STM.hasSGPRInitBug()) {
843 ProgInfo.NumSGPR =
845 ProgInfo.NumSGPRsForWavesPerEU =
847 }
848
849 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
851 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
852 MFI->getNumUserSGPRs(),
854 Ctx.diagnose(Diag);
855 }
856
857 if (MFI->getLDSSize() >
858 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
861 MF.getFunction(), "local memory", MFI->getLDSSize(),
863 Ctx.diagnose(Diag);
864 }
865
867 &STM, ProgInfo.NumSGPRsForWavesPerEU);
869 &STM, ProgInfo.NumVGPRsForWavesPerEU);
870
871 const SIModeRegisterDefaults Mode = MFI->getMode();
872
873 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
874 // register.
875 ProgInfo.FloatMode = getFPMode(Mode);
876
877 ProgInfo.IEEEMode = Mode.IEEE;
878
879 // Make the clamp modifier return 0 on NaN input.
880 ProgInfo.DX10Clamp = Mode.DX10Clamp;
881
882 unsigned LDSAlignShift;
884 // LDS is allocated in 64 dword blocks.
885 LDSAlignShift = 8;
886 } else {
887 // LDS is allocated in 128 dword blocks.
888 LDSAlignShift = 9;
889 }
890
891 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
892 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
893
894 ProgInfo.LDSSize = MFI->getLDSSize();
895 ProgInfo.LDSBlocks =
896 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
897
898 // Scratch is allocated in 64-dword or 256-dword blocks.
899 unsigned ScratchAlignShift =
900 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
901 // We need to program the hardware with the amount of scratch memory that
902 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
903 // scratch memory used per thread.
904 ProgInfo.ScratchBlocks = divideCeil(
905 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
906
907 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
908 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
909 ProgInfo.MemOrdered = 1;
910 }
911
912 // 0 = X, 1 = XY, 2 = XYZ
913 unsigned TIDIGCompCnt = 0;
914 if (MFI->hasWorkItemIDZ())
915 TIDIGCompCnt = 2;
916 else if (MFI->hasWorkItemIDY())
917 TIDIGCompCnt = 1;
918
919 // The private segment wave byte offset is the last of the system SGPRs. We
920 // initially assumed it was allocated, and may have used it. It shouldn't harm
921 // anything to disable it if we know the stack isn't used here. We may still
922 // have emitted code reading it to initialize scratch, but if that's unused
923 // reading garbage should be OK.
924 ProgInfo.ScratchEnable =
925 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
926 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
927 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
928 ProgInfo.TrapHandlerEnable =
929 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
930 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
931 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
932 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
933 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
934 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
935 ProgInfo.EXCPEnMSB = 0;
936 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
937 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
938 ProgInfo.EXCPEnable = 0;
939
940 if (STM.hasGFX90AInsts()) {
942 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
943 ProgInfo.AccumOffset);
945 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
946 ProgInfo.TgSplit);
947 }
948
949 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
950 ProgInfo.NumSGPRsForWavesPerEU,
951 ProgInfo.NumVGPRsForWavesPerEU);
952}
953
954static unsigned getRsrcReg(CallingConv::ID CallConv) {
955 switch (CallConv) {
956 default: [[fallthrough]];
964 }
965}
966
// Emits the program resource information for MF to OutStreamer as a
// sequence of 32-bit values taken from CurrentProgramInfo and from MFI.
//
// \param MF                  the machine function being described.
// \param CurrentProgramInfo  the resource usage computed for MF.
//
// NOTE(review): this extract is missing listing lines 969-970, 973-974, 978,
// 981, 993, 1000-1001, 1006 and 1008. The declarations of MFI and STM, the
// conditions that open the two braced regions closed at listing lines 989/998
// and 1010, and several emit calls are therefore not visible. Judging by the
// index entries (R_00B848_COMPUTE_PGM_RSRC1, R_00B84C_COMPUTE_PGM_RSRC2,
// R_00B860_COMPUTE_TMPRING_SIZE, R_0286E8_SPI_TMPRING_SIZE,
// R_00B02C_SPI_SHADER_PGM_RSRC2_PS, R_0286CC_SPI_PS_INPUT_ENA,
// R_0286D0_SPI_PS_INPUT_ADDR) the missing emit calls presumably write the
// register identifier that precedes each value. Confirm against upstream.
967 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
968 const SIProgramInfo &CurrentProgramInfo) {
// (listing lines 969-970 missing here)
971 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
972
// (listing lines 973-974 missing here; the branch closed by the else at
// listing line 989 opens in this gap)
975
976 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
977
// (listing line 978 missing here)
979 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
980
// (listing line 981 missing here)
// The scratch block count is encoded with the GFX11+ or the pre-GFX11 macro
// depending on the subtarget generation.
982 OutStreamer->emitInt32(
983 STM.getGeneration() >= AMDGPUSubtarget::GFX11
984 ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
985 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
986
987 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
988 // 0" comment but I don't see a corresponding field in the register spec.
989 } else {
// Other branch: emit the register identifier chosen by getRsrcReg, then the
// VGPR and SGPR block counts packed into one 4-byte value.
990 OutStreamer->emitInt32(RsrcReg);
991 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
992 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
// (listing line 993 missing here)
994 OutStreamer->emitInt32(
995 STM.getGeneration() >= AMDGPUSubtarget::GFX11
996 ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
997 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
998 }
999
// (listing lines 1000-1001 missing here; the region closed at listing line
// 1010 opens in this gap)
// On GFX11 and later the LDS block count is halved, rounding up, before it is
// encoded as the extra LDS size.
1002 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1003 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1004 : CurrentProgramInfo.LDSBlocks;
1005 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
// (listing line 1006 missing here)
1007 OutStreamer->emitInt32(MFI->getPSInputEnable());
// (listing line 1008 missing here)
1009 OutStreamer->emitInt32(MFI->getPSInputAddr());
1010 }
1011
// Spill counts are always emitted, each preceded by its R_SPILLED_* identifier.
1012 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1013 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1014 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1015 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1016 }
1017
1018// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1019// is AMDPAL. It stores each compute/SPI register setting and other PAL
1020// metadata items into the PALMD::Metadata, combining with any provided by the
1021// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1022// is then written as a single block in the .note section.
// Records the register settings and resource usage of MF in the PAL metadata
// object obtained from the target streamer. For PAL major versions below 3
// the values are stored as rsrc1/rsrc2 register values; for version 3 and
// above they are stored as named hardware-stage and graphics-register fields.
//
// \param MF                  the machine function being described.
// \param CurrentProgramInfo  the resource usage computed for MF.
//
// NOTE(review): this extract is missing listing lines 1025, 1029 and 1075.
// MFI is used below without a visible declaration, and the condition that
// opens the region closed at listing line 1105 is not visible. The index
// lists setEntryPoint(unsigned CC, StringRef Name), which is not called in
// the visible lines and presumably belongs to one of the gaps. Confirm
// against upstream.
1023 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1024 const SIProgramInfo &CurrentProgramInfo) {
// (listing line 1025 missing here)
1026 auto CC = MF.getFunction().getCallingConv();
1027 auto MD = getTargetStreamer()->getPALMetadata();
1028
// (listing line 1029 missing here)
1030 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1031
1032 // Only set AGPRs for supported devices
1033 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1034 if (STM.hasMAIInsts()) {
1035 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1036 }
1037
1038 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
// PAL metadata before major version 3: store whole register values.
1039 if (MD->getPALMajorVersion() < 3) {
1040 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1041 if (AMDGPU::isCompute(CC)) {
1042 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1043 } else {
// For non-compute conventions only the scratch-enable bit is set in rsrc2,
// and only when scratch is actually used.
1044 if (CurrentProgramInfo.ScratchBlocks > 0)
1045 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1046 }
1047 } else {
// PAL metadata major version 3 and above: store individual named fields.
1048 // Priority?
1049 MD->setHwStage(CC, ".float_mode", CurrentProgramInfo.FloatMode);
1050 // Priv?
1051 // DX10Clamp?
1052 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1053 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1054 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1055 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1056
1057 if (AMDGPU::isCompute(CC)) {
1058 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1059 MD->setHwStage(CC, ".trap_present",
1060 (bool)CurrentProgramInfo.TrapHandlerEnable);
1061
1062 // EXCPEnMSB?
// LdsSize is scaled by the 128-dword granularity and by the size of a dword
// before being stored as .lds_size.
1063 const unsigned LdsDwGranularity = 128;
1064 MD->setHwStage(CC, ".lds_size",
1065 (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1066 sizeof(uint32_t)));
1067 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1068 } else {
1069 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1070 }
1071 }
1072
1073 // ScratchSize is in bytes, 16 aligned.
1074 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
// (listing line 1075 missing here; the region closed at listing line 1105
// opens in this gap)
// On GFX11 and later the LDS block count is halved, rounding up.
1076 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1077 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1078 : CurrentProgramInfo.LDSBlocks;
1079 if (MD->getPALMajorVersion() < 3) {
1080 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1081 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1082 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1083 } else {
1084 // Graphics registers
1085 MD->setGraphicsRegisters(".ps_extra_lds_size", ExtraLDSSize);
1086 // Populate .spi_ps_input_ena and .spi_ps_input_addr from PsInputEna and PsInputAddr, one boolean field per bit.
// The position of a name in this table is the bit index it is read from.
1087 static StringLiteral const PsInputFields[] = {
1088 ".persp_sample_ena", ".persp_center_ena",
1089 ".persp_centroid_ena", ".persp_pull_model_ena",
1090 ".linear_sample_ena", ".linear_center_ena",
1091 ".linear_centroid_ena", ".line_stipple_tex_ena",
1092 ".pos_x_float_ena", ".pos_y_float_ena",
1093 ".pos_z_float_ena", ".pos_w_float_ena",
1094 ".front_face_ena", ".ancillary_ena",
1095 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1096 unsigned PSInputEna = MFI->getPSInputEnable();
1097 unsigned PSInputAddr = MFI->getPSInputAddr();
1098 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1099 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1100 (bool)((PSInputEna >> Idx) & 1));
1101 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1102 (bool)((PSInputAddr >> Idx) & 1));
1103 }
1104 }
1105 }
1106
1107 // For version 3 and above the wave front size is already set in the metadata
1108 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1109 MD->setWave32(MF.getFunction().getCallingConv());
1110 }
1111
1112void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1113 auto *MD = getTargetStreamer()->getPALMetadata();
1114 const MachineFrameInfo &MFI = MF.getFrameInfo();
1115 MD->setFunctionScratchSize(MF, MFI.getStackSize());
1116
1117 // Set compute registers
1118 MD->setRsrc1(CallingConv::AMDGPU_CS,
1119 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1120 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
1121
1122 // Set optional info
1123 MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
1124 MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1125 MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1126}
1127
// Translates a private element size in bytes (4, 8 or 16) into the matching
// AMD_ELEMENT_*_BYTES enumerator. Any other size is treated as unreachable.
//
// NOTE(review): listing line 1129, which holds the function signature, is
// missing from this extract. The cross-reference index of this page gives it
// as: static amd_element_byte_size_t getElementByteSizeValue(unsigned Size).
// Confirm against upstream.
1128 // This is supposed to be log2(Size)
// (listing line 1129 missing here)
1130 switch (Size) {
1131 case 4:
1132 return AMD_ELEMENT_4_BYTES;
1133 case 8:
1134 return AMD_ELEMENT_8_BYTES;
1135 case 16:
1136 return AMD_ELEMENT_16_BYTES;
1137 default:
1138 llvm_unreachable("invalid private_element_size");
1139 }
1140 }
1141
// Fills the amd_kernel_code_t header Out for the kernel function in MF from
// CurrentProgramInfo, the subtarget and the machine function info.
//
// \param Out                 header structure to populate.
// \param CurrentProgramInfo  the resource usage computed for MF.
// \param MF                  the machine function; its calling convention
//                            must be AMDGPU_KERNEL or SPIR_KERNEL (asserted).
//
// NOTE(review): this extract is missing listing lines 1149, 1152, 1154, 1157,
// 1160, 1162-1163, 1168, 1172, 1175, 1178, 1181, 1184, 1187 and 1190. The
// declaration of MFI, the left-hand side of the register assignment and the
// statement guarded by each if below are therefore not visible. The index
// lists the AMD_CODE_PROPERTY_* enumerators, AMD_HSA_BITS_SET and
// initDefaultAMDKernelCodeT, which presumably appear in the gaps. Confirm
// against upstream.
1142 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1143 const SIProgramInfo &CurrentProgramInfo,
1144 const MachineFunction &MF) const {
1145 const Function &F = MF.getFunction();
1146 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1147 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1148
// (listing line 1149 missing here)
1150 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1151
// (listing line 1152 missing here)
1153
// (listing line 1154 missing here; it holds the target of this assignment)
// RSRC1 occupies the low 32 bits and RSRC2 the high 32 bits of the value.
1155 CurrentProgramInfo.getComputePGMRSrc1() |
1156 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
// (listing line 1157 missing here)
1158
1159 if (CurrentProgramInfo.DynamicCallStack)
// (listing line 1160 missing here)
1161
// (listing lines 1162-1163 missing here; the line below is the tail of a
// call that takes the element size value as its last argument)
1164 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1165
1166 if (MFI->hasPrivateSegmentBuffer()) {
1167 Out.code_properties |=
// (listing line 1168 missing here)
1169 }
1170
1171 if (MFI->hasDispatchPtr())
// (listing line 1172 missing here)
1173
// The queue pointer is only considered for code object versions before 5.
1174 if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
// (listing line 1175 missing here)
1176
1177 if (MFI->hasKernargSegmentPtr())
// (listing line 1178 missing here)
1179
1180 if (MFI->hasDispatchID())
// (listing line 1181 missing here)
1182
1183 if (MFI->hasFlatScratchInit())
// (listing line 1184 missing here)
1185
// NOTE(review): hasDispatchPtr() is tested a second time here (also at
// listing line 1171). The guarded statements are missing from this extract,
// so whether the second test is redundant cannot be confirmed from here.
1186 if (MFI->hasDispatchPtr())
// (listing line 1187 missing here)
1188
1189 if (STM.isXNACKEnabled())
// (listing line 1190 missing here)
1191
// getKernArgSegmentSize also reports the maximum argument alignment through
// MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1192 Align MaxKernArgAlign;
1193 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1194 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1195 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1196 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1197 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1198
1199 // kernarg_segment_alignment is specified as log of the alignment.
1200 // The minimum alignment is 16.
1201 // FIXME: The metadata treats the minimum as 4?
1202 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1203 }
1204
// Prints operand OpNo of the inline-asm instruction MI to O.
//
// Returns false when the operand was printed (by the generic AsmPrinter
// implementation or by the code below). Returns true when the modifier in
// ExtraCode is not supported or the operand is neither a register nor an
// immediate.
//
// NOTE(review): listing lines 1205, 1226-1227 and 1231 are missing from this
// extract: the first line of the signature, the statement that prints a
// register operand, and the condition of the first immediate branch. The
// index gives the signature as bool PrintAsmOperand(const MachineInstr *MI,
// unsigned OpNo, const char *ExtraCode, raw_ostream &O) and lists
// printRegOperand and isInlinableIntLiteral, which presumably appear in the
// gaps. Confirm against upstream.
// (listing line 1205 missing here)
1206 const char *ExtraCode, raw_ostream &O) {
1207 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1208 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1209 return false;
1210
// Only a single-character modifier is accepted, and only the letter r.
1211 if (ExtraCode && ExtraCode[0]) {
1212 if (ExtraCode[1] != 0)
1213 return true; // Unknown modifier.
1214
1215 switch (ExtraCode[0]) {
1216 case 'r':
1217 break;
1218 default:
1219 return true;
1220 }
1221 }
1222
1223 // TODO: Should be able to support other operand types like globals.
1224 const MachineOperand &MO = MI->getOperand(OpNo);
1225 if (MO.isReg()) {
// (listing lines 1226-1227 missing here)
1228 return false;
1229 } else if (MO.isImm()) {
1230 int64_t Val = MO.getImm();
// (listing line 1231 missing here; it holds the condition under which the
// value is printed in decimal)
1232 O << Val;
// Otherwise print in hexadecimal using the narrowest of 16, 32 or 64 bits
// that holds the value as an unsigned number.
1233 } else if (isUInt<16>(Val)) {
1234 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1235 } else if (isUInt<32>(Val)) {
1236 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1237 } else {
1238 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1239 }
1240 return false;
1241 }
1242 return true;
1243 }
1244
1249}
1250
// Emits one kernel-resource-usage analysis remark per resource for MF through
// ORE: function name, SGPRs, VGPRs, optionally AGPRs, scratch size,
// occupancy, SGPR and VGPR spills, and optionally the LDS size. Does nothing
// when ORE is null.
//
// \param MF                     the machine function being reported.
// \param CurrentProgramInfo     the resource usage computed for MF.
// \param isModuleEntryFunction  when true, the LDS size remark is emitted.
// \param hasMAIInsts            when true, the AGPR remark is emitted.
//
// NOTE(review): listing lines 1261-1262 and 1276 are missing from this
// extract: the condition guarding the second early return, and one argument
// of the MachineOptimizationRemarkAnalysis constructor. The index lists
// getDiagHandlerPtr, isAnalysisRemarkEnabled and getSubprogram, which
// presumably appear in the gaps. Confirm against upstream.
1251 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1252 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1253 bool isModuleEntryFunction, bool hasMAIInsts) {
1254 if (!ORE)
1255 return;
1256
1257 const char *Name = "kernel-resource-usage";
1258 const char *Indent = " ";
1259
1260 // If the remark is not specifically enabled, do not output to yaml
// (listing lines 1261-1262 missing here)
1263 return;
1264
// Helper that emits a single remark consisting of a label followed by the
// named value. Argument is generic so that strings and integers both work.
1265 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1266 StringRef RemarkLabel, auto Argument) {
1267 // Add an indent for every line besides the line with the kernel name. This
1268 // makes it easier to tell which resource usages go with which kernel since
1269 // the kernel name will always be displayed first.
1270 std::string LabelStr = RemarkLabel.str() + ": ";
1271 if (!RemarkName.equals("FunctionName"))
1272 LabelStr = Indent + LabelStr;
1273
1274 ORE->emit([&]() {
1275 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
// (listing line 1276 missing here)
1277 &MF.front())
1278 << LabelStr << ore::NV(RemarkName, Argument);
1279 });
1280 };
1281
1282 // FIXME: Formatting here is pretty nasty because clang does not accept
1283 // newlines from diagnostics. This forces us to emit multiple diagnostic
1284 // remarks to simulate newlines. If and when clang does accept newlines, this
1285 // formatting should be aggregated into one remark with newlines to avoid
1286 // printing multiple diagnostic location and diag opts.
1287 EmitResourceUsageRemark("FunctionName", "Function Name",
1288 MF.getFunction().getName());
1289 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
// The remark named NumVGPR reports NumArchVGPR; AGPRs are reported separately.
1290 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1291 if (hasMAIInsts)
1292 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1293 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1294 CurrentProgramInfo.ScratchSize);
1295 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1296 CurrentProgramInfo.Occupancy);
1297 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1298 CurrentProgramInfo.SGPRSpill);
1299 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1300 CurrentProgramInfo.VGPRSpill);
1301 if (isModuleEntryFunction)
1302 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1303 CurrentProgramInfo.LDSSize);
1304 }
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter()
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:127
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
//===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*----===//
LLVMContext & Context
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:931
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1067
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1066
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:965
#define S_0286E8_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1068
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1049
#define S_00B860_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1064
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1041
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1002
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1062
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:954
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:953
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:933
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:962
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1001
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1063
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:932
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:940
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1060
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1004
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1079
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1048
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1059
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:945
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1080
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:939
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:964
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:938
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const override
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, uint32_t Stepping, StringRef VendorName, StringRef ArchName)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void initializeTargetID(const MCSubtargetInfo &STI, unsigned CodeObjectVersion)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:380
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:663
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:685
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:429
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:618
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:421
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:376
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:269
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:658
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1725
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:237
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:319
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:754
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:228
bool hasSGPRInitBug() const
Definition: GCNSubtarget.h:984
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:569
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:573
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:264
bool dumpCode() const
Definition: GCNSubtarget.h:473
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:561
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:860
Generation getGeneration() const
Definition: GCNSubtarget.h:279
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:283
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:244
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
unsigned getAddressSpace() const
Definition: GlobalValue.h:201
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Type * getValueType() const
Definition: GlobalValue.h:292
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:325
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1049
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:248
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:203
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:298
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:230
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:68
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:851
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:365
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:393
unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getCodeObjectVersion(const Module &M)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:194
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:185
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:197
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:203
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:188
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:191
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:141
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:215
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:210
@ SHT_PROGBITS
Definition: ELF.h:1000
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1267
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1777
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:522
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2430
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:382
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:145
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
Target & getTheAMDGPUTarget()
The target which supports all AMD GPUs.
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1946
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in byt...
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
Instruction set architecture version.
Definition: TargetParser.h:112
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:25
uint64_t getPGMRSrc1(CallingConv::ID CC) const
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:70
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:46
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:73
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:56
uint64_t getComputePGMRSrc1() const
Compute the value of the ComputePGMRsrc1 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.