LLVM 17.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
26#include "R600AsmPrinter.h"
35#include "llvm/MC/MCAssembler.h"
36#include "llvm/MC/MCContext.h"
38#include "llvm/MC/MCStreamer.h"
44
45using namespace llvm;
46using namespace llvm::AMDGPU;
47
48// This should get the default rounding mode from the kernel. We just set the
49// default here, but this could change if the OpenCL rounding mode pragmas are
50// used.
51//
52// The denormal mode here should match what is reported by the OpenCL runtime
53// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55//
56// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57// precision, and leaves single precision to flush all and does not report
58// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59// CL_FP_DENORM for both.
60//
61// FIXME: It seems some instructions do not support single precision denormals
62// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63// and sin_f32, cos_f32 on most parts).
64
65// We want to use these instructions, and using fp32 denormals also causes
66// instructions to run at the double precision rate for the device so it's
67// probably best to just report no single precision denormals.
71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73}
74
75static AsmPrinter *
77 std::unique_ptr<MCStreamer> &&Streamer) {
78 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79}
80
86}
87
89 std::unique_ptr<MCStreamer> Streamer)
90 : AsmPrinter(TM, std::move(Streamer)) {
91 assert(OutStreamer && "AsmPrinter constructed without streamer");
92}
93
95 return "AMDGPU Assembly Printer";
96}
97
99 return TM.getMCSubtargetInfo();
100}
101
103 if (!OutStreamer)
104 return nullptr;
105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106}
107
110}
111
// One-time setup of the target streamer for module M: make sure the target ID
// is initialized, start the HSA metadata stream, and (for code objects below
// v3) emit the ISA version note.
//
// NOTE(review): several statements of this function are missing from this
// listing (for example the condition guarding the first `return`, the callee
// of the argument list on the last line, and the bodies of some `if`s).
// Restore them from the upstream file before relying on this text.
112void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
114
115 // TODO: Which one is called first, emitStartOfAsmFile or
116 // emitFunctionBodyStart?
// Lazily initialize the target ID if a target streamer exists but has none yet.
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118 initializeTargetID(M);
119
122 return;
123
124 if (CodeObjectVersion >= 3)
126
// Begin collecting HSA metadata for this module using the streamer's target ID.
128 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
129
132
// Everything below applies only to code objects older than v3.
133 if (CodeObjectVersion >= 3)
134 return;
135
136 // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
139
140 // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
143 Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
144}
145
147 // Init target streamer if it has not yet happened
149 initTargetStreamer(M);
150
151 if (TM.getTargetTriple().getOS() != Triple::AMDHSA || CodeObjectVersion == 2)
153
154 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
155 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
157 HSAMetadataStream->end();
158 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
159 (void)Success;
160 assert(Success && "Malformed HSA Metadata");
161 }
162}
163
165 const MachineBasicBlock *MBB) const {
167 return false;
168
169 if (MBB->empty())
170 return true;
171
172 // If this is a block implementing a long branch, an expression relative to
173 // the start of the block is needed.
174 // XXX - Is there a smarter way to check this?
175 return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
176}
177
180 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
181 const Function &F = MF->getFunction();
182
183 // TODO: Which one is called first, emitStartOfAsmFile or
184 // emitFunctionBodyStart?
186 initializeTargetID(*F.getParent());
187
188 const auto &FunctionTargetID = STM.getTargetID();
189 // Make sure function's xnack settings are compatible with module's
190 // xnack settings.
191 if (FunctionTargetID.isXnackSupported() &&
192 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
193 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
194 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
195 "' function does not match module xnack setting");
196 return;
197 }
198 // Make sure function's sramecc settings are compatible with module's
199 // sramecc settings.
200 if (FunctionTargetID.isSramEccSupported() &&
201 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
202 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
203 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
204 "' function does not match module sramecc setting");
205 return;
206 }
207
208 if (!MFI.isEntryFunction())
209 return;
210
211 if ((STM.isMesaKernel(F) || CodeObjectVersion == 2) &&
212 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
213 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
214 amd_kernel_code_t KernelCode;
215 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
217 }
218
219 if (STM.isAmdHsaOS())
220 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
221}
222
225 if (!MFI.isEntryFunction())
226 return;
227
228 if (TM.getTargetTriple().getOS() != Triple::AMDHSA || CodeObjectVersion == 2)
229 return;
230
231 auto &Streamer = getTargetStreamer()->getStreamer();
232 auto &Context = Streamer.getContext();
233 auto &ObjectFileInfo = *Context.getObjectFileInfo();
234 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
235
236 Streamer.pushSection();
237 Streamer.switchSection(&ReadOnlySection);
238
239 // CP microcode requires the kernel descriptor to be allocated on 64 byte
240 // alignment.
241 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
242 ReadOnlySection.ensureMinAlignment(Align(64));
243
244 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
245
246 SmallString<128> KernelName;
247 getNameWithPrefix(KernelName, &MF->getFunction());
249 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
250 CurrentProgramInfo.NumVGPRsForWavesPerEU,
251 CurrentProgramInfo.NumSGPRsForWavesPerEU -
253 CurrentProgramInfo.VCCUsed,
254 CurrentProgramInfo.FlatUsed),
255 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
256 CodeObjectVersion);
257
258 Streamer.popSection();
259}
260
263 CodeObjectVersion >=3) {
265 return;
266 }
267
269 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
270 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
271 SmallString<128> SymbolName;
272 getNameWithPrefix(SymbolName, &MF->getFunction()),
274 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
275 }
276 if (DumpCodeInstEmitter) {
277 // Disassemble function name label to text.
278 DisasmLines.push_back(MF->getName().str() + ":");
279 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
280 HexLines.push_back("");
281 }
282
284}
285
287 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
288 // Write a line for the basic block label if it is not only fallthrough.
289 DisasmLines.push_back(
290 (Twine("BB") + Twine(getFunctionNumber())
291 + "_" + Twine(MBB.getNumber()) + ":").str());
292 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
293 HexLines.push_back("");
294 }
296}
297
300 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
302 Twine(GV->getName()) +
303 ": unsupported initializer for address space");
304 return;
305 }
306
307 // LDS variables aren't emitted in HSA or PAL yet.
308 const Triple::OSType OS = TM.getTargetTriple().getOS();
309 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
310 return;
311
312 MCSymbol *GVSym = getSymbol(GV);
313
314 GVSym->redefineIfPossible();
315 if (GVSym->isDefined() || GVSym->isVariable())
316 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
317 "' is already defined");
318
319 const DataLayout &DL = GV->getParent()->getDataLayout();
320 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
321 Align Alignment = GV->getAlign().value_or(Align(4));
322
323 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
324 emitLinkage(GV, GVSym);
325 auto TS = getTargetStreamer();
326 TS->emitAMDGPULDS(GVSym, Size, Alignment);
327 return;
328 }
329
331}
332
334 CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
335
337 switch (CodeObjectVersion) {
338 case 2:
339 HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
340 break;
341 case 3:
342 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
343 break;
344 case 4:
345 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
346 break;
347 case 5:
348 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
349 break;
350 default:
351 report_fatal_error("Unexpected code object version");
352 }
353 }
355}
356
358 // Pad with s_code_end to help tools and guard against instruction prefetch
359 // causing stale data in caches. Arguably this should be done by the linker,
360 // which is why this isn't done for Mesa.
361 const MCSubtargetInfo &STI = *getGlobalSTI();
362 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
365 OutStreamer->switchSection(getObjFileLowering().getTextSection());
367 }
368
370}
371
372// Print comments that apply to both callable functions and entry points.
373void AMDGPUAsmPrinter::emitCommonFunctionComments(
374 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
375 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
376 const AMDGPUMachineFunction *MFI) {
377 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
378 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
379 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
380 if (NumAGPR) {
381 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
382 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
383 false);
384 }
385 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
386 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
387 false);
388}
389
390uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
391 const MachineFunction &MF) const {
393 uint16_t KernelCodeProperties = 0;
394
395 if (MFI.hasPrivateSegmentBuffer()) {
396 KernelCodeProperties |=
397 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
398 }
399 if (MFI.hasDispatchPtr()) {
400 KernelCodeProperties |=
401 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
402 }
403 if (MFI.hasQueuePtr() && CodeObjectVersion < 5) {
404 KernelCodeProperties |=
405 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
406 }
407 if (MFI.hasKernargSegmentPtr()) {
408 KernelCodeProperties |=
409 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
410 }
411 if (MFI.hasDispatchID()) {
412 KernelCodeProperties |=
413 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
414 }
415 if (MFI.hasFlatScratchInit()) {
416 KernelCodeProperties |=
417 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
418 }
420 KernelCodeProperties |=
421 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
422 }
423
424 if (CurrentProgramInfo.DynamicCallStack && CodeObjectVersion >= 5)
425 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
426
427 return KernelCodeProperties;
428}
429
430amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
431 const MachineFunction &MF,
432 const SIProgramInfo &PI) const {
434 const Function &F = MF.getFunction();
435
436 amdhsa::kernel_descriptor_t KernelDescriptor;
437 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
438
439 assert(isUInt<32>(PI.ScratchSize));
440 assert(isUInt<32>(PI.getComputePGMRSrc1()));
441 assert(isUInt<32>(PI.ComputePGMRSrc2));
442
443 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
444 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
445
446 Align MaxKernArgAlign;
447 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
448
449 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
450 KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
451 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
452
453 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
454 if (STM.hasGFX90AInsts())
455 KernelDescriptor.compute_pgm_rsrc3 =
456 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
457
458 return KernelDescriptor;
459}
460
462 // Init target streamer lazily on the first function so that previous passes
463 // can set metadata.
465 initTargetStreamer(*MF.getFunction().getParent());
466
467 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
468 CurrentProgramInfo = SIProgramInfo();
469
471
472 // The starting address of all shader programs must be 256 bytes aligned.
473 // Regular functions just need the basic required instruction alignment.
474 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
475
477
480 // FIXME: This should be an explicit check for Mesa.
481 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
482 MCSectionELF *ConfigSection =
483 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
484 OutStreamer->switchSection(ConfigSection);
485 }
486
487 if (MFI->isModuleEntryFunction()) {
488 getSIProgramInfo(CurrentProgramInfo, MF);
489 }
490
491 if (STM.isAmdPalOS()) {
492 if (MFI->isEntryFunction())
493 EmitPALMetadata(MF, CurrentProgramInfo);
494 else if (MFI->isModuleEntryFunction())
495 emitPALFunctionMetadata(MF);
496 } else if (!STM.isAmdHsaOS()) {
497 EmitProgramInfoSI(MF, CurrentProgramInfo);
498 }
499
500 DumpCodeInstEmitter = nullptr;
501 if (STM.dumpCode()) {
502 // For -dumpcode, get the assembler out of the streamer, even if it does
503 // not really want to let us have it. This only works with -filetype=obj.
504 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
505 OutStreamer->setUseAssemblerInfoForParsing(true);
506 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
507 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
508 if (Assembler)
509 DumpCodeInstEmitter = Assembler->getEmitterPtr();
510 }
511
512 DisasmLines.clear();
513 HexLines.clear();
515
517
518 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
519 STM.hasMAIInsts());
520
521 if (isVerbose()) {
522 MCSectionELF *CommentSection =
523 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
524 OutStreamer->switchSection(CommentSection);
525
526 if (!MFI->isEntryFunction()) {
527 OutStreamer->emitRawComment(" Function info:", false);
529 ResourceUsage->getResourceInfo(&MF.getFunction());
530 emitCommonFunctionComments(
531 Info.NumVGPR,
532 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
533 Info.getTotalNumVGPRs(STM),
534 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
535 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
536 return false;
537 }
538
539 OutStreamer->emitRawComment(" Kernel info:", false);
540 emitCommonFunctionComments(
541 CurrentProgramInfo.NumArchVGPR,
542 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
543 : std::optional<uint32_t>(),
544 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
545 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
546
547 OutStreamer->emitRawComment(
548 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
549 OutStreamer->emitRawComment(
550 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
551 OutStreamer->emitRawComment(
552 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
553 " bytes/workgroup (compile time only)", false);
554
555 OutStreamer->emitRawComment(
556 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
557 OutStreamer->emitRawComment(
558 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
559
560 OutStreamer->emitRawComment(
561 " NumSGPRsForWavesPerEU: " +
562 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
563 OutStreamer->emitRawComment(
564 " NumVGPRsForWavesPerEU: " +
565 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
566
567 if (STM.hasGFX90AInsts())
568 OutStreamer->emitRawComment(
569 " AccumOffset: " +
570 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
571
572 OutStreamer->emitRawComment(
573 " Occupancy: " +
574 Twine(CurrentProgramInfo.Occupancy), false);
575
576 OutStreamer->emitRawComment(
577 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
578
579 OutStreamer->emitRawComment(
580 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
581 Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
582 OutStreamer->emitRawComment(
583 " COMPUTE_PGM_RSRC2:USER_SGPR: " +
584 Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
585 OutStreamer->emitRawComment(
586 " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
587 Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
588 OutStreamer->emitRawComment(
589 " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
590 Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
591 OutStreamer->emitRawComment(
592 " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
593 Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
594 OutStreamer->emitRawComment(
595 " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
596 Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
597 OutStreamer->emitRawComment(
598 " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
599 Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
600 false);
601
602 assert(STM.hasGFX90AInsts() ||
603 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
604 if (STM.hasGFX90AInsts()) {
605 OutStreamer->emitRawComment(
606 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
607 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
608 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
609 false);
610 OutStreamer->emitRawComment(
611 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
612 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
613 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
614 false);
615 }
616 }
617
618 if (DumpCodeInstEmitter) {
619
620 OutStreamer->switchSection(
621 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
622
623 for (size_t i = 0; i < DisasmLines.size(); ++i) {
624 std::string Comment = "\n";
625 if (!HexLines[i].empty()) {
626 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
627 Comment += " ; " + HexLines[i] + "\n";
628 }
629
630 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
631 OutStreamer->emitBytes(StringRef(Comment));
632 }
633 }
634
635 return false;
636}
637
638// TODO: Fold this into emitFunctionBodyStart.
639void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
640 // In the beginning all features are either 'Any' or 'NotSupported',
641 // depending on global target features. This will cover empty modules.
643 *getGlobalSTI(), getGlobalSTI()->getFeatureString());
644
645 // If module is empty, we are done.
646 if (M.empty())
647 return;
648
649 // If module is not empty, need to find first 'Off' or 'On' feature
650 // setting per feature from functions in module.
651 for (auto &F : M) {
652 auto &TSTargetID = getTargetStreamer()->getTargetID();
653 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
654 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
655 break;
656
658 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
659 if (TSTargetID->isXnackSupported())
660 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
661 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
662 if (TSTargetID->isSramEccSupported())
663 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
664 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
665 }
666}
667
668uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
670 const SIInstrInfo *TII = STM.getInstrInfo();
671
672 uint64_t CodeSize = 0;
673
674 for (const MachineBasicBlock &MBB : MF) {
675 for (const MachineInstr &MI : MBB) {
676 // TODO: CodeSize should account for multiple functions.
677
678 // TODO: Should we count size of debug info?
679 if (MI.isDebugInstr())
680 continue;
681
682 CodeSize += TII->getInstSizeInBytes(MI);
683 }
684 }
685
686 return CodeSize;
687}
688
// Fill ProgInfo for MF from the resource-usage analysis result and the
// subtarget: register counts (including the per-wave-occupancy adjusted
// counts), scratch and LDS sizes and block counts, float/IEEE/clamp mode,
// the COMPUTE_PGM_RSRC2 (and, with GFX90A instructions, RSRC3) words, and the
// resulting occupancy. Resource-limit violations are reported as diagnostics.
//
// NOTE(review): a number of statements are missing from this listing (for
// example the declarations of Info, STM, MFI and Ctx, and the heads of several
// `if` statements). Restore them from the upstream file before editing logic.
689void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
690 const MachineFunction &MF) {
692 ResourceUsage->getResourceInfo(&MF.getFunction());
694
695 ProgInfo.NumArchVGPR = Info.NumVGPR;
696 ProgInfo.NumAccVGPR = Info.NumAGPR;
697 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
// AccumOffset is (VGPR count rounded up to a multiple of 4) / 4 - 1, treating
// a count of zero as one.
698 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
699 ProgInfo.TgSplit = STM.isTgSplitEnabled();
700 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
701 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
702 ProgInfo.VCCUsed = Info.UsesVCC;
703 ProgInfo.FlatUsed = Info.UsesFlatScratch;
704 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
705
706 const uint64_t MaxScratchPerWorkitem =
// Report an error diagnostic when the per-workitem scratch exceeds the limit.
708 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
709 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
710 ProgInfo.ScratchSize,
711 MaxScratchPerWorkitem, DS_Error);
712 MF.getFunction().getContext().diagnose(DiagStackSize);
713 }
714
716
717 // The calculations related to SGPR/VGPR blocks are
718 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
719 // unified.
720 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
721 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
722
723 // Check the addressable register limit before we add ExtraSGPRs.
725 !STM.hasSGPRInitBug()) {
726 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
727 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
728 // This can happen due to a compiler bug or when using inline asm.
731 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
732 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
733 Ctx.diagnose(Diag);
// After diagnosing, clamp the count so later computations stay in range.
734 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
735 }
736 }
737
738 // Account for extra SGPRs and VGPRs reserved for debugger use.
739 ProgInfo.NumSGPR += ExtraSGPRs;
740
741 const Function &F = MF.getFunction();
742
743 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
744 // dispatch registers are function args.
745 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
746
747 if (isShader(F.getCallingConv())) {
748 bool IsPixelShader =
749 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
750
751 // Calculate the number of VGPR registers based on the SPI input registers
752 uint32_t InputEna = 0;
753 uint32_t InputAddr = 0;
754 unsigned LastEna = 0;
755
756 if (IsPixelShader) {
757 // Note for IsPixelShader:
758 // By this stage, all enabled inputs are tagged in InputAddr as well.
759 // We will use InputAddr to determine whether the input counts against the
760 // vgpr total and only use the InputEnable to determine the last input
761 // that is relevant - if extra arguments are used, then we have to honour
762 // the InputAddr for any intermediate non-enabled inputs.
763 InputEna = MFI->getPSInputEnable();
764 InputAddr = MFI->getPSInputAddr();
765
766 // We only need to consider input args up to the last used arg.
767 assert((InputEna || InputAddr) &&
768 "PSInputAddr and PSInputEnable should "
769 "never both be 0 for AMDGPU_PS shaders");
770 // There are some rare circumstances where InputAddr is non-zero and
771 // InputEna can be set to 0. In this case we default to setting LastEna
772 // to 1.
773 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
774 }
775
776 // FIXME: We should be using the number of registers determined during
777 // calling convention lowering to legalize the types.
778 const DataLayout &DL = F.getParent()->getDataLayout();
779 unsigned PSArgCount = 0;
780 unsigned IntermediateVGPR = 0;
781 for (auto &Arg : F.args()) {
// Each argument occupies its bit size rounded up to whole 32-bit registers.
782 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
783 if (Arg.hasAttribute(Attribute::InReg)) {
784 WaveDispatchNumSGPR += NumRegs;
785 } else {
786 // If this is a PS shader and we're processing the PS Input args (first
787 // 16 VGPR), use the InputEna and InputAddr bits to define how many
788 // VGPRs are actually used.
789 // Any extra VGPR arguments are handled as normal arguments (and
790 // contribute to the VGPR count whether they're used or not).
791 if (IsPixelShader && PSArgCount < 16) {
792 if ((1 << PSArgCount) & InputAddr) {
793 if (PSArgCount < LastEna)
794 WaveDispatchNumVGPR += NumRegs;
795 else
796 IntermediateVGPR += NumRegs;
797 }
798 PSArgCount++;
799 } else {
800 // If there are extra arguments we have to include the allocation for
801 // the non-used (but enabled with InputAddr) input arguments
802 if (IntermediateVGPR) {
803 WaveDispatchNumVGPR += IntermediateVGPR;
804 IntermediateVGPR = 0;
805 }
806 WaveDispatchNumVGPR += NumRegs;
807 }
808 }
809 }
810 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
// NOTE(review): NumArchVGPR is recomputed from ProgInfo.NumVGPR (set above from
// getTotalNumVGPRs) rather than from the previous NumArchVGPR - confirm this
// is intended.
811 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
812 ProgInfo.NumVGPR =
813 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
814 }
815
816 // Adjust number of registers used to meet default/requested minimum/maximum
817 // number of waves per execution unit request.
818 ProgInfo.NumSGPRsForWavesPerEU = std::max(
819 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
820 ProgInfo.NumVGPRsForWavesPerEU = std::max(
821 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
822
824 STM.hasSGPRInitBug()) {
825 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
826 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
827 // This can happen due to a compiler bug or when using inline asm to use
828 // the registers which are usually reserved for vcc etc.
830 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
831 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
833 Ctx.diagnose(Diag);
834 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
835 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
836 }
837 }
838
839 if (STM.hasSGPRInitBug()) {
840 ProgInfo.NumSGPR =
842 ProgInfo.NumSGPRsForWavesPerEU =
844 }
845
// Diagnose a user SGPR count above the subtarget maximum.
846 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
848 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
849 MFI->getNumUserSGPRs(),
851 Ctx.diagnose(Diag);
852 }
853
// Diagnose LDS usage above the addressable local memory size.
854 if (MFI->getLDSSize() >
855 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
858 MF.getFunction(), "local memory", MFI->getLDSSize(),
860 Ctx.diagnose(Diag);
861 }
862
864 &STM, ProgInfo.NumSGPRsForWavesPerEU);
866 &STM, ProgInfo.NumVGPRsForWavesPerEU);
867
868 const SIModeRegisterDefaults Mode = MFI->getMode();
869
870 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
871 // register.
872 ProgInfo.FloatMode = getFPMode(Mode);
873
874 ProgInfo.IEEEMode = Mode.IEEE;
875
876 // Make the clamp modifier return 0 on NaN input.
877 ProgInfo.DX10Clamp = Mode.DX10Clamp;
878
879 unsigned LDSAlignShift;
881 // LDS is allocated in 64 dword blocks.
882 LDSAlignShift = 8;
883 } else {
884 // LDS is allocated in 128 dword blocks.
885 LDSAlignShift = 9;
886 }
887
888 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
889 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
890
891 ProgInfo.LDSSize = MFI->getLDSSize();
// Number of LDS allocation blocks: size rounded up to the block granularity.
892 ProgInfo.LDSBlocks =
893 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
894
895 // Scratch is allocated in 64-dword or 256-dword blocks.
896 unsigned ScratchAlignShift =
897 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
898 // We need to program the hardware with the amount of scratch memory that
899 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
900 // scratch memory used per thread.
901 ProgInfo.ScratchBlocks = divideCeil(
902 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
903
904 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
905 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
906 ProgInfo.MemOrdered = 1;
907 }
908
909 // 0 = X, 1 = XY, 2 = XYZ
910 unsigned TIDIGCompCnt = 0;
911 if (MFI->hasWorkItemIDZ())
912 TIDIGCompCnt = 2;
913 else if (MFI->hasWorkItemIDY())
914 TIDIGCompCnt = 1;
915
916 // The private segment wave byte offset is the last of the system SGPRs. We
917 // initially assumed it was allocated, and may have used it. It shouldn't harm
918 // anything to disable it if we know the stack isn't used here. We may still
919 // have emitted code reading it to initialize scratch, but if that's unused
920 // reading garbage should be OK.
921 const bool EnablePrivateSegment =
922 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
923 ProgInfo.ComputePGMRSrc2 =
924 S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
926 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
932 S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
934 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
935 S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
937
938 if (STM.hasGFX90AInsts()) {
940 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
941 ProgInfo.AccumOffset);
943 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
944 ProgInfo.TgSplit);
945 }
946
947 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
948 ProgInfo.NumSGPRsForWavesPerEU,
949 ProgInfo.NumVGPRsForWavesPerEU);
950}
951
// Map a calling convention to a resource register identifier; the result is
// used by EmitProgramInfoSI as RsrcReg.
//
// NOTE(review): the `case` labels and their return values are missing from
// this listing; only the `default` label (which falls through to the first
// case) is visible. Restore them from the upstream file.
952static unsigned getRsrcReg(CallingConv::ID CallConv) {
953 switch (CallConv) {
954 default: [[fallthrough]];
962 }
963}
964
// Write the program info for MF to the output streamer as a sequence of
// 32-bit values. Where both halves are visible (RsrcReg, R_SPILLED_SGPRS,
// R_SPILLED_VGPRS) each setting is a register identifier followed by its
// value.
//
// NOTE(review): several statements are missing from this listing (the STM and
// MFI declarations, the head of the first `if`, and most of the register
// identifier writes that precede the values below). Restore them from the
// upstream file before editing.
965void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
966 const SIProgramInfo &CurrentProgramInfo) {
969 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
970
973
974 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
975
977 OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
978
// The wave-size field encoding differs between GFX11+ and earlier generations.
980 OutStreamer->emitInt32(
981 STM.getGeneration() >= AMDGPUSubtarget::GFX11
982 ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
983 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
984
985 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
986 // 0" comment but I don't see a corresponding field in the register spec.
987 } else {
// Other path: RsrcReg followed by the packed VGPR/SGPR block counts.
988 OutStreamer->emitInt32(RsrcReg);
989 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
990 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
992 OutStreamer->emitInt32(
993 STM.getGeneration() >= AMDGPUSubtarget::GFX11
994 ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
995 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
996 }
997
// On GFX11+ the extra LDS size is half the LDS block count, rounded up.
1000 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1001 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1002 : CurrentProgramInfo.LDSBlocks;
1003 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1005 OutStreamer->emitInt32(MFI->getPSInputEnable());
1007 OutStreamer->emitInt32(MFI->getPSInputAddr());
1008 }
1009
// Spill counts are always emitted, each preceded by its identifier.
1010 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1011 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1012 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1013 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1014}
1015
1016// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1017// is AMDPAL. It stores each compute/SPI register setting and other PAL
1018// metadata items into the PALMD::Metadata, combining with any provided by the
1019// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1020// is then written as a single block in the .note section.
// Records this function's register usage, resource-register values, scratch
// size and wave32 flag in the PAL metadata object obtained from the target
// streamer, keyed by the function's calling convention.
//
// MF                 - function being emitted.
// CurrentProgramInfo - resource usage computed for MF.
//
// NOTE(review): the embedded numbering skips 1023, 1027 and 1046. MFI is used
// below but its declaration is not visible, and the scope closed at line 1053
// has no visible opening condition.
1021void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1022 const SIProgramInfo &CurrentProgramInfo) {
1024 auto CC = MF.getFunction().getCallingConv();
1025 auto MD = getTargetStreamer()->getPALMetadata();
1026
1028 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1029
1030 // Only set AGPRs for supported devices
1031 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1032 if (STM.hasMAIInsts()) {
1033 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1034 }
1035
1036 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1037 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
// Compute conventions pass ComputePGMRSrc2 through unchanged; all others only
// set the SCRATCH_EN bit, and only when scratch blocks are in use.
1038 if (AMDGPU::isCompute(CC)) {
1039 MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
1040 } else {
1041 if (CurrentProgramInfo.ScratchBlocks > 0)
1042 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1043 }
1044 // ScratchSize is in bytes, 16 aligned.
1045 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
// From GFX11 onwards the LDS block count is halved, rounding up, before it is
// placed in the EXTRA_LDS_SIZE field.
1047 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1048 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1049 : CurrentProgramInfo.LDSBlocks;
1050 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1051 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1052 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1053 }
1054
// The calling convention is re-read from the function here; it is the same
// expression that initialised CC above.
1055 if (STM.isWave32())
1056 MD->setWave32(MF.getFunction().getCallingConv());
1057}
1058
1059void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1060 auto *MD = getTargetStreamer()->getPALMetadata();
1061 const MachineFrameInfo &MFI = MF.getFrameInfo();
1062 MD->setFunctionScratchSize(MF, MFI.getStackSize());
1063
1064 // Set compute registers
1065 MD->setRsrc1(CallingConv::AMDGPU_CS,
1066 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1067 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
1068
1069 // Set optional info
1070 MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
1071 MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1072 MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1073}
1074
// Translates a private element size in bytes (4, 8 or 16) into the matching
// AMD_ELEMENT_*_BYTES enumerator; any other size is treated as unreachable.
//
// NOTE(review): the signature line (numbered 1076) is missing from this
// listing. The cross-reference index further down lists it as
// `static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)`.
1075// This is supposed to be log2(Size)
1077 switch (Size) {
1078 case 4:
1079 return AMD_ELEMENT_4_BYTES;
1080 case 8:
1081 return AMD_ELEMENT_8_BYTES;
1082 case 16:
1083 return AMD_ELEMENT_16_BYTES;
1084 default:
1085 llvm_unreachable("invalid private_element_size");
1086 }
1087}
1088
// Fills the amd_kernel_code_t header Out for a kernel function from the
// computed program info and the function's subtarget and function info.
//
// Out                - header to populate.
// CurrentProgramInfo - resource usage computed for MF.
// MF                 - kernel being emitted; asserted to use the AMDGPU_KERNEL
//                      or SPIR_KERNEL calling convention.
//
// NOTE(review): this listing omits many source lines, including the MFI
// declaration, the left-hand side of the resource-register assignment, and the
// statement guarded by nearly every `if` below. Only the conditions and the
// trailing field assignments are visible.
1089void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1090 const SIProgramInfo &CurrentProgramInfo,
1091 const MachineFunction &MF) const {
1092 const Function &F = MF.getFunction();
1093 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1094 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1095
1097 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1098
1100
// Combines the two resource words into one value: getComputePGMRSrc1() is
// OR-ed with ComputePGMRSrc2 shifted left by 32 bits. The assignment target is
// on an omitted line (presumably Out.compute_pgm_resource_registers - confirm).
1102 CurrentProgramInfo.getComputePGMRSrc1() |
1103 (CurrentProgramInfo.ComputePGMRSrc2 << 32);
1105
1106 if (CurrentProgramInfo.DynamicCallStack)
1108
1111 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1112
1113 if (MFI->hasPrivateSegmentBuffer()) {
1114 Out.code_properties |=
1116 }
1117
1118 if (MFI->hasDispatchPtr())
1120
// The queue pointer condition additionally requires a code object version
// below 5.
1121 if (MFI->hasQueuePtr() && CodeObjectVersion < 5)
1123
1124 if (MFI->hasKernargSegmentPtr())
1126
1127 if (MFI->hasDispatchID())
1129
1130 if (MFI->hasFlatScratchInit())
1132
// NOTE(review): hasDispatchPtr() is tested a second time here (see line 1118);
// the guarded statement is not visible - confirm whether the repeat is
// intentional or redundant.
1133 if (MFI->hasDispatchPtr())
1135
1136 if (STM.isXNACKEnabled())
1138
// getKernArgSegmentSize() also reports the largest argument alignment through
// MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1139 Align MaxKernArgAlign;
1140 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1141 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1142 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1143 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1144 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1145
1146 // kernarg_segment_alignment is specified as log of the alignment.
1147 // The minimum alignment is 16.
1148 // FIXME: The metadata treats the minimum as 4?
1149 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1150}
1151
// Prints operand OpNo of inline-asm instruction MI to O, honouring the
// optional modifier string ExtraCode.
//
// Returns false when the operand was printed and true when it could not be
// (unknown modifier or unsupported operand kind).
//
// NOTE(review): the first line of the signature (numbered 1152) is missing
// from this listing, as are lines 1173-1174 (the register-printing statements)
// and 1178 (the head of the if/else chain that formats immediates). The
// cross-reference index lists the full signature as
// `bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
//  const char *ExtraCode, raw_ostream &O) override`.
1153 const char *ExtraCode, raw_ostream &O) {
1154 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1155 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1156 return false;
1157
// Only a single-character modifier is accepted here, and the only character
// recognised is 'r'.
1158 if (ExtraCode && ExtraCode[0]) {
1159 if (ExtraCode[1] != 0)
1160 return true; // Unknown modifier.
1161
1162 switch (ExtraCode[0]) {
1163 case 'r':
1164 break;
1165 default:
1166 return true;
1167 }
1168 }
1169
1170 // TODO: Should be able to support other operand types like globals.
1171 const MachineOperand &MO = MI->getOperand(OpNo);
1172 if (MO.isReg()) {
1175 return false;
1176 } else if (MO.isImm()) {
1177 int64_t Val = MO.getImm();
// First arm (condition not visible) prints the value in decimal; the remaining
// arms print hexadecimal at the narrowest of 16, 32 or 64 bits that holds it.
1179 O << Val;
1180 } else if (isUInt<16>(Val)) {
1181 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1182 } else if (isUInt<32>(Val)) {
1183 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1184 } else {
1185 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1186 }
1187 return false;
1188 }
// Neither a register nor an immediate: report failure.
1189 return true;
1190}
1191
1196}
1197
// Emits one analysis remark per resource-usage figure of MF (function name,
// SGPRs, VGPRs, optionally AGPRs, scratch size, occupancy, spill counts and
// optionally LDS size) under the remark name "kernel-resource-usage".
//
// MF                    - function the remarks describe.
// CurrentProgramInfo    - source of the reported figures.
// isModuleEntryFunction - when true, the LDS size remark is also emitted.
// hasMAIInsts           - when true, the AGPR remark is also emitted.
//
// Does nothing when no remark emitter (ORE) is available.
//
// NOTE(review): the embedded numbering skips 1208-1209 (the condition guarding
// the early return at line 1210) and 1223 (an argument of the remark
// constructor); neither is visible in this listing.
1198void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1199 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1200 bool isModuleEntryFunction, bool hasMAIInsts) {
1201 if (!ORE)
1202 return;
1203
1204 const char *Name = "kernel-resource-usage";
1205 const char *Indent = " ";
1206
1207 // If the remark is not specifically enabled, do not output to yaml
1210 return;
1211
// Helper that emits a single remark of the form "<label>: <value>".
1212 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1213 StringRef RemarkLabel, auto Argument) {
1214 // Add an indent for every line besides the line with the kernel name. This
1215 // makes it easier to tell which resource usage goes with which kernel since
1216 // the kernel name will always be displayed first.
1217 std::string LabelStr = RemarkLabel.str() + ": ";
1218 if (!RemarkName.equals("FunctionName"))
1219 LabelStr = Indent + LabelStr;
1220
1221 ORE->emit([&]() {
1222 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1224 &MF.front())
1225 << LabelStr << ore::NV(RemarkName, Argument);
1226 });
1227 };
1228
1229 // FIXME: Formatting here is pretty nasty because clang does not accept
1230 // newlines from diagnostics. This forces us to emit multiple diagnostic
1231 // remarks to simulate newlines. If and when clang does accept newlines, this
1232 // formatting should be aggregated into one remark with newlines to avoid
1233 // printing multiple diagnostic location and diag opts.
1234 EmitResourceUsageRemark("FunctionName", "Function Name",
1235 MF.getFunction().getName());
1236 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
// The "VGPRs" figure is NumArchVGPR; NumAccVGPR is reported separately as
// "AGPRs" below.
1237 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1238 if (hasMAIInsts)
1239 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1240 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1241 CurrentProgramInfo.ScratchSize);
1242 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1243 CurrentProgramInfo.Occupancy);
1244 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1245 CurrentProgramInfo.SGPRSpill);
1246 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1247 CurrentProgramInfo.VGPRSpill);
1248 if (isModuleEntryFunction)
1249 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1250 CurrentProgramInfo.LDSSize);
1251}
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode)
void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter()
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
amdgpu Simplify well known AMD library false FunctionCallee Value * Arg
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:127
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
return ToRemove size() > 0
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:914
#define S_00B84C_EXCP_EN(x)
Definition: SIDefines.h:980
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1050
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1049
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:948
#define S_00B84C_TGID_Z_EN(x)
Definition: SIDefines.h:963
#define S_0286E8_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1051
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1032
#define S_00B860_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1047
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1024
#define G_00B84C_TIDIG_COMP_CNT(x)
Definition: SIDefines.h:970
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:985
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1045
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:937
#define G_00B84C_TGID_X_EN(x)
Definition: SIDefines.h:958
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:936
#define S_00B84C_TGID_X_EN(x)
Definition: SIDefines.h:957
#define G_00B84C_TRAP_HANDLER(x)
Definition: SIDefines.h:955
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:916
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:945
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:984
#define G_00B84C_TGID_Y_EN(x)
Definition: SIDefines.h:961
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1046
#define S_00B84C_TG_SIZE_EN(x)
Definition: SIDefines.h:966
#define S_00B84C_TIDIG_COMP_CNT(x)
Definition: SIDefines.h:969
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:915
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:923
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1043
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:987
#define S_00B84C_LDS_SIZE(x)
Definition: SIDefines.h:977
#define S_00B84C_USER_SGPR(x)
Definition: SIDefines.h:951
#define S_00B84C_TRAP_HANDLER(x)
Definition: SIDefines.h:954
#define G_00B84C_TGID_Z_EN(x)
Definition: SIDefines.h:964
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1062
#define S_00B84C_TGID_Y_EN(x)
Definition: SIDefines.h:960
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1031
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1042
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:928
#define S_00B84C_EXCP_EN_MSB(x)
Definition: SIDefines.h:973
#define G_00B84C_USER_SGPR(x)
Definition: SIDefines.h:952
#define G_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:949
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1063
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:922
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:947
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:921
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This header is deprecated in favour of llvm/TargetParser/TargetParser.h.
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their file.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const override
Return true if the basic block has exactly one predecessor and the control transfer mechanism between the predecessor and this block is a fall-through.
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file.
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, uint32_t Minor)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, uint32_t Stepping, StringRef VendorName, StringRef ArchName)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:371
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:643
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:665
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between the predecessor and this block is a fall-through.
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:419
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:598
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:412
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:367
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:265
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:638
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:114
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1625
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:237
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:315
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:747
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:221
bool hasSGPRInitBug() const
Definition: GCNSubtarget.h:961
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:562
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:566
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:257
bool dumpCode() const
Definition: GCNSubtarget.h:466
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:554
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:839
Generation getGeneration() const
Definition: GCNSubtarget.h:272
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:276
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:244
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit.
Definition: Globals.cpp:275
unsigned getAddressSpace() const
Definition: GlobalValue.h:201
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Type * getValueType() const
Definition: GlobalValue.h:292
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:325
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1055
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:248
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:203
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:298
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:230
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFunction yet, in which case this will return -1.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:68
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:516
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
AMDGPU::SIModeRegisterDefaults getMode() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:364
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:308
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:377
unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getCodeObjectVersion(const Module &M)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:198
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:189
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:201
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:207
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:192
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:195
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:141
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:219
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:214
@ SHT_PROGBITS
Definition: ELF.h:995
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1262
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:508
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:373
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:145
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:124
Target & getTheAMDGPUTarget()
The target which supports all AMD GPUs.
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1862
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Definition: BitVector.h:851
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in bytes.
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
Instruction set architecture version.
Definition: TargetParser.h:113
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:25
uint64_t getPGMRSrc1(CallingConv::ID CC) const
uint64_t ComputePGMRSrc2
Definition: SIProgramInfo.h:43
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:58
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:61
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:44
uint64_t getComputePGMRSrc1() const
Compute the value of the ComputePGMRsrc1 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.