LLVM 19.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly strings and binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
27#include "R600AsmPrinter.h"
36#include "llvm/MC/MCAssembler.h"
37#include "llvm/MC/MCContext.h"
39#include "llvm/MC/MCStreamer.h"
45
46using namespace llvm;
47using namespace llvm::AMDGPU;
48
49// This should get the default rounding mode from the kernel. We just set the
50// default here, but this could change if the OpenCL rounding mode pragmas are
51// used.
52//
53// The denormal mode here should match what is reported by the OpenCL runtime
54// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
55// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
56//
57// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
58// precision, and leaves single precision to flush all and does not report
59// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
60// CL_FP_DENORM for both.
61//
62// FIXME: It seems some instructions do not support single precision denormals
63// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, sqrt_f32,
64// and sin_f32, cos_f32 on most parts).
65
66// We want to use these instructions, and using fp32 denormals also causes
67// instructions to run at the double precision rate for the device so it's
68// probably best to just report no single precision denormals.
72 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
73 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
74}
75
76static AsmPrinter *
78 std::unique_ptr<MCStreamer> &&Streamer) {
79 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
80}
81
87}
88
90 std::unique_ptr<MCStreamer> Streamer)
91 : AsmPrinter(TM, std::move(Streamer)) {
92 assert(OutStreamer && "AsmPrinter constructed without streamer");
93}
94
96 return "AMDGPU Assembly Printer";
97}
98
100 return TM.getMCSubtargetInfo();
101}
102
104 if (!OutStreamer)
105 return nullptr;
106 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
107}
108
111}
112
113void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
115
116 // TODO: Which one is called first, emitStartOfAsmFile or
117 // emitFunctionBodyStart?
118 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
119 initializeTargetID(M);
120
123 return;
124
126
129 CodeObjectVersion);
130 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
131 }
132
135}
136
138 // Init target streamer if it has not yet happened
140 initTargetStreamer(M);
141
144
145 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
146 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
148 HSAMetadataStream->end();
149 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
150 (void)Success;
151 assert(Success && "Malformed HSA Metadata");
152 }
153}
154
157 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
158 const Function &F = MF->getFunction();
159
160 // TODO: We're checking this late, would be nice to check it earlier.
161 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
163 STM.getCPU() + " is only available on code object version 6 or better",
164 /*gen_crash_diag*/ false);
165 }
166
167 // TODO: Which one is called first, emitStartOfAsmFile or
168 // emitFunctionBodyStart?
169 if (!getTargetStreamer()->getTargetID())
170 initializeTargetID(*F.getParent());
171
172 const auto &FunctionTargetID = STM.getTargetID();
173 // Make sure function's xnack settings are compatible with module's
174 // xnack settings.
175 if (FunctionTargetID.isXnackSupported() &&
176 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
177 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
178 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
179 "' function does not match module xnack setting");
180 return;
181 }
182 // Make sure function's sramecc settings are compatible with module's
183 // sramecc settings.
184 if (FunctionTargetID.isSramEccSupported() &&
185 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
186 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
187 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
188 "' function does not match module sramecc setting");
189 return;
190 }
191
192 if (!MFI.isEntryFunction())
193 return;
194
195 if (STM.isMesaKernel(F) &&
196 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
197 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
198 amd_kernel_code_t KernelCode;
199 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
201 }
202
203 if (STM.isAmdHsaOS())
204 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
205
206 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
209 STM.isAmdHsaOS());
210 }
211}
212
215 if (!MFI.isEntryFunction())
216 return;
217
219 return;
220
221 auto &Streamer = getTargetStreamer()->getStreamer();
222 auto &Context = Streamer.getContext();
223 auto &ObjectFileInfo = *Context.getObjectFileInfo();
224 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
225
226 Streamer.pushSection();
227 Streamer.switchSection(&ReadOnlySection);
228
229 // CP microcode requires the kernel descriptor to be allocated on 64 byte
230 // alignment.
231 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
232 ReadOnlySection.ensureMinAlignment(Align(64));
233
234 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
235
236 SmallString<128> KernelName;
237 getNameWithPrefix(KernelName, &MF->getFunction());
239 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
240 CurrentProgramInfo.NumVGPRsForWavesPerEU,
241 CurrentProgramInfo.NumSGPRsForWavesPerEU -
243 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
244 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
246
247 Streamer.popSection();
248}
249
251 Register RegNo = MI->getOperand(0).getReg();
252
255 OS << "implicit-def: "
256 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
257
258 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
259 OS << " : SGPR spill to VGPR lane";
260
261 OutStreamer->AddComment(OS.str());
262 OutStreamer->addBlankLine();
263}
264
268 return;
269 }
270
272 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
273 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
274 SmallString<128> SymbolName;
275 getNameWithPrefix(SymbolName, &MF->getFunction()),
277 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
278 }
279 if (DumpCodeInstEmitter) {
280 // Disassemble function name label to text.
281 DisasmLines.push_back(MF->getName().str() + ":");
282 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
283 HexLines.push_back("");
284 }
285
287}
288
290 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
291 // Write a line for the basic block label if it is not only fallthrough.
292 DisasmLines.push_back(
293 (Twine("BB") + Twine(getFunctionNumber())
294 + "_" + Twine(MBB.getNumber()) + ":").str());
295 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
296 HexLines.push_back("");
297 }
299}
300
303 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
305 Twine(GV->getName()) +
306 ": unsupported initializer for address space");
307 return;
308 }
309
310 // LDS variables aren't emitted in HSA or PAL yet.
312 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
313 return;
314
315 MCSymbol *GVSym = getSymbol(GV);
316
317 GVSym->redefineIfPossible();
318 if (GVSym->isDefined() || GVSym->isVariable())
319 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
320 "' is already defined");
321
322 const DataLayout &DL = GV->getParent()->getDataLayout();
323 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
324 Align Alignment = GV->getAlign().value_or(Align(4));
325
326 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
327 emitLinkage(GV, GVSym);
328 auto TS = getTargetStreamer();
329 TS->emitAMDGPULDS(GVSym, Size, Alignment);
330 return;
331 }
332
334}
335
337 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
338
340 switch (CodeObjectVersion) {
342 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
343 break;
345 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
346 break;
348 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV6());
349 break;
350 default:
351 report_fatal_error("Unexpected code object version");
352 }
353 }
355}
356
358 // Pad with s_code_end to help tools and guard against instruction prefetch
359 // causing stale data in caches. Arguably this should be done by the linker,
360 // which is why this isn't done for Mesa.
361 const MCSubtargetInfo &STI = *getGlobalSTI();
362 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
365 OutStreamer->switchSection(getObjFileLowering().getTextSection());
367 }
368
370}
371
372// Print comments that apply to both callable functions and entry points.
373void AMDGPUAsmPrinter::emitCommonFunctionComments(
374 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
375 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
376 const AMDGPUMachineFunction *MFI) {
377 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
378 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
379 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
380 if (NumAGPR) {
381 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
382 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
383 false);
384 }
385 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
386 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
387 false);
388}
389
/// Build the kernel code properties bitfield for \p MF.
///
/// Starts from 0 and ORs in one amdhsa::KERNEL_CODE_PROPERTY_* flag for each
/// input that the function's GCNUserSGPRUsageInfo reports as present. The
/// queue pointer flag is additionally gated on CodeObjectVersion being below
/// AMDHSA_COV5, and USES_DYNAMIC_STACK is set only when
/// CurrentProgramInfo.DynamicCallStack is true and CodeObjectVersion is at
/// least AMDHSA_COV5.
///
/// \returns the accumulated flags as a uint16_t.
390uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
391 const MachineFunction &MF) const {
393 uint16_t KernelCodeProperties = 0;
394 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
395
396 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
397 KernelCodeProperties |=
398 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
399 }
400 if (UserSGPRInfo.hasDispatchPtr()) {
401 KernelCodeProperties |=
402 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
403 }
// The queue pointer flag is only emitted for code object versions before V5.
404 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
405 KernelCodeProperties |=
406 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
407 }
408 if (UserSGPRInfo.hasKernargSegmentPtr()) {
409 KernelCodeProperties |=
410 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
411 }
412 if (UserSGPRInfo.hasDispatchID()) {
413 KernelCodeProperties |=
414 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
415 }
416 if (UserSGPRInfo.hasFlatScratchInit()) {
417 KernelCodeProperties |=
418 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
419 }
421 KernelCodeProperties |=
422 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
423 }
424
// The dynamic-stack flag is only emitted from code object V5 onwards.
425 if (CurrentProgramInfo.DynamicCallStack &&
426 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
427 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
428
429 return KernelCodeProperties;
430}
431
433AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
434 const SIProgramInfo &PI) const {
436 const Function &F = MF.getFunction();
438 MCContext &Ctx = MF.getContext();
439
440 MCKernelDescriptor KernelDescriptor;
441
442 assert(isUInt<32>(PI.ScratchSize));
443 assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
444 assert(isUInt<32>(PI.getComputePGMRSrc2()));
445
446 KernelDescriptor.group_segment_fixed_size =
448 KernelDescriptor.private_segment_fixed_size =
450
451 Align MaxKernArgAlign;
452 KernelDescriptor.kernarg_size = MCConstantExpr::create(
453 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
454
455 KernelDescriptor.compute_pgm_rsrc1 =
457 KernelDescriptor.compute_pgm_rsrc2 =
459 KernelDescriptor.kernel_code_properties =
460 MCConstantExpr::create(getAmdhsaKernelCodeProperties(MF), Ctx);
461
462 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
463 KernelDescriptor.compute_pgm_rsrc3 = MCConstantExpr::create(
464 STM.hasGFX90AInsts() ? CurrentProgramInfo.ComputePGMRSrc3GFX90A : 0, Ctx);
465
466 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
467 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
468 Ctx);
469
470 return KernelDescriptor;
471}
472
474 // Init target streamer lazily on the first function so that previous passes
475 // can set metadata.
477 initTargetStreamer(*MF.getFunction().getParent());
478
479 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
480 CurrentProgramInfo = SIProgramInfo();
481
483
484 // The starting address of all shader programs must be 256 bytes aligned.
485 // Regular functions just need the basic required instruction alignment.
486 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
487
489
492 // FIXME: This should be an explicit check for Mesa.
493 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
494 MCSectionELF *ConfigSection =
495 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
496 OutStreamer->switchSection(ConfigSection);
497 }
498
499 if (MFI->isModuleEntryFunction()) {
500 getSIProgramInfo(CurrentProgramInfo, MF);
501 }
502
503 if (STM.isAmdPalOS()) {
504 if (MFI->isEntryFunction())
505 EmitPALMetadata(MF, CurrentProgramInfo);
506 else if (MFI->isModuleEntryFunction())
507 emitPALFunctionMetadata(MF);
508 } else if (!STM.isAmdHsaOS()) {
509 EmitProgramInfoSI(MF, CurrentProgramInfo);
510 }
511
512 DumpCodeInstEmitter = nullptr;
513 if (STM.dumpCode()) {
514 // For -dumpcode, get the assembler out of the streamer, even if it does
515 // not really want to let us have it. This only works with -filetype=obj.
516 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
517 OutStreamer->setUseAssemblerInfoForParsing(true);
518 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
519 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
520 if (Assembler)
521 DumpCodeInstEmitter = Assembler->getEmitterPtr();
522 }
523
524 DisasmLines.clear();
525 HexLines.clear();
527
529
530 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
531 STM.hasMAIInsts());
532
533 if (isVerbose()) {
534 MCSectionELF *CommentSection =
535 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
536 OutStreamer->switchSection(CommentSection);
537
538 if (!MFI->isEntryFunction()) {
539 OutStreamer->emitRawComment(" Function info:", false);
541 ResourceUsage->getResourceInfo(&MF.getFunction());
542 emitCommonFunctionComments(
543 Info.NumVGPR,
544 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
545 Info.getTotalNumVGPRs(STM),
546 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
547 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
548 return false;
549 }
550
551 OutStreamer->emitRawComment(" Kernel info:", false);
552 emitCommonFunctionComments(
553 CurrentProgramInfo.NumArchVGPR,
554 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
555 : std::optional<uint32_t>(),
556 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
557 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
558
559 OutStreamer->emitRawComment(
560 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
561 OutStreamer->emitRawComment(
562 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
563 OutStreamer->emitRawComment(
564 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
565 " bytes/workgroup (compile time only)", false);
566
567 OutStreamer->emitRawComment(
568 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
569 OutStreamer->emitRawComment(
570 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
571
572 OutStreamer->emitRawComment(
573 " NumSGPRsForWavesPerEU: " +
574 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
575 OutStreamer->emitRawComment(
576 " NumVGPRsForWavesPerEU: " +
577 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
578
579 if (STM.hasGFX90AInsts())
580 OutStreamer->emitRawComment(
581 " AccumOffset: " +
582 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
583
584 OutStreamer->emitRawComment(
585 " Occupancy: " +
586 Twine(CurrentProgramInfo.Occupancy), false);
587
588 OutStreamer->emitRawComment(
589 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
590
591 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
592 Twine(CurrentProgramInfo.ScratchEnable),
593 false);
594 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
595 Twine(CurrentProgramInfo.UserSGPR),
596 false);
597 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
598 Twine(CurrentProgramInfo.TrapHandlerEnable),
599 false);
600 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
601 Twine(CurrentProgramInfo.TGIdXEnable),
602 false);
603 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
604 Twine(CurrentProgramInfo.TGIdYEnable),
605 false);
606 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
607 Twine(CurrentProgramInfo.TGIdZEnable),
608 false);
609 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
610 Twine(CurrentProgramInfo.TIdIGCompCount),
611 false);
612
613 assert(STM.hasGFX90AInsts() ||
614 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
615 if (STM.hasGFX90AInsts()) {
616 OutStreamer->emitRawComment(
617 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
618 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
619 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
620 false);
621 OutStreamer->emitRawComment(
622 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
623 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
624 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
625 false);
626 }
627 }
628
629 if (DumpCodeInstEmitter) {
630
631 OutStreamer->switchSection(
632 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
633
634 for (size_t i = 0; i < DisasmLines.size(); ++i) {
635 std::string Comment = "\n";
636 if (!HexLines[i].empty()) {
637 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
638 Comment += " ; " + HexLines[i] + "\n";
639 }
640
641 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
642 OutStreamer->emitBytes(StringRef(Comment));
643 }
644 }
645
646 return false;
647}
648
649// TODO: Fold this into emitFunctionBodyStart.
/// Resolve the xnack and sramecc settings of the target streamer's target ID
/// using the functions of \p M.
///
/// For an empty module nothing beyond the initial setup is done. Otherwise
/// the functions are visited in order: for each feature the target ID reports
/// as supported and whose setting is still 'Any', the setting from the
/// subtarget's target ID (STM.getTargetID()) is copied in. The walk stops as
/// soon as every supported feature is reported as On-or-Off.
650void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
651 // In the beginning all features are either 'Any' or 'NotSupported',
652 // depending on global target features. This will cover empty modules.
654 getGlobalSTI()->getFeatureString());
655
656 // If module is empty, we are done.
657 if (M.empty())
658 return;
659
660 // If module is not empty, need to find first 'Off' or 'On' feature
661 // setting per feature from functions in module.
662 for (auto &F : M) {
663 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop early once neither feature can still be refined: each one is either
// unsupported or already On/Off.
664 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
665 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
666 break;
667
669 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only a setting that is still 'Any' is overwritten, so a value chosen by an
// earlier function is never replaced by a later one.
670 if (TSTargetID->isXnackSupported())
671 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
672 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
673 if (TSTargetID->isSramEccSupported())
674 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
675 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
676 }
677}
678
/// Compute the code size of \p MF in bytes.
///
/// Sums TII->getInstSizeInBytes(MI) over every instruction of every basic
/// block in \p MF, skipping instructions for which isDebugInstr() is true.
///
/// \returns the accumulated size; 0 if no instruction is counted.
679uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
681 const SIInstrInfo *TII = STM.getInstrInfo();
682
683 uint64_t CodeSize = 0;
684
685 for (const MachineBasicBlock &MBB : MF) {
686 for (const MachineInstr &MI : MBB) {
687 // TODO: CodeSize should account for multiple functions.
688
689 // TODO: Should we count size of debug info?
690 if (MI.isDebugInstr())
691 continue;
692
693 CodeSize += TII->getInstSizeInBytes(MI);
694 }
695 }
696
697 return CodeSize;
698}
699
700void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
701 const MachineFunction &MF) {
703 ResourceUsage->getResourceInfo(&MF.getFunction());
705
706 ProgInfo.NumArchVGPR = Info.NumVGPR;
707 ProgInfo.NumAccVGPR = Info.NumAGPR;
708 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
709 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
710 ProgInfo.TgSplit = STM.isTgSplitEnabled();
711 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
712 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
713 ProgInfo.VCCUsed = Info.UsesVCC;
714 ProgInfo.FlatUsed = Info.UsesFlatScratch;
715 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
716
717 const uint64_t MaxScratchPerWorkitem =
719 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
720 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
721 ProgInfo.ScratchSize,
722 MaxScratchPerWorkitem, DS_Error);
723 MF.getFunction().getContext().diagnose(DiagStackSize);
724 }
725
727
728 // The calculations related to SGPR/VGPR blocks are
729 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
730 // unified.
731 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
732 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
733 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
734
735 // Check the addressable register limit before we add ExtraSGPRs.
737 !STM.hasSGPRInitBug()) {
738 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
739 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
740 // This can happen due to a compiler bug or when using inline asm.
743 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
744 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
745 Ctx.diagnose(Diag);
746 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
747 }
748 }
749
750 // Account for extra SGPRs and VGPRs reserved for debugger use.
751 ProgInfo.NumSGPR += ExtraSGPRs;
752
753 const Function &F = MF.getFunction();
754
755 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
756 // dispatch registers are function args.
757 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
758
759 if (isShader(F.getCallingConv())) {
760 bool IsPixelShader =
761 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
762
763 // Calculate the number of VGPR registers based on the SPI input registers
764 uint32_t InputEna = 0;
765 uint32_t InputAddr = 0;
766 unsigned LastEna = 0;
767
768 if (IsPixelShader) {
769 // Note for IsPixelShader:
770 // By this stage, all enabled inputs are tagged in InputAddr as well.
771 // We will use InputAddr to determine whether the input counts against the
772 // vgpr total and only use the InputEnable to determine the last input
773 // that is relevant - if extra arguments are used, then we have to honour
774 // the InputAddr for any intermediate non-enabled inputs.
775 InputEna = MFI->getPSInputEnable();
776 InputAddr = MFI->getPSInputAddr();
777
778 // We only need to consider input args up to the last used arg.
779 assert((InputEna || InputAddr) &&
780 "PSInputAddr and PSInputEnable should "
781 "never both be 0 for AMDGPU_PS shaders");
782 // There are some rare circumstances where InputAddr is non-zero and
783 // InputEna can be set to 0. In this case we default to setting LastEna
784 // to 1.
785 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
786 }
787
788 // FIXME: We should be using the number of registers determined during
789 // calling convention lowering to legalize the types.
790 const DataLayout &DL = F.getParent()->getDataLayout();
791 unsigned PSArgCount = 0;
792 unsigned IntermediateVGPR = 0;
793 for (auto &Arg : F.args()) {
794 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
795 if (Arg.hasAttribute(Attribute::InReg)) {
796 WaveDispatchNumSGPR += NumRegs;
797 } else {
798 // If this is a PS shader and we're processing the PS Input args (first
799 // 16 VGPR), use the InputEna and InputAddr bits to define how many
800 // VGPRs are actually used.
801 // Any extra VGPR arguments are handled as normal arguments (and
802 // contribute to the VGPR count whether they're used or not).
803 if (IsPixelShader && PSArgCount < 16) {
804 if ((1 << PSArgCount) & InputAddr) {
805 if (PSArgCount < LastEna)
806 WaveDispatchNumVGPR += NumRegs;
807 else
808 IntermediateVGPR += NumRegs;
809 }
810 PSArgCount++;
811 } else {
812 // If there are extra arguments we have to include the allocation for
813 // the non-used (but enabled with InputAddr) input arguments
814 if (IntermediateVGPR) {
815 WaveDispatchNumVGPR += IntermediateVGPR;
816 IntermediateVGPR = 0;
817 }
818 WaveDispatchNumVGPR += NumRegs;
819 }
820 }
821 }
822 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
823 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
824 ProgInfo.NumVGPR =
825 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
826 }
827
828 // Adjust number of registers used to meet default/requested minimum/maximum
829 // number of waves per execution unit request.
830 ProgInfo.NumSGPRsForWavesPerEU = std::max(
831 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
832 ProgInfo.NumVGPRsForWavesPerEU = std::max(
833 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
834
836 STM.hasSGPRInitBug()) {
837 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
838 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
839 // This can happen due to a compiler bug or when using inline asm to use
840 // the registers which are usually reserved for vcc etc.
842 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
843 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
845 Ctx.diagnose(Diag);
846 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
847 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
848 }
849 }
850
851 if (STM.hasSGPRInitBug()) {
852 ProgInfo.NumSGPR =
854 ProgInfo.NumSGPRsForWavesPerEU =
856 }
857
858 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
860 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
861 MFI->getNumUserSGPRs(),
863 Ctx.diagnose(Diag);
864 }
865
866 if (MFI->getLDSSize() >
867 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
870 MF.getFunction(), "local memory", MFI->getLDSSize(),
872 Ctx.diagnose(Diag);
873 }
874
876 &STM, ProgInfo.NumSGPRsForWavesPerEU);
877 ProgInfo.VGPRBlocks =
879
880 const SIModeRegisterDefaults Mode = MFI->getMode();
881
882 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
883 // register.
884 ProgInfo.FloatMode = getFPMode(Mode);
885
886 ProgInfo.IEEEMode = Mode.IEEE;
887
888 // Make clamp modifier on NaN input returns 0.
889 ProgInfo.DX10Clamp = Mode.DX10Clamp;
890
891 unsigned LDSAlignShift;
893 // LDS is allocated in 64 dword blocks.
894 LDSAlignShift = 8;
895 } else {
896 // LDS is allocated in 128 dword blocks.
897 LDSAlignShift = 9;
898 }
899
900 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
901 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
902
903 ProgInfo.LDSSize = MFI->getLDSSize();
904 ProgInfo.LDSBlocks =
905 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
906
907 // Scratch is allocated in 64-dword or 256-dword blocks.
908 unsigned ScratchAlignShift =
909 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
910 // We need to program the hardware with the amount of scratch memory that
911 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
912 // scratch memory used per thread.
913 ProgInfo.ScratchBlocks = divideCeil(
914 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
915
916 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
917 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
918 ProgInfo.MemOrdered = 1;
919 }
920
921 // 0 = X, 1 = XY, 2 = XYZ
922 unsigned TIDIGCompCnt = 0;
923 if (MFI->hasWorkItemIDZ())
924 TIDIGCompCnt = 2;
925 else if (MFI->hasWorkItemIDY())
926 TIDIGCompCnt = 1;
927
928 // The private segment wave byte offset is the last of the system SGPRs. We
929 // initially assumed it was allocated, and may have used it. It shouldn't harm
930 // anything to disable it if we know the stack isn't used here. We may still
931 // have emitted code reading it to initialize scratch, but if that's unused
932 // reading garbage should be OK.
933 ProgInfo.ScratchEnable =
934 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
935 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
936 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
937 ProgInfo.TrapHandlerEnable =
938 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
939 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
940 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
941 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
942 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
943 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
944 ProgInfo.EXCPEnMSB = 0;
945 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
946 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
947 ProgInfo.EXCPEnable = 0;
948
949 if (STM.hasGFX90AInsts()) {
951 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
952 ProgInfo.AccumOffset);
954 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
955 ProgInfo.TgSplit);
956 }
957
958 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
959 ProgInfo.NumSGPRsForWavesPerEU,
960 ProgInfo.NumVGPRsForWavesPerEU);
961 const auto [MinWEU, MaxWEU] =
962 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
963 if (ProgInfo.Occupancy < MinWEU) {
965 F, F.getSubprogram(),
966 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
967 "'" +
968 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
969 ", final occupancy is " + Twine(ProgInfo.Occupancy));
970 F.getContext().diagnose(Diag);
971 }
972}
973
974static unsigned getRsrcReg(CallingConv::ID CallConv) {
975 switch (CallConv) {
976 default: [[fallthrough]];
984 }
985}
986
// Writes the program-info values for \p MF to OutStreamer as a sequence of
// 32-bit words: the PGM_RSRC1/RSRC2 words, the wave scratch size, LDS and
// pixel-shader input values, and finally the spilled SGPR/VGPR counts.
//
// NOTE(review): several source lines are absent from this listing (989-990,
// 993-994, 998, 1001, 1015, 1024-1025, 1030, 1032). That includes the
// declarations of MFI and STM and the conditions that open the two `if`
// blocks below, so which calling conventions take which branch cannot be read
// off from here.
987 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
988 const SIProgramInfo &CurrentProgramInfo) {
991 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
992
995
996 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
997
999 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
1000
// Wave scratch size: the encoding macro depends on the hardware generation
// (GFX12 and later, exactly GFX11, or anything earlier).
1002 OutStreamer->emitInt32(
1003 STM.getGeneration() >= AMDGPUSubtarget::GFX12
1004 ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
1005 : STM.getGeneration() == AMDGPUSubtarget::GFX11
1006 ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
1007 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1008
1009 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1010 // 0" comment but I don't see a corresponding field in the register spec.
1011 } else {
// This branch emits the register id chosen by getRsrcReg, followed by a
// 4-byte value combining the VGPR and SGPR block counts.
1012 OutStreamer->emitInt32(RsrcReg);
1013 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
1014 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
// Same generation-based selection as above, using the S_0286E8_* encodings.
1016 OutStreamer->emitInt32(
1017 STM.getGeneration() >= AMDGPUSubtarget::GFX12
1018 ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
1019 : STM.getGeneration() == AMDGPUSubtarget::GFX11
1020 ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
1021 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
1022 }
1023
// From GFX11 on, the LDS block count is halved (rounded up) before being
// encoded as the extra LDS size; earlier generations use it unchanged.
1026 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1027 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1028 : CurrentProgramInfo.LDSBlocks;
1029 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1031 OutStreamer->emitInt32(MFI->getPSInputEnable());
1033 OutStreamer->emitInt32(MFI->getPSInputAddr());
1034 }
1035
// Spill statistics are written as (identifier, count) pairs.
1036 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1037 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1038 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1039 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1040 }
1041
1042// Helper function to add common PAL Metadata 3.0+
1044 const SIProgramInfo &CurrentProgramInfo,
1045 CallingConv::ID CC, const GCNSubtarget &ST) {
1046 if (ST.hasIEEEMode())
1047 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1048
1049 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1050 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1051
1052 if (AMDGPU::isCompute(CC)) {
1053 MD->setHwStage(CC, ".trap_present",
1054 (bool)CurrentProgramInfo.TrapHandlerEnable);
1055 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1056
1057 MD->setHwStage(CC, ".lds_size",
1058 (unsigned)(CurrentProgramInfo.LdsSize *
1059 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1060 }
1061}
1062
1063 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1064 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1065 // metadata items into the PALMD::Metadata, combining with any provided by the
1066 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1067 // is then written as a single block in the .note section.
//
// Two output shapes are produced depending on MD->getPALMajorVersion():
// below 3 the values go through the setRsrc1/setRsrc2/setSpiPsInput* calls,
// otherwise they are recorded as named hardware-stage / graphics-register
// fields.
//
// NOTE(review): source lines 1070, 1074 and 1100 are absent from this listing.
// The declaration of MFI and the statement that opens the block closed at
// source line 1135 are therefore not visible.
1068 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1069 const SIProgramInfo &CurrentProgramInfo) {
1071 auto CC = MF.getFunction().getCallingConv();
1072 auto MD = getTargetStreamer()->getPALMetadata();
1073
1075 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1076
1077 // Only set AGPRs for supported devices
1078 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1079 if (STM.hasMAIInsts()) {
1080 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1081 }
1082
1083 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1084 if (MD->getPALMajorVersion() < 3) {
1085 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
1086 if (AMDGPU::isCompute(CC)) {
1087 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1088 } else {
// For non-compute conventions only the scratch-enable value is set, and only
// when scratch is actually used.
1089 if (CurrentProgramInfo.ScratchBlocks > 0)
1090 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1091 }
1092 } else {
1093 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1094 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1095 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1096 }
1097
1098 // ScratchSize is in bytes, 16 aligned.
1099 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
// From GFX11 on, the LDS block count is halved (rounded up); earlier
// generations use it unchanged.
1101 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1102 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1103 : CurrentProgramInfo.LDSBlocks;
1104 if (MD->getPALMajorVersion() < 3) {
1105 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1106 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1107 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1108 } else {
1109 // Graphics registers
// The granularity constant matches the halving above: 256 on GFX11+, 128
// before. The product is additionally scaled by sizeof(uint32_t).
1110 const unsigned ExtraLdsDwGranularity =
1111 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1112 MD->setGraphicsRegisters(
1113 ".ps_extra_lds_size",
1114 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1115
1116 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// Field names are listed in bit order: entry Idx is reported from bit Idx of
// the PS input enable / address values in the loop below.
1117 static StringLiteral const PsInputFields[] = {
1118 ".persp_sample_ena", ".persp_center_ena",
1119 ".persp_centroid_ena", ".persp_pull_model_ena",
1120 ".linear_sample_ena", ".linear_center_ena",
1121 ".linear_centroid_ena", ".line_stipple_tex_ena",
1122 ".pos_x_float_ena", ".pos_y_float_ena",
1123 ".pos_z_float_ena", ".pos_w_float_ena",
1124 ".front_face_ena", ".ancillary_ena",
1125 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1126 unsigned PSInputEna = MFI->getPSInputEnable();
1127 unsigned PSInputAddr = MFI->getPSInputAddr();
1128 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1129 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1130 (bool)((PSInputEna >> Idx) & 1));
1131 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1132 (bool)((PSInputAddr >> Idx) & 1));
1133 }
1134 }
1135 }
1136
1137 // For version 3 and above the wave front size is already set in the metadata
1138 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1139 MD->setWave32(MF.getFunction().getCallingConv());
1140 }
1141
// Records per-function PAL metadata for \p MF, keyed by the function name:
// the stack size as the function scratch size, the compute register /
// hardware-stage settings (always under CallingConv::AMDGPU_CS), and the LDS
// size and VGPR/SGPR counts.
//
// NOTE(review): source line 1147 is absent from this listing, so the
// declaration of ST is not visible. CurrentProgramInfo is not a parameter
// here; presumably it is a member of AMDGPUAsmPrinter -- confirm in the header.
1142 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1143 auto *MD = getTargetStreamer()->getPALMetadata();
1144 const MachineFrameInfo &MFI = MF.getFrameInfo();
1145 StringRef FnName = MF.getFunction().getName();
1146 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1148
// Same PAL-version split as EmitPALMetadata: raw RSRC1/RSRC2 values below
// version 3, named hardware-stage fields otherwise.
1149 if (MD->getPALMajorVersion() < 3) {
1150 // Set compute registers
1151 MD->setRsrc1(CallingConv::AMDGPU_CS,
1152 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
1153 MD->setRsrc2(CallingConv::AMDGPU_CS,
1154 CurrentProgramInfo.getComputePGMRSrc2());
1155 } else {
1156 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1157 }
1158
1159 // Set optional info
1160 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1161 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1162 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1163 }
1164
1165// This is supposed to be log2(Size)
1167 switch (Size) {
1168 case 4:
1169 return AMD_ELEMENT_4_BYTES;
1170 case 8:
1171 return AMD_ELEMENT_8_BYTES;
1172 case 16:
1173 return AMD_ELEMENT_16_BYTES;
1174 default:
1175 llvm_unreachable("invalid private_element_size");
1176 }
1177}
1178
// Fills \p Out (an amd_kernel_code_t header) from \p CurrentProgramInfo and
// the user-SGPR information of \p MF. The function asserts that it is only
// called for AMDGPU_KERNEL or SPIR_KERNEL calling conventions.
//
// NOTE(review): many source lines are absent from this listing (1186, 1189,
// 1191, 1194, 1197, 1199-1200, 1206, 1210, 1213, 1216, 1219, 1222, 1225,
// 1228). As a result the declaration of MFI, the left-hand side of the
// RSRC1/RSRC2 assignment and the statement guarded by each `if` below are not
// visible.
1179 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1180 const SIProgramInfo &CurrentProgramInfo,
1181 const MachineFunction &MF) const {
1182 const Function &F = MF.getFunction();
1183 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1184 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1185
1187 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1188
1190
// RSRC1 occupies the low 32 bits and RSRC2 the high 32 bits of the combined
// value built here.
1192 CurrentProgramInfo.getComputePGMRSrc1(STM) |
1193 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1195
1196 if (CurrentProgramInfo.DynamicCallStack)
1198
1201 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1202
1203 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1204 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1205 Out.code_properties |=
1207 }
1208
1209 if (UserSGPRInfo.hasDispatchPtr())
1211
// The queue pointer is only considered for code object versions before V5.
1212 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1214
1215 if (UserSGPRInfo.hasKernargSegmentPtr())
1217
1218 if (UserSGPRInfo.hasDispatchID())
1220
1221 if (UserSGPRInfo.hasFlatScratchInit())
1223
// NOTE(review): hasDispatchPtr() is already tested at source line 1209. The
// guarded statements are missing here, so confirm whether this second check
// is redundant.
1224 if (UserSGPRInfo.hasDispatchPtr())
1226
1227 if (STM.isXNACKEnabled())
1229
// MaxKernArgAlign is an out-parameter of getKernArgSegmentSize and is used
// for the alignment computation at the end of the function.
1230 Align MaxKernArgAlign;
1231 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1232 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1233 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1234 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1235 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1236
1237 // kernarg_segment_alignment is specified as log of the alignment.
1238 // The minimum alignment is 16.
1239 // FIXME: The metadata treats the minimum as 4?
1240 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1241 }
1242
// Prints operand \p OpNo of the inline-asm instruction \p MI to \p O.
// Returns false when the operand was printed and true on failure (unknown
// modifier or unsupported operand kind).
//
// NOTE(review): source lines 1243, 1264-1265 and 1269 are absent from this
// listing: the first line of the signature, the body of the register branch,
// and the condition under which the immediate is printed in decimal.
1244 const char *ExtraCode, raw_ostream &O) {
1245 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1246 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1247 return false;
1248
// Only a single-character modifier is accepted, and 'r' is the only one
// recognised here.
1249 if (ExtraCode && ExtraCode[0]) {
1250 if (ExtraCode[1] != 0)
1251 return true; // Unknown modifier.
1252
1253 switch (ExtraCode[0]) {
1254 case 'r':
1255 break;
1256 default:
1257 return true;
1258 }
1259 }
1260
1261 // TODO: Should be able to support other operand types like globals.
1262 const MachineOperand &MO = MI->getOperand(OpNo);
1263 if (MO.isReg()) {
1266 return false;
1267 } else if (MO.isImm()) {
1268 int64_t Val = MO.getImm();
// Immediates not printed in decimal are printed in hex, using the narrowest
// of 16, 32 or 64 bits that holds the value as unsigned.
1270 O << Val;
1271 } else if (isUInt<16>(Val)) {
1272 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1273 } else if (isUInt<32>(Val)) {
1274 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1275 } else {
1276 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1277 }
1278 return false;
1279 }
1280 return true;
1281 }
1282
1287}
1288
// Emits one optimization-analysis remark per resource statistic of \p MF
// (function name, SGPRs, VGPRs, optionally AGPRs, scratch size, dynamic stack,
// occupancy, spills, and optionally LDS size) through ORE. Does nothing when
// no remark emitter is available.
//
// \p isModuleEntryFunction gates the LDS-size remark; \p hasMAIInsts gates the
// AGPR remark.
//
// NOTE(review): source lines 1299-1300 and 1314 are absent from this listing:
// the condition guarding the second early return and one argument line of the
// MachineOptimizationRemarkAnalysis constructor call.
1289 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1290 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1291 bool isModuleEntryFunction, bool hasMAIInsts) {
1292 if (!ORE)
1293 return;
1294
1295 const char *Name = "kernel-resource-usage";
1296 const char *Indent = " ";
1297
1298 // If the remark is not specifically enabled, do not output to yaml
1301 return;
1302
// Helper that emits a single "<label>: <value>" remark anchored at the first
// basic block of MF.
1303 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1304 StringRef RemarkLabel, auto Argument) {
1305 // Add an indent for every line besides the line with the kernel name. This
1306 // makes it easier to tell which resource usages go with which kernel since
1307 // the kernel name will always be displayed first.
1308 std::string LabelStr = RemarkLabel.str() + ": ";
1309 if (!RemarkName.equals("FunctionName"))
1310 LabelStr = Indent + LabelStr;
1311
1312 ORE->emit([&]() {
1313 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1315 &MF.front())
1316 << LabelStr << ore::NV(RemarkName, Argument);
1317 });
1318 };
1319
1320 // FIXME: Formatting here is pretty nasty because clang does not accept
1321 // newlines from diagnostics. This forces us to emit multiple diagnostic
1322 // remarks to simulate newlines. If and when clang does accept newlines, this
1323 // formatting should be aggregated into one remark with newlines to avoid
1324 // printing multiple diagnostic location and diag opts.
1325 EmitResourceUsageRemark("FunctionName", "Function Name",
1326 MF.getFunction().getName());
1327 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
// The "VGPRs" remark reports NumArchVGPR; accumulator registers are reported
// separately below when hasMAIInsts is set.
1328 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1329 if (hasMAIInsts)
1330 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1331 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1332 CurrentProgramInfo.ScratchSize);
1333 StringRef DynamicStackStr =
1334 CurrentProgramInfo.DynamicCallStack ? "True" : "False";
1335 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1336 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1337 CurrentProgramInfo.Occupancy);
1338 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1339 CurrentProgramInfo.SGPRSpill);
1340 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1341 CurrentProgramInfo.VGPRSpill);
1342 if (isModuleEntryFunction)
1343 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1344 CurrentProgramInfo.LDSSize);
1345 }
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1049
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1188
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1187
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:1083
#define S_0286E8_WAVESIZE_GFX11(x)
Definition: SIDefines.h:1189
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1169
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1161
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1120
#define S_00B860_WAVESIZE_GFX12Plus(x)
Definition: SIDefines.h:1185
#define S_0286E8_WAVESIZE_GFX12Plus(x)
Definition: SIDefines.h:1190
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1182
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1072
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1071
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:1051
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1080
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1119
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1183
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:1050
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1058
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1180
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1122
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1201
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1168
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1179
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1063
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1202
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1057
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1082
#define S_00B860_WAVESIZE_GFX11(x)
Definition: SIDefines.h:1184
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1056
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:398
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:700
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:722
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:449
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:655
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:440
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:394
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:265
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:695
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1831
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:350
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:801
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:251
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:594
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:598
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:287
bool dumpCode() const
Definition: GCNSubtarget.h:498
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:586
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:926
Generation getGeneration() const
Definition: GCNSubtarget.h:302
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:306
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:274
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:330
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1064
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:287
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:849
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:370
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1063
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1336
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1689
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2415
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1858
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in bytes.
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:27
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:73
uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const
Compute the value of the ComputePGMRsrc1 register.
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:49
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:76
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:59
uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.