LLVM 18.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
22#include "AMDKernelCodeT.h"
23#include "GCNSubtarget.h"
26#include "R600AsmPrinter.h"
35#include "llvm/MC/MCAssembler.h"
36#include "llvm/MC/MCContext.h"
38#include "llvm/MC/MCStreamer.h"
44
45using namespace llvm;
46using namespace llvm::AMDGPU;
47
48// This should get the default rounding mode from the kernel. We just set the
49// default here, but this could change if the OpenCL rounding mode pragmas are
50// used.
51//
52// The denormal mode here should match what is reported by the OpenCL runtime
53// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
54// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
55//
56// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
57// precision, and leaves single precision to flush all and does not report
58// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
59// CL_FP_DENORM for both.
60//
61// FIXME: It seems some instructions do not support single precision denormals
62// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
63// and sin_f32, cos_f32 on most parts).
64
65// We want to use these instructions, and using fp32 denormals also causes
66// instructions to run at the double precision rate for the device so it's
67// probably best to just report no single precision denormals.
71 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
72 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
73}
74
75static AsmPrinter *
77 std::unique_ptr<MCStreamer> &&Streamer) {
78 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
79}
80
86}
87
89 std::unique_ptr<MCStreamer> Streamer)
90 : AsmPrinter(TM, std::move(Streamer)) {
91 assert(OutStreamer && "AsmPrinter constructed without streamer");
92}
93
95 return "AMDGPU Assembly Printer";
96}
97
99 return TM.getMCSubtargetInfo();
100}
101
103 if (!OutStreamer)
104 return nullptr;
105 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
106}
107
110}
111
112void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
114
115 // TODO: Which one is called first, emitStartOfAsmFile or
116 // emitFunctionBodyStart?
117 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
118 initializeTargetID(M);
119
122 return;
123
125
127 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
128
131}
132
134 // Init target streamer if it has not yet happened
136 initTargetStreamer(M);
137
140
141 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
142 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
144 HSAMetadataStream->end();
145 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
146 (void)Success;
147 assert(Success && "Malformed HSA Metadata");
148 }
149}
150
153 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
154 const Function &F = MF->getFunction();
155
156 // TODO: Which one is called first, emitStartOfAsmFile or
157 // emitFunctionBodyStart?
159 initializeTargetID(*F.getParent());
160
161 const auto &FunctionTargetID = STM.getTargetID();
162 // Make sure function's xnack settings are compatible with module's
163 // xnack settings.
164 if (FunctionTargetID.isXnackSupported() &&
165 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
166 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
167 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
168 "' function does not match module xnack setting");
169 return;
170 }
171 // Make sure function's sramecc settings are compatible with module's
172 // sramecc settings.
173 if (FunctionTargetID.isSramEccSupported() &&
174 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
175 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
176 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
177 "' function does not match module sramecc setting");
178 return;
179 }
180
181 if (!MFI.isEntryFunction())
182 return;
183
184 if (STM.isMesaKernel(F) &&
185 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
186 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
187 amd_kernel_code_t KernelCode;
188 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
190 }
191
192 if (STM.isAmdHsaOS())
193 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
194
195 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
198 }
199}
200
203 if (!MFI.isEntryFunction())
204 return;
205
207 return;
208
209 auto &Streamer = getTargetStreamer()->getStreamer();
210 auto &Context = Streamer.getContext();
211 auto &ObjectFileInfo = *Context.getObjectFileInfo();
212 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
213
214 Streamer.pushSection();
215 Streamer.switchSection(&ReadOnlySection);
216
217 // CP microcode requires the kernel descriptor to be allocated on 64 byte
218 // alignment.
219 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
220 ReadOnlySection.ensureMinAlignment(Align(64));
221
222 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
223
224 SmallString<128> KernelName;
225 getNameWithPrefix(KernelName, &MF->getFunction());
227 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
228 CurrentProgramInfo.NumVGPRsForWavesPerEU,
229 CurrentProgramInfo.NumSGPRsForWavesPerEU -
231 &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
232 getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
233 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
234 CodeObjectVersion);
235
236 Streamer.popSection();
237}
238
240 Register RegNo = MI->getOperand(0).getReg();
241
244 OS << "implicit-def: "
245 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
246
247 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
248 OS << " : SGPR spill to VGPR lane";
249
250 OutStreamer->AddComment(OS.str());
251 OutStreamer->addBlankLine();
252}
253
257 return;
258 }
259
261 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
262 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
263 SmallString<128> SymbolName;
264 getNameWithPrefix(SymbolName, &MF->getFunction()),
266 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
267 }
268 if (DumpCodeInstEmitter) {
269 // Disassemble function name label to text.
270 DisasmLines.push_back(MF->getName().str() + ":");
271 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
272 HexLines.push_back("");
273 }
274
276}
277
279 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
280 // Write a line for the basic block label if it is not only fallthrough.
281 DisasmLines.push_back(
282 (Twine("BB") + Twine(getFunctionNumber())
283 + "_" + Twine(MBB.getNumber()) + ":").str());
284 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
285 HexLines.push_back("");
286 }
288}
289
292 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
294 Twine(GV->getName()) +
295 ": unsupported initializer for address space");
296 return;
297 }
298
299 // LDS variables aren't emitted in HSA or PAL yet.
301 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
302 return;
303
304 MCSymbol *GVSym = getSymbol(GV);
305
306 GVSym->redefineIfPossible();
307 if (GVSym->isDefined() || GVSym->isVariable())
308 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
309 "' is already defined");
310
311 const DataLayout &DL = GV->getParent()->getDataLayout();
312 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
313 Align Alignment = GV->getAlign().value_or(Align(4));
314
315 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
316 emitLinkage(GV, GVSym);
317 auto TS = getTargetStreamer();
318 TS->emitAMDGPULDS(GVSym, Size, Alignment);
319 return;
320 }
321
323}
324
326 CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
327
329 switch (CodeObjectVersion) {
331 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
332 break;
334 HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
335 break;
336 default:
337 report_fatal_error("Unexpected code object version");
338 }
339 }
341}
342
344 // Pad with s_code_end to help tools and guard against instruction prefetch
345 // causing stale data in caches. Arguably this should be done by the linker,
346 // which is why this isn't done for Mesa.
347 const MCSubtargetInfo &STI = *getGlobalSTI();
348 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
351 OutStreamer->switchSection(getObjFileLowering().getTextSection());
353 }
354
356}
357
358// Print comments that apply to both callable functions and entry points.
359void AMDGPUAsmPrinter::emitCommonFunctionComments(
360 uint32_t NumVGPR, std::optional<uint32_t> NumAGPR, uint32_t TotalNumVGPR,
361 uint32_t NumSGPR, uint64_t ScratchSize, uint64_t CodeSize,
362 const AMDGPUMachineFunction *MFI) {
363 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
364 OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
365 OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
366 if (NumAGPR) {
367 OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
368 OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
369 false);
370 }
371 OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
372 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
373 false);
374}
375
/// Computes the kernel code properties bit mask for \p MF.
///
/// Starts from 0 and ORs in one amdhsa::KERNEL_CODE_PROPERTY_* flag for each
/// user SGPR input reported by MFI.getUserSGPRInfo(), followed by the
/// wavefront-size flag and the dynamic-stack flag.
///
/// \param MF the machine function being queried.
/// \returns the accumulated 16-bit property mask.
///
/// NOTE(review): this listing is missing source lines 378 and 406; the
/// declaration of `MFI` and the condition guarding the wavefront-size flag
/// are therefore not visible here.
376uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
377 const MachineFunction &MF) const {
379 uint16_t KernelCodeProperties = 0;
380 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
381
382 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
383 KernelCodeProperties |=
384 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
385 }
386 if (UserSGPRInfo.hasDispatchPtr()) {
387 KernelCodeProperties |=
388 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
389 }
    // The queue pointer flag is only set for code object versions below
    // AMDHSA_COV5, even when the function reports a queue pointer.
390 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
391 KernelCodeProperties |=
392 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
393 }
394 if (UserSGPRInfo.hasKernargSegmentPtr()) {
395 KernelCodeProperties |=
396 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
397 }
398 if (UserSGPRInfo.hasDispatchID()) {
399 KernelCodeProperties |=
400 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
401 }
402 if (UserSGPRInfo.hasFlatScratchInit()) {
403 KernelCodeProperties |=
404 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
405 }
    // NOTE(review): the `if` that opens this block (source line 406) is not
    // present in this listing; presumably it tests for wave32 - confirm.
407 KernelCodeProperties |=
408 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
409 }
410
    // The dynamic-stack flag requires both a dynamic call stack in the
    // current program info and code object version AMDHSA_COV5 or newer.
411 if (CurrentProgramInfo.DynamicCallStack &&
412 CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
413 KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
414
415 return KernelCodeProperties;
416}
417
/// Builds an amdhsa::kernel_descriptor_t for \p MF.
///
/// The descriptor is zero-filled and then populated with the LDS and scratch
/// sizes and the two resource words taken from \p PI, the kernarg segment
/// size reported by the subtarget, the kernel code properties of \p MF, and
/// (only when the subtarget has GFX90A instructions) compute_pgm_rsrc3.
///
/// \param MF the machine function the descriptor describes.
/// \param PI program info supplying sizes and the PGM_RSRC1/2 words.
/// \returns the populated descriptor by value.
///
/// NOTE(review): this listing is missing source lines 421, 423 and 447; the
/// declarations of `STM` and `Info` are therefore not visible here.
418amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
419 const MachineFunction &MF,
420 const SIProgramInfo &PI) const {
422 const Function &F = MF.getFunction();
424
425 amdhsa::kernel_descriptor_t KernelDescriptor;
    // Zero every byte first so fields that are not assigned below stay 0.
426 memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
427
    // Sanity-check that these values fit in 32 bits before they are stored
    // in the descriptor.
428 assert(isUInt<32>(PI.ScratchSize));
429 assert(isUInt<32>(PI.getComputePGMRSrc1()));
430 assert(isUInt<32>(PI.getComputePGMRSrc2()));
431
432 KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
433 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
434
    // MaxKernArgAlign is handed to getKernArgSegmentSize but is not read
    // again in this function; only the returned size is used.
435 Align MaxKernArgAlign;
436 KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
437
438 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
439 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
440 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
441
    // compute_pgm_rsrc3 is only written for subtargets with GFX90A
    // instructions; the assert checks the source value is 0 otherwise.
    // NOTE(review): this reads the member CurrentProgramInfo rather than the
    // PI parameter used above - confirm both always refer to the same data.
442 assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
443 if (STM.hasGFX90AInsts())
444 KernelDescriptor.compute_pgm_rsrc3 =
445 CurrentProgramInfo.ComputePGMRSrc3GFX90A;
446
    // The preloaded-kernarg SGPR count is narrowed to 16 bits on store.
448 KernelDescriptor.kernarg_preload =
449 static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
450
451 return KernelDescriptor;
452}
453
455 // Init target streamer lazily on the first function so that previous passes
456 // can set metadata.
458 initTargetStreamer(*MF.getFunction().getParent());
459
460 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
461 CurrentProgramInfo = SIProgramInfo();
462
464
465 // The starting address of all shader programs must be 256 bytes aligned.
466 // Regular functions just need the basic required instruction alignment.
467 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
468
470
473 // FIXME: This should be an explicit check for Mesa.
474 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
475 MCSectionELF *ConfigSection =
476 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
477 OutStreamer->switchSection(ConfigSection);
478 }
479
480 if (MFI->isModuleEntryFunction()) {
481 getSIProgramInfo(CurrentProgramInfo, MF);
482 }
483
484 if (STM.isAmdPalOS()) {
485 if (MFI->isEntryFunction())
486 EmitPALMetadata(MF, CurrentProgramInfo);
487 else if (MFI->isModuleEntryFunction())
488 emitPALFunctionMetadata(MF);
489 } else if (!STM.isAmdHsaOS()) {
490 EmitProgramInfoSI(MF, CurrentProgramInfo);
491 }
492
493 DumpCodeInstEmitter = nullptr;
494 if (STM.dumpCode()) {
495 // For -dumpcode, get the assembler out of the streamer, even if it does
496 // not really want to let us have it. This only works with -filetype=obj.
497 bool SaveFlag = OutStreamer->getUseAssemblerInfoForParsing();
498 OutStreamer->setUseAssemblerInfoForParsing(true);
499 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
500 OutStreamer->setUseAssemblerInfoForParsing(SaveFlag);
501 if (Assembler)
502 DumpCodeInstEmitter = Assembler->getEmitterPtr();
503 }
504
505 DisasmLines.clear();
506 HexLines.clear();
508
510
511 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
512 STM.hasMAIInsts());
513
514 if (isVerbose()) {
515 MCSectionELF *CommentSection =
516 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
517 OutStreamer->switchSection(CommentSection);
518
519 if (!MFI->isEntryFunction()) {
520 OutStreamer->emitRawComment(" Function info:", false);
522 ResourceUsage->getResourceInfo(&MF.getFunction());
523 emitCommonFunctionComments(
524 Info.NumVGPR,
525 STM.hasMAIInsts() ? Info.NumAGPR : std::optional<uint32_t>(),
526 Info.getTotalNumVGPRs(STM),
527 Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
528 Info.PrivateSegmentSize, getFunctionCodeSize(MF), MFI);
529 return false;
530 }
531
532 OutStreamer->emitRawComment(" Kernel info:", false);
533 emitCommonFunctionComments(
534 CurrentProgramInfo.NumArchVGPR,
535 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR
536 : std::optional<uint32_t>(),
537 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
538 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
539
540 OutStreamer->emitRawComment(
541 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
542 OutStreamer->emitRawComment(
543 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
544 OutStreamer->emitRawComment(
545 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
546 " bytes/workgroup (compile time only)", false);
547
548 OutStreamer->emitRawComment(
549 " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
550 OutStreamer->emitRawComment(
551 " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
552
553 OutStreamer->emitRawComment(
554 " NumSGPRsForWavesPerEU: " +
555 Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
556 OutStreamer->emitRawComment(
557 " NumVGPRsForWavesPerEU: " +
558 Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
559
560 if (STM.hasGFX90AInsts())
561 OutStreamer->emitRawComment(
562 " AccumOffset: " +
563 Twine((CurrentProgramInfo.AccumOffset + 1) * 4), false);
564
565 OutStreamer->emitRawComment(
566 " Occupancy: " +
567 Twine(CurrentProgramInfo.Occupancy), false);
568
569 OutStreamer->emitRawComment(
570 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
571
572 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
573 Twine(CurrentProgramInfo.ScratchEnable),
574 false);
575 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
576 Twine(CurrentProgramInfo.UserSGPR),
577 false);
578 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
579 Twine(CurrentProgramInfo.TrapHandlerEnable),
580 false);
581 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
582 Twine(CurrentProgramInfo.TGIdXEnable),
583 false);
584 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
585 Twine(CurrentProgramInfo.TGIdYEnable),
586 false);
587 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
588 Twine(CurrentProgramInfo.TGIdZEnable),
589 false);
590 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
591 Twine(CurrentProgramInfo.TIdIGCompCount),
592 false);
593
594 assert(STM.hasGFX90AInsts() ||
595 CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
596 if (STM.hasGFX90AInsts()) {
597 OutStreamer->emitRawComment(
598 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
599 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
600 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET))),
601 false);
602 OutStreamer->emitRawComment(
603 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
604 Twine((AMDHSA_BITS_GET(CurrentProgramInfo.ComputePGMRSrc3GFX90A,
605 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT))),
606 false);
607 }
608 }
609
610 if (DumpCodeInstEmitter) {
611
612 OutStreamer->switchSection(
613 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
614
615 for (size_t i = 0; i < DisasmLines.size(); ++i) {
616 std::string Comment = "\n";
617 if (!HexLines[i].empty()) {
618 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
619 Comment += " ; " + HexLines[i] + "\n";
620 }
621
622 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
623 OutStreamer->emitBytes(StringRef(Comment));
624 }
625 }
626
627 return false;
628}
629
630// TODO: Fold this into emitFunctionBodyStart.
/// Sets up the target streamer's target ID for module \p M.
///
/// After the initial setup from the global subtarget info, the functions of
/// \p M are walked in order and any xnack / sramecc setting that is supported
/// but still 'Any' is replaced by the setting from that function's subtarget
/// target ID. The walk stops once neither feature is left undecided.
///
/// \param M the module whose functions are inspected.
///
/// NOTE(review): this listing is missing source lines 634 and 649; the call
/// that consumes the arguments on line 635 and the declaration of `STM`
/// inside the loop are therefore not visible here.
631void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
632 // In the beginning all features are either 'Any' or 'NotSupported',
633 // depending on global target features. This will cover empty modules.
635 *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
636
637 // If module is empty, we are done.
638 if (M.empty())
639 return;
640
641 // If module is not empty, need to find first 'Off' or 'On' feature
642 // setting per feature from functions in module.
643 for (auto &F : M) {
644 auto &TSTargetID = getTargetStreamer()->getTargetID();
    // Stop early when each feature is either unsupported or already On/Off,
    // i.e. nothing is left for later functions to decide.
645 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
646 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
647 break;
648
650 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
    // Only a setting that is still 'Any' is overwritten; a setting that has
    // already been decided is left untouched.
651 if (TSTargetID->isXnackSupported())
652 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
653 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
654 if (TSTargetID->isSramEccSupported())
655 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
656 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
657 }
658}
659
/// Sums the encoded size of the instructions in \p MF.
///
/// Visits every instruction of every basic block, skips debug instructions,
/// and adds TII->getInstSizeInBytes(MI) for the rest.
///
/// \param MF the machine function to measure.
/// \returns the accumulated size in bytes.
///
/// NOTE(review): this listing is missing source line 661; the declaration of
/// `STM` is therefore not visible here.
660uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
662 const SIInstrInfo *TII = STM.getInstrInfo();
663
664 uint64_t CodeSize = 0;
665
666 for (const MachineBasicBlock &MBB : MF) {
667 for (const MachineInstr &MI : MBB) {
668 // TODO: CodeSize should account for multiple functions.
669
670 // TODO: Should we count size of debug info?
    // Debug instructions currently contribute nothing to the total.
671 if (MI.isDebugInstr())
672 continue;
673
674 CodeSize += TII->getInstSizeInBytes(MI);
675 }
676 }
677
678 return CodeSize;
679}
680
681void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
682 const MachineFunction &MF) {
684 ResourceUsage->getResourceInfo(&MF.getFunction());
686
687 ProgInfo.NumArchVGPR = Info.NumVGPR;
688 ProgInfo.NumAccVGPR = Info.NumAGPR;
689 ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
690 ProgInfo.AccumOffset = alignTo(std::max(1, Info.NumVGPR), 4) / 4 - 1;
691 ProgInfo.TgSplit = STM.isTgSplitEnabled();
692 ProgInfo.NumSGPR = Info.NumExplicitSGPR;
693 ProgInfo.ScratchSize = Info.PrivateSegmentSize;
694 ProgInfo.VCCUsed = Info.UsesVCC;
695 ProgInfo.FlatUsed = Info.UsesFlatScratch;
696 ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
697
698 const uint64_t MaxScratchPerWorkitem =
700 if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
701 DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
702 ProgInfo.ScratchSize,
703 MaxScratchPerWorkitem, DS_Error);
704 MF.getFunction().getContext().diagnose(DiagStackSize);
705 }
706
708
709 // The calculations related to SGPR/VGPR blocks are
710 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
711 // unified.
712 unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
713 &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
714 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
715
716 // Check the addressable register limit before we add ExtraSGPRs.
718 !STM.hasSGPRInitBug()) {
719 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
720 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
721 // This can happen due to a compiler bug or when using inline asm.
724 MF.getFunction(), "addressable scalar registers", ProgInfo.NumSGPR,
725 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
726 Ctx.diagnose(Diag);
727 ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
728 }
729 }
730
731 // Account for extra SGPRs and VGPRs reserved for debugger use.
732 ProgInfo.NumSGPR += ExtraSGPRs;
733
734 const Function &F = MF.getFunction();
735
736 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
737 // dispatch registers are function args.
738 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
739
740 if (isShader(F.getCallingConv())) {
741 bool IsPixelShader =
742 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
743
744 // Calculate the number of VGPR registers based on the SPI input registers
745 uint32_t InputEna = 0;
746 uint32_t InputAddr = 0;
747 unsigned LastEna = 0;
748
749 if (IsPixelShader) {
750 // Note for IsPixelShader:
751 // By this stage, all enabled inputs are tagged in InputAddr as well.
752 // We will use InputAddr to determine whether the input counts against the
753 // vgpr total and only use the InputEnable to determine the last input
754 // that is relevant - if extra arguments are used, then we have to honour
755 // the InputAddr for any intermediate non-enabled inputs.
756 InputEna = MFI->getPSInputEnable();
757 InputAddr = MFI->getPSInputAddr();
758
759 // We only need to consider input args up to the last used arg.
760 assert((InputEna || InputAddr) &&
761 "PSInputAddr and PSInputEnable should "
762 "never both be 0 for AMDGPU_PS shaders");
763 // There are some rare circumstances where InputAddr is non-zero and
764 // InputEna can be set to 0. In this case we default to setting LastEna
765 // to 1.
766 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
767 }
768
769 // FIXME: We should be using the number of registers determined during
770 // calling convention lowering to legalize the types.
771 const DataLayout &DL = F.getParent()->getDataLayout();
772 unsigned PSArgCount = 0;
773 unsigned IntermediateVGPR = 0;
774 for (auto &Arg : F.args()) {
775 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
776 if (Arg.hasAttribute(Attribute::InReg)) {
777 WaveDispatchNumSGPR += NumRegs;
778 } else {
779 // If this is a PS shader and we're processing the PS Input args (first
780 // 16 VGPR), use the InputEna and InputAddr bits to define how many
781 // VGPRs are actually used.
782 // Any extra VGPR arguments are handled as normal arguments (and
783 // contribute to the VGPR count whether they're used or not).
784 if (IsPixelShader && PSArgCount < 16) {
785 if ((1 << PSArgCount) & InputAddr) {
786 if (PSArgCount < LastEna)
787 WaveDispatchNumVGPR += NumRegs;
788 else
789 IntermediateVGPR += NumRegs;
790 }
791 PSArgCount++;
792 } else {
793 // If there are extra arguments we have to include the allocation for
794 // the non-used (but enabled with InputAddr) input arguments
795 if (IntermediateVGPR) {
796 WaveDispatchNumVGPR += IntermediateVGPR;
797 IntermediateVGPR = 0;
798 }
799 WaveDispatchNumVGPR += NumRegs;
800 }
801 }
802 }
803 ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
804 ProgInfo.NumArchVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
805 ProgInfo.NumVGPR =
806 Info.getTotalNumVGPRs(STM, Info.NumAGPR, ProgInfo.NumArchVGPR);
807 }
808
809 // Adjust number of registers used to meet default/requested minimum/maximum
810 // number of waves per execution unit request.
811 ProgInfo.NumSGPRsForWavesPerEU = std::max(
812 std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
813 ProgInfo.NumVGPRsForWavesPerEU = std::max(
814 std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
815
817 STM.hasSGPRInitBug()) {
818 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
819 if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
820 // This can happen due to a compiler bug or when using inline asm to use
821 // the registers which are usually reserved for vcc etc.
823 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
824 ProgInfo.NumSGPR, MaxAddressableNumSGPRs,
826 Ctx.diagnose(Diag);
827 ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
828 ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
829 }
830 }
831
832 if (STM.hasSGPRInitBug()) {
833 ProgInfo.NumSGPR =
835 ProgInfo.NumSGPRsForWavesPerEU =
837 }
838
839 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
841 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
842 MFI->getNumUserSGPRs(),
844 Ctx.diagnose(Diag);
845 }
846
847 if (MFI->getLDSSize() >
848 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
851 MF.getFunction(), "local memory", MFI->getLDSSize(),
853 Ctx.diagnose(Diag);
854 }
855
857 &STM, ProgInfo.NumSGPRsForWavesPerEU);
859 &STM, ProgInfo.NumVGPRsForWavesPerEU);
860
861 const SIModeRegisterDefaults Mode = MFI->getMode();
862
863 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
864 // register.
865 ProgInfo.FloatMode = getFPMode(Mode);
866
867 ProgInfo.IEEEMode = Mode.IEEE;
868
869 // Make the clamp modifier return 0 on NaN input.
870 ProgInfo.DX10Clamp = Mode.DX10Clamp;
871
872 unsigned LDSAlignShift;
874 // LDS is allocated in 64 dword blocks.
875 LDSAlignShift = 8;
876 } else {
877 // LDS is allocated in 128 dword blocks.
878 LDSAlignShift = 9;
879 }
880
881 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
882 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
883
884 ProgInfo.LDSSize = MFI->getLDSSize();
885 ProgInfo.LDSBlocks =
886 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
887
888 // Scratch is allocated in 64-dword or 256-dword blocks.
889 unsigned ScratchAlignShift =
890 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
891 // We need to program the hardware with the amount of scratch memory that
892 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
893 // scratch memory used per thread.
894 ProgInfo.ScratchBlocks = divideCeil(
895 ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
896
897 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
898 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
899 ProgInfo.MemOrdered = 1;
900 }
901
902 // 0 = X, 1 = XY, 2 = XYZ
903 unsigned TIDIGCompCnt = 0;
904 if (MFI->hasWorkItemIDZ())
905 TIDIGCompCnt = 2;
906 else if (MFI->hasWorkItemIDY())
907 TIDIGCompCnt = 1;
908
909 // The private segment wave byte offset is the last of the system SGPRs. We
910 // initially assumed it was allocated, and may have used it. It shouldn't harm
911 // anything to disable it if we know the stack isn't used here. We may still
912 // have emitted code reading it to initialize scratch, but if that's unused
913 // reading garbage should be OK.
914 ProgInfo.ScratchEnable =
915 ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
916 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
917 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
918 ProgInfo.TrapHandlerEnable =
919 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
920 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
921 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
922 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
923 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
924 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
925 ProgInfo.EXCPEnMSB = 0;
926 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
927 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
928 ProgInfo.EXCPEnable = 0;
929
930 if (STM.hasGFX90AInsts()) {
932 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
933 ProgInfo.AccumOffset);
935 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
936 ProgInfo.TgSplit);
937 }
938
939 ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
940 ProgInfo.NumSGPRsForWavesPerEU,
941 ProgInfo.NumVGPRsForWavesPerEU);
942}
943
944static unsigned getRsrcReg(CallingConv::ID CallConv) {
945 switch (CallConv) {
946 default: [[fallthrough]];
954 }
955}
956
957void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
958 const SIProgramInfo &CurrentProgramInfo) {
961 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
962
965
966 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
967
969 OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
970
972 OutStreamer->emitInt32(
973 STM.getGeneration() >= AMDGPUSubtarget::GFX11
974 ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
975 : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
976
977 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
978 // 0" comment but I don't see a corresponding field in the register spec.
979 } else {
980 OutStreamer->emitInt32(RsrcReg);
981 OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
982 S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
984 OutStreamer->emitInt32(
985 STM.getGeneration() >= AMDGPUSubtarget::GFX11
986 ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
987 : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
988 }
989
992 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
993 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
994 : CurrentProgramInfo.LDSBlocks;
995 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
997 OutStreamer->emitInt32(MFI->getPSInputEnable());
999 OutStreamer->emitInt32(MFI->getPSInputAddr());
1000 }
1001
1002 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1003 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1004 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1005 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1006}
1007
1008 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1009 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1010 // metadata items into the PALMD::Metadata, combining with any provided by the
1011 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1012 // is then written as a single block in the .note section.
//
// The function branches twice on MD->getPALMajorVersion(): below version 3 it
// stores packed register values (setRsrc1 / setRsrc2 / setSpiPsInput*), while
// for version 3 and above it stores individually named fields (setHwStage /
// setGraphicsRegisters).
//
// \param MF                 Function whose calling convention keys most of the
//                           metadata entries.
// \param CurrentProgramInfo Source of the register counts, mode bits, scratch
//                           and LDS sizes.
//
// NOTE(review): the declaration of MFI and the head of the block that is
// closed at listing line 1096 are absent from this extracted listing.
1013 void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1014 const SIProgramInfo &CurrentProgramInfo) {
1016 auto CC = MF.getFunction().getCallingConv();
1017 auto MD = getTargetStreamer()->getPALMetadata();
1018
1020 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1021
1022 // Only set AGPRs for supported devices
1023 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1024 if (STM.hasMAIInsts()) {
1025 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1026 }
1027
1028 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1029 if (MD->getPALMajorVersion() < 3) {
// Packed register form. For non-compute calling conventions Rsrc2 is only
// set (to SCRATCH_EN) when scratch blocks are in use.
1030 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
1031 if (AMDGPU::isCompute(CC)) {
1032 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
1033 } else {
1034 if (CurrentProgramInfo.ScratchBlocks > 0)
1035 MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
1036 }
1037 } else {
// Named-field form: each mode value is converted to bool before being stored.
1038 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1039 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1040 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1041 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1042
1043 if (AMDGPU::isCompute(CC)) {
1044 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1045 MD->setHwStage(CC, ".trap_present",
1046 (bool)CurrentProgramInfo.TrapHandlerEnable);
1047
1048 // EXCPEnMSB?
// .lds_size is LdsSize scaled by the 128-dword granularity and by the size of
// one dword (sizeof(uint32_t)).
1049 const unsigned LdsDwGranularity = 128;
1050 MD->setHwStage(CC, ".lds_size",
1051 (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
1052 sizeof(uint32_t)));
1053 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1054 } else {
1055 MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
1056 }
1057 }
1058
1059 // ScratchSize is in bytes, 16 aligned.
1060 MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
// On GFX11 and later the LDS block count is halved (rounded up); earlier
// generations use the count unchanged.
1062 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1063 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1064 : CurrentProgramInfo.LDSBlocks;
1065 if (MD->getPALMajorVersion() < 3) {
1066 MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1067 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1068 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1069 } else {
1070 // Graphics registers
// The granularity is 256 dwords on GFX11 and later and 128 dwords before, so
// the stored value is ExtraLDSSize * granularity * sizeof(uint32_t).
1071 const unsigned ExtraLdsDwGranularity =
1072 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1073 MD->setGraphicsRegisters(
1074 ".ps_extra_lds_size",
1075 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1076
1077 // Set the .spi_ps_input_ena and .spi_ps_input_addr fields from PsInputEna and PsInputAddr.
// The position of a name in PsInputFields is the bit index it is read from:
// bit Idx of each mask becomes the boolean field PsInputFields[Idx].
1078 static StringLiteral const PsInputFields[] = {
1079 ".persp_sample_ena", ".persp_center_ena",
1080 ".persp_centroid_ena", ".persp_pull_model_ena",
1081 ".linear_sample_ena", ".linear_center_ena",
1082 ".linear_centroid_ena", ".line_stipple_tex_ena",
1083 ".pos_x_float_ena", ".pos_y_float_ena",
1084 ".pos_z_float_ena", ".pos_w_float_ena",
1085 ".front_face_ena", ".ancillary_ena",
1086 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1087 unsigned PSInputEna = MFI->getPSInputEnable();
1088 unsigned PSInputAddr = MFI->getPSInputAddr();
1089 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1090 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1091 (bool)((PSInputEna >> Idx) & 1));
1092 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1093 (bool)((PSInputAddr >> Idx) & 1));
1094 }
1095 }
1096 }
1097
1098 // For version 3 and above the wave front size is already set in the metadata
1099 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1100 MD->setWave32(MF.getFunction().getCallingConv());
1101 }
1102
// Records PAL metadata for the function MF, keyed by the function's name:
// its scratch size (taken from the frame's stack size), its LDS size and its
// VGPR / SGPR counts. It also sets the Rsrc1 and Rsrc2 values for the
// CallingConv::AMDGPU_CS calling convention.
//
// \param MF Function whose name and frame information are recorded.
//
// NOTE(review): CurrentProgramInfo is neither a parameter nor a local here,
// unlike in the neighbouring functions that receive it as an argument;
// presumably it is a class member, confirm against the header.
1103 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1104 auto *MD = getTargetStreamer()->getPALMetadata();
1105 const MachineFrameInfo &MFI = MF.getFrameInfo();
1106 StringRef FnName = MF.getFunction().getName();
1107 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1108
1109 // Set compute registers
1110 MD->setRsrc1(CallingConv::AMDGPU_CS,
1111 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
1112 MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
1113
1114 // Set optional info
1115 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1116 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1117 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1118 }
1119
1120 // This is supposed to be log2(Size)
// Maps an element size in bytes to the matching AMD_ELEMENT_*_BYTES
// enumerator. Only 4, 8 and 16 are accepted; any other value reaches
// llvm_unreachable("invalid private_element_size").
// NOTE(review): the signature line (listing line 1121) is absent from this
// extracted listing; the page's cross-reference index gives it as
// `static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)`.
1122 switch (Size) {
1123 case 4:
1124 return AMD_ELEMENT_4_BYTES;
1125 case 8:
1126 return AMD_ELEMENT_8_BYTES;
1127 case 16:
1128 return AMD_ELEMENT_16_BYTES;
1129 default:
1130 llvm_unreachable("invalid private_element_size");
1131 }
1132 }
1133
// Fills the amd_kernel_code_t header Out for the kernel MF from
// CurrentProgramInfo, the subtarget and the function's user-SGPR information.
// The function asserts that MF's calling convention is AMDGPU_KERNEL or
// SPIR_KERNEL.
//
// \param Out                Header to populate.
// \param CurrentProgramInfo Source of the PGM_RSRC values, register counts,
//                           scratch size and LDS size.
// \param MF                 Kernel being described.
//
// NOTE(review): many statements are absent from this extracted listing: the
// declaration of MFI, the left-hand sides of several assignments, and the
// bodies of most of the `if` statements below. Only their conditions remain.
1134 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1135 const SIProgramInfo &CurrentProgramInfo,
1136 const MachineFunction &MF) const {
1137 const Function &F = MF.getFunction();
1138 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1139 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1140
1142 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1143
1145
// Combined 64-bit value: PGM_RSRC1 occupies the low bits and PGM_RSRC2 is
// shifted into the upper 32 bits.
1147 CurrentProgramInfo.getComputePGMRSrc1() |
1148 (CurrentProgramInfo.getComputePGMRSrc2() << 32);
1150
1151 if (CurrentProgramInfo.DynamicCallStack)
1153
1156 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1157
1158 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1159 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1160 Out.code_properties |=
1162 }
1163
1164 if (UserSGPRInfo.hasDispatchPtr())
1166
// The queue pointer is only considered for code object versions before V5.
1167 if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
1169
1170 if (UserSGPRInfo.hasKernargSegmentPtr())
1172
1173 if (UserSGPRInfo.hasDispatchID())
1175
1176 if (UserSGPRInfo.hasFlatScratchInit())
1178
// NOTE(review): this repeats the hasDispatchPtr() condition already tested at
// listing line 1164. The guarded statement is not visible here, so confirm
// whether the second check is redundant or was meant to test something else.
1179 if (UserSGPRInfo.hasDispatchPtr())
1181
1182 if (STM.isXNACKEnabled())
1184
// getKernArgSegmentSize receives MaxKernArgAlign by name and the value is
// used below for the alignment field.
1185 Align MaxKernArgAlign;
1186 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1187 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1188 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1189 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1190 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1191
1192 // kernarg_segment_alignment is specified as log of the alignment.
1193 // The minimum alignment is 16.
1194 // FIXME: The metadata treats the minimum as 4?
1195 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1196 }
1197
// Prints operand OpNo of the inline-asm instruction MI to O.
//
// Returns false when the operand was printed and true when it could not be
// handled (unknown modifier or unsupported operand kind). The generic
// AsmPrinter::PrintAsmOperand is tried first; if it succeeds nothing further
// is done. Otherwise the only modifier accepted here is the single character
// 'r', and only register and immediate operands are supported.
//
// NOTE(review): the first line of the signature (listing line 1198), the
// statement that prints a register operand (1219-1220) and the condition that
// guards the plain decimal `O << Val` (1224) are absent from this extracted
// listing.
1199 const char *ExtraCode, raw_ostream &O) {
1200 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1201 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1202 return false;
1203
1204 if (ExtraCode && ExtraCode[0]) {
// Modifiers longer than one character are rejected.
1205 if (ExtraCode[1] != 0)
1206 return true; // Unknown modifier.
1207
1208 switch (ExtraCode[0]) {
1209 case 'r':
1210 break;
1211 default:
1212 return true;
1213 }
1214 }
1215
1216 // TODO: Should be able to support other operand types like globals.
1217 const MachineOperand &MO = MI->getOperand(OpNo);
1218 if (MO.isReg()) {
1221 return false;
1222 } else if (MO.isImm()) {
1223 int64_t Val = MO.getImm();
// After the decimal case, the value is printed in hexadecimal using the
// narrowest of 16, 32 or 64 bits that holds it as an unsigned number.
1225 O << Val;
1226 } else if (isUInt<16>(Val)) {
1227 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1228 } else if (isUInt<32>(Val)) {
1229 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1230 } else {
1231 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1232 }
1233 return false;
1234 }
1235 return true;
1236 }
1237
1242}
1243
// Emits the resource usage of MF as a series of optimization analysis
// remarks under the name "kernel-resource-usage", one remark per statistic.
// Does nothing when ORE is null.
//
// \param MF                    Function being reported; its first basic block
//                              is passed to each remark.
// \param CurrentProgramInfo    Source of the reported statistics.
// \param isModuleEntryFunction When true, the LDS size remark is also emitted.
// \param hasMAIInsts           When true, the AGPR count remark is also
//                              emitted.
//
// NOTE(review): the condition guarding the second early return (listing lines
// 1254-1255) and one argument line of the remark constructor (1269) are
// absent from this extracted listing.
1244 void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1245 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1246 bool isModuleEntryFunction, bool hasMAIInsts) {
1247 if (!ORE)
1248 return;
1249
1250 const char *Name = "kernel-resource-usage";
1251 const char *Indent = " ";
1252
1253 // If the remark is not specifically enabled, do not output to yaml
1256 return;
1257
// Helper that emits a single remark of the form "<label>: <value>".
1258 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1259 StringRef RemarkLabel, auto Argument) {
1260 // Add an indent for every line besides the line with the kernel name. This
1261 // makes it easier to tell which resource usages go with which kernel since
1262 // the kernel name will always be displayed first.
1263 std::string LabelStr = RemarkLabel.str() + ": ";
1264 if (!RemarkName.equals("FunctionName"))
1265 LabelStr = Indent + LabelStr;
1266
1267 ORE->emit([&]() {
1268 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1270 &MF.front())
1271 << LabelStr << ore::NV(RemarkName, Argument);
1272 });
1273 };
1274
1275 // FIXME: Formatting here is pretty nasty because clang does not accept
1276 // newlines from diagnostics. This forces us to emit multiple diagnostic
1277 // remarks to simulate newlines. If and when clang does accept newlines, this
1278 // formatting should be aggregated into one remark with newlines to avoid
1279 // printing multiple diagnostic locations and diag opts.
1280 EmitResourceUsageRemark("FunctionName", "Function Name",
1281 MF.getFunction().getName());
1282 EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
// The "VGPRs" remark reports NumArchVGPR; AGPRs are reported separately below.
1283 EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
1284 if (hasMAIInsts)
1285 EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
1286 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1287 CurrentProgramInfo.ScratchSize);
1288 StringRef DynamicStackStr =
1289 CurrentProgramInfo.DynamicCallStack ? "True" : "False";
1290 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1291 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1292 CurrentProgramInfo.Occupancy);
1293 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1294 CurrentProgramInfo.SGPRSpill);
1295 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1296 CurrentProgramInfo.VGPRSpill);
1297 if (isModuleEntryFunction)
1298 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1299 CurrentProgramInfo.LDSSize);
1300 }
#define Success
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
#define AMDHSA_BITS_GET(SRC, MSK)
#define AMDHSA_BITS_SET(DST, MSK, VAL)
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
@ AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:135
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
LLVMContext & Context
const char LLVMTargetMachineRef TM
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:972
#define S_0286E8_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1108
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1107
#define S_00B84C_SCRATCH_EN(x)
Definition: SIDefines.h:1006
#define S_0286E8_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1109
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1090
#define S_00B860_WAVESIZE_GFX11Plus(x)
Definition: SIDefines.h:1105
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1082
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1043
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1103
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:995
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:994
#define S_00B028_SGPRS(x)
Definition: SIDefines.h:974
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1003
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1042
#define S_00B860_WAVESIZE_PreGFX11(x)
Definition: SIDefines.h:1104
#define S_00B028_VGPRS(x)
Definition: SIDefines.h:973
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:981
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1101
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1045
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1120
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1089
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1100
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:986
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1121
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:980
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1005
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:979
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI)
void setEntryPoint(unsigned CC, StringRef Name)
unsigned getAddressableLocalMemorySize() const
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion)
virtual void EmitDirectiveAMDGCNTarget()
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI)
void initializeTargetID(const MCSubtargetInfo &STI, unsigned CodeObjectVersion)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:84
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:381
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:679
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:701
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:87
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:102
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:431
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:634
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:423
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:377
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:114
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:94
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:99
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:269
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:674
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1776
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:262
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:341
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:759
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:237
bool hasSGPRInitBug() const
Definition: GCNSubtarget.h:998
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:574
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:578
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:273
bool dumpCode() const
Definition: GCNSubtarget.h:478
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:566
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Definition: GCNSubtarget.h:865
Generation getGeneration() const
Definition: GCNSubtarget.h:288
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:292
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:244
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:273
unsigned getAddressSpace() const
Definition: GlobalValue.h:201
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
Type * getValueType() const
Definition: GlobalValue.h:292
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:325
Context object for machine code objects.
Definition: MCContext.h:76
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1058
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Definition: MCSectionELF.h:26
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:101
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:68
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:275
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:857
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:222
bool equals(StringRef RHS) const
equals - Check for string equality, this is more efficient than compare() when the relative ordering ...
Definition: StringRef.h:164
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:366
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:672
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:411
unsigned getNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getNumSGPRBlocks(const MCSubtargetInfo *STI, unsigned NumSGPRs)
void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header, const MCSubtargetInfo *STI)
unsigned getCodeObjectVersion(const Module &M)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:194
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:185
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:197
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:203
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:188
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:191
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:141
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:215
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:210
@ SHT_PROGBITS
Definition: ELF.h:1004
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1272
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1684
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:417
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2375
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:313
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1853
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
AMD Kernel Code Object (amd_kernel_code_t).
uint16_t workitem_vgpr_count
Number of vector registers used by each work-item.
uint32_t code_properties
Code properties.
uint8_t kernarg_segment_alignment
The maximum byte alignment of variables used by the kernel in the specified memory segment.
uint32_t workgroup_group_segment_byte_size
The amount of group segment memory required by a work-group in bytes.
uint16_t wavefront_sgpr_count
Number of scalar registers used by a wavefront.
uint32_t workitem_private_segment_byte_size
The amount of memory required for the combined private, spill and arg segments for a work-item in bytes.
uint64_t kernarg_segment_byte_size
The size in bytes of the kernarg segment that holds the values of the arguments to the kernel.
uint64_t compute_pgm_resource_registers
Shader program settings for CS.
const SIFunctionResourceInfo & getResourceInfo(const Function *F) const
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:25
uint64_t getPGMRSrc1(CallingConv::ID CC) const
uint32_t NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:70
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:46
uint64_t getComputePGMRSrc2() const
Compute the value of the ComputePGMRsrc2 register.
uint32_t NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:73
uint64_t ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:56
uint64_t getComputePGMRSrc1() const
Compute the value of the ComputePGMRsrc1 register.
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.