LLVM 20.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
49
50using namespace llvm;
51using namespace llvm::AMDGPU;
52
53// This should get the default rounding mode from the kernel. We just set the
54// default here, but this could change if the OpenCL rounding mode pragmas are
55// used.
56//
57// The denormal mode here should match what is reported by the OpenCL runtime
58// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
59// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
60//
61// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
62// precision, and leaves single precision to flush all and does not report
63// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
64// CL_FP_DENORM for both.
65//
66// FIXME: It seems some instructions do not support single precision denormals
67// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
68// and sin_f32, cos_f32 on most parts).
69
70// We want to use these instructions, and using fp32 denormals also causes
71// instructions to run at the double precision rate for the device so it's
72// probably best to just report no single precision denormals.
76 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
77 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
78}
79
80static AsmPrinter *
82 std::unique_ptr<MCStreamer> &&Streamer) {
83 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
84}
85
91}
92
94 std::unique_ptr<MCStreamer> Streamer)
95 : AsmPrinter(TM, std::move(Streamer)) {
96 assert(OutStreamer && "AsmPrinter constructed without streamer");
97}
98
100 return "AMDGPU Assembly Printer";
101}
102
104 return TM.getMCSubtargetInfo();
105}
106
108 if (!OutStreamer)
109 return nullptr;
110 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
111}
112
115}
116
117void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
119
120 // TODO: Which one is called first, emitStartOfAsmFile or
121 // emitFunctionBodyStart?
122 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
123 initializeTargetID(M);
124
127 return;
128
130
133 CodeObjectVersion);
134 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
135 }
136
139}
140
142 // Init target streamer if it has not yet happened
144 initTargetStreamer(M);
145
148
149 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
150 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
152 HSAMetadataStream->end();
153 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
154 (void)Success;
155 assert(Success && "Malformed HSA Metadata");
156 }
157}
158
161 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
162 const Function &F = MF->getFunction();
163
164 // TODO: We're checking this late, would be nice to check it earlier.
165 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
167 STM.getCPU() + " is only available on code object version 6 or better",
168 /*gen_crash_diag*/ false);
169 }
170
171 // TODO: Which one is called first, emitStartOfAsmFile or
172 // emitFunctionBodyStart?
173 if (!getTargetStreamer()->getTargetID())
174 initializeTargetID(*F.getParent());
175
176 const auto &FunctionTargetID = STM.getTargetID();
177 // Make sure function's xnack settings are compatible with module's
178 // xnack settings.
179 if (FunctionTargetID.isXnackSupported() &&
180 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
181 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
182 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
183 "' function does not match module xnack setting");
184 return;
185 }
186 // Make sure function's sramecc settings are compatible with module's
187 // sramecc settings.
188 if (FunctionTargetID.isSramEccSupported() &&
189 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
190 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
191 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
192 "' function does not match module sramecc setting");
193 return;
194 }
195
196 if (!MFI.isEntryFunction())
197 return;
198
199 if (STM.isMesaKernel(F) &&
200 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
201 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
202 AMDGPUMCKernelCodeT KernelCode;
203 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204 KernelCode.validate(&STM, MF->getContext());
206 }
207
208 if (STM.isAmdHsaOS())
209 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
210}
211
214 if (!MFI.isEntryFunction())
215 return;
216
218 return;
219
220 auto &Streamer = getTargetStreamer()->getStreamer();
221 auto &Context = Streamer.getContext();
222 auto &ObjectFileInfo = *Context.getObjectFileInfo();
223 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
224
225 Streamer.pushSection();
226 Streamer.switchSection(&ReadOnlySection);
227
228 // CP microcode requires the kernel descriptor to be allocated on 64 byte
229 // alignment.
230 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
231 ReadOnlySection.ensureMinAlignment(Align(64));
232
233 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
234
235 SmallString<128> KernelName;
236 getNameWithPrefix(KernelName, &MF->getFunction());
238 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
239 CurrentProgramInfo.NumVGPRsForWavesPerEU,
241 CurrentProgramInfo.NumSGPRsForWavesPerEU,
243 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
244 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
245 Context),
246 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
247
248 Streamer.popSection();
249}
250
252 Register RegNo = MI->getOperand(0).getReg();
253
256 OS << "implicit-def: "
257 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
258
259 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
260 OS << " : SGPR spill to VGPR lane";
261
262 OutStreamer->AddComment(OS.str());
263 OutStreamer->addBlankLine();
264}
265
269 return;
270 }
271
273 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
274 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
275 SmallString<128> SymbolName;
276 getNameWithPrefix(SymbolName, &MF->getFunction()),
278 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
279 }
280 if (DumpCodeInstEmitter) {
281 // Disassemble function name label to text.
282 DisasmLines.push_back(MF->getName().str() + ":");
283 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
284 HexLines.emplace_back("");
285 }
286
288}
289
291 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
292 // Write a line for the basic block label if it is not only fallthrough.
293 DisasmLines.push_back(
294 (Twine("BB") + Twine(getFunctionNumber())
295 + "_" + Twine(MBB.getNumber()) + ":").str());
296 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
297 HexLines.emplace_back("");
298 }
300}
301
304 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
306 Twine(GV->getName()) +
307 ": unsupported initializer for address space");
308 return;
309 }
310
311 // LDS variables aren't emitted in HSA or PAL yet.
313 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
314 return;
315
316 MCSymbol *GVSym = getSymbol(GV);
317
318 GVSym->redefineIfPossible();
319 if (GVSym->isDefined() || GVSym->isVariable())
320 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
321 "' is already defined");
322
323 const DataLayout &DL = GV->getDataLayout();
324 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
325 Align Alignment = GV->getAlign().value_or(Align(4));
326
327 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
328 emitLinkage(GV, GVSym);
329 auto *TS = getTargetStreamer();
330 TS->emitAMDGPULDS(GVSym, Size, Alignment);
331 return;
332 }
333
335}
336
338 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
339
341 switch (CodeObjectVersion) {
343 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
344 break;
346 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
347 break;
349 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
350 break;
351 default:
352 report_fatal_error("Unexpected code object version");
353 }
354 }
355
357}
358
359void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
360 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
361 return;
362
365 MCSymbol *FnSym = TM.getSymbol(&F);
366
367 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
368 int64_t Val;
369 if (Value->evaluateAsAbsolute(Val)) {
370 Res = Val;
371 return true;
372 }
373 return false;
374 };
375
376 const uint64_t MaxScratchPerWorkitem =
378 MCSymbol *ScratchSizeSymbol =
379 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
380 uint64_t ScratchSize;
381 if (ScratchSizeSymbol->isVariable() &&
382 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
383 ScratchSize > MaxScratchPerWorkitem) {
384 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
385 DS_Error);
386 F.getContext().diagnose(DiagStackSize);
387 }
388
389 // Validate addressable scalar registers (i.e., prior to added implicit
390 // SGPRs).
391 MCSymbol *NumSGPRSymbol =
392 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
394 !STM.hasSGPRInitBug()) {
395 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
396 uint64_t NumSgpr;
397 if (NumSGPRSymbol->isVariable() &&
398 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
399 NumSgpr > MaxAddressableNumSGPRs) {
400 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
401 NumSgpr, MaxAddressableNumSGPRs,
403 F.getContext().diagnose(Diag);
404 return;
405 }
406 }
407
408 MCSymbol *VCCUsedSymbol =
409 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
410 MCSymbol *FlatUsedSymbol =
411 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
412 uint64_t VCCUsed, FlatUsed, NumSgpr;
413
414 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
415 FlatUsedSymbol->isVariable() &&
416 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
417 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
418 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
419
420 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
421 // resolvable.
422 NumSgpr += IsaInfo::getNumExtraSGPRs(
423 &STM, VCCUsed, FlatUsed,
424 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
426 STM.hasSGPRInitBug()) {
427 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
428 if (NumSgpr > MaxAddressableNumSGPRs) {
429 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
430 MaxAddressableNumSGPRs, DS_Error,
432 F.getContext().diagnose(Diag);
433 return;
434 }
435 }
436
437 MCSymbol *NumVgprSymbol =
438 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
439 MCSymbol *NumAgprSymbol =
440 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
441 uint64_t NumVgpr, NumAgpr;
442
444 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
446 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
447 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
448 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
450 unsigned MaxWaves = MFI.getMaxWavesPerEU();
451 uint64_t TotalNumVgpr =
452 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
453 uint64_t NumVGPRsForWavesPerEU = std::max(
454 {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
455 uint64_t NumSGPRsForWavesPerEU = std::max(
456 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
457 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
458 STM.computeOccupancy(F, MFI.getLDSSize()),
459 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
460 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
461 OutContext);
462 uint64_t Occupancy;
463
464 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
465 F, "amdgpu-waves-per-eu", {0, 0}, true);
466
467 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
469 F, F.getSubprogram(),
470 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
471 "'" +
472 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
473 ", final occupancy is " + Twine(Occupancy));
474 F.getContext().diagnose(Diag);
475 return;
476 }
477 }
478 }
479}
480
482 // Pad with s_code_end to help tools and guard against instruction prefetch
483 // causing stale data in caches. Arguably this should be done by the linker,
484 // which is why this isn't done for Mesa.
485 const MCSubtargetInfo &STI = *getGlobalSTI();
486 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
489 OutStreamer->switchSection(getObjFileLowering().getTextSection());
491 }
492
493 // Assign expressions which can only be resolved when all other functions are
494 // known.
496
497 // Switch section and emit all GPR maximums within the processed module.
498 OutStreamer->pushSection();
499 MCSectionELF *MaxGPRSection =
500 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
501 OutStreamer->switchSection(MaxGPRSection);
505 OutStreamer->popSection();
506
507 for (Function &F : M.functions())
508 validateMCResourceInfo(F);
509
510 RI.reset();
511
513}
514
/// Render \p Value as text. The expression is first passed through
/// foldAMDGPUMCExpr using the MCContext of the target streamer's underlying
/// streamer; the folded expression is then printed with printAMDGPUMCExpr and
/// the resulting string is returned by value.
515SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
// NOTE(review): the declaration of Str is not visible in this listing;
// presumably a SmallString<128> matching the return type - confirm.
517 raw_svector_ostream OSS(Str);
// The context is taken from the target streamer's streamer, not passed in.
518 auto &Streamer = getTargetStreamer()->getStreamer();
519 auto &Context = Streamer.getContext();
// Print the folded form rather than the raw input expression.
520 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
521 printAMDGPUMCExpr(New, OSS, MAI);
522 return Str;
523}
524
525// Print comments that apply to both callable functions and entry points.
526void AMDGPUAsmPrinter::emitCommonFunctionComments(
527 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
528 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
529 const AMDGPUMachineFunction *MFI) {
530 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
531 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
532 false);
533 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
534 if (NumAGPR && TotalNumVGPR) {
535 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
536 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
537 false);
538 }
539 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
540 false);
541 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
542 false);
543}
544
/// Build the kernel_code_properties expression for \p MF.
///
/// A 16-bit mask is assembled from the user-SGPR usage info (one
/// KERNEL_CODE_PROPERTY_ENABLE_SGPR_* bit per feature that is reported as
/// present) and then OR-ed with CurrentProgramInfo.DynamicCallStack shifted
/// left by KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT.
///
/// \returns an MCExpr created in MF's MCContext.
545const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
546 const MachineFunction &MF) const {
// NOTE(review): the declaration of MFI is not visible in this listing;
// presumably the function's machine function info - confirm.
548 MCContext &Ctx = MF.getContext();
549 uint16_t KernelCodeProperties = 0;
550 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
551
// Each check below sets the matching enable bit when the corresponding user
// SGPR is reported as present.
552 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
553 KernelCodeProperties |=
554 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
555 }
556 if (UserSGPRInfo.hasDispatchPtr()) {
557 KernelCodeProperties |=
558 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
559 }
560 if (UserSGPRInfo.hasQueuePtr()) {
561 KernelCodeProperties |=
562 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
563 }
564 if (UserSGPRInfo.hasKernargSegmentPtr()) {
565 KernelCodeProperties |=
566 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
567 }
568 if (UserSGPRInfo.hasDispatchID()) {
569 KernelCodeProperties |=
570 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
571 }
572 if (UserSGPRInfo.hasFlatScratchInit()) {
573 KernelCodeProperties |=
574 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
575 }
576 if (UserSGPRInfo.hasPrivateSegmentSize()) {
577 KernelCodeProperties |=
578 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
579 }
// NOTE(review): the condition guarding the WAVEFRONT_SIZE32 bit is not
// visible in this listing; presumably a wave32 check - confirm.
581 KernelCodeProperties |=
582 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
583 }
584
585 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
586 // un-evaluatable at this point so it cannot be conditionally checked here.
587 // Instead, we'll directly shift the possibly unknown MCExpr into its place
588 // and bitwise-or it into KernelCodeProperties.
589 const MCExpr *KernelCodePropExpr =
590 MCConstantExpr::create(KernelCodeProperties, Ctx);
// OrValue starts out as the shift amount and is then reused to hold the
// shifted DynamicCallStack expression.
591 const MCExpr *OrValue = MCConstantExpr::create(
592 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
593 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
594 OrValue, Ctx);
595 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
596
597 return KernelCodePropExpr;
598}
599
601AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
602 const SIProgramInfo &PI) const {
604 const Function &F = MF.getFunction();
606 MCContext &Ctx = MF.getContext();
607
608 MCKernelDescriptor KernelDescriptor;
609
610 KernelDescriptor.group_segment_fixed_size =
612 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
613
614 Align MaxKernArgAlign;
615 KernelDescriptor.kernarg_size = MCConstantExpr::create(
616 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
617
618 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
619 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
620 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
621
622 int64_t PGRM_Rsrc3 = 1;
623 bool EvaluatableRsrc3 =
624 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
625 (void)PGRM_Rsrc3;
626 (void)EvaluatableRsrc3;
627 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
628 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
629 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
630
631 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
632 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
633 Ctx);
634
635 return KernelDescriptor;
636}
637
639 // Init target streamer lazily on the first function so that previous passes
640 // can set metadata.
642 initTargetStreamer(*MF.getFunction().getParent());
643
644 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
645 CurrentProgramInfo.reset(MF);
646
648 MCContext &Ctx = MF.getContext();
649
650 // The starting address of all shader programs must be 256 bytes aligned.
651 // Regular functions just need the basic required instruction alignment.
652 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
653
655
658 // FIXME: This should be an explicit check for Mesa.
659 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
660 MCSectionELF *ConfigSection =
661 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
662 OutStreamer->switchSection(ConfigSection);
663 }
664
666 ResourceUsage->getResourceInfo();
668
669 if (MFI->isModuleEntryFunction()) {
670 getSIProgramInfo(CurrentProgramInfo, MF);
671 }
672
673 if (STM.isAmdPalOS()) {
674 if (MFI->isEntryFunction())
675 EmitPALMetadata(MF, CurrentProgramInfo);
676 else if (MFI->isModuleEntryFunction())
677 emitPALFunctionMetadata(MF);
678 } else if (!STM.isAmdHsaOS()) {
679 EmitProgramInfoSI(MF, CurrentProgramInfo);
680 }
681
682 DumpCodeInstEmitter = nullptr;
683 if (STM.dumpCode()) {
684 // For -dumpcode, get the assembler out of the streamer. This only works
685 // with -filetype=obj.
686 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
687 if (Assembler)
688 DumpCodeInstEmitter = Assembler->getEmitterPtr();
689 }
690
691 DisasmLines.clear();
692 HexLines.clear();
694
696
697 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
698 STM.hasMAIInsts());
699
700 {
703 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
704 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
705 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
706 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
707 OutContext),
708 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
709 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
710 OutContext),
711 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
712 OutContext),
713 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
714 OutContext),
715 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
716 OutContext));
717 }
718
719 if (isVerbose()) {
720 MCSectionELF *CommentSection =
721 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
722 OutStreamer->switchSection(CommentSection);
723
724 if (!MFI->isEntryFunction()) {
726 OutStreamer->emitRawComment(" Function info:", false);
727
728 emitCommonFunctionComments(
729 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
732 RIK::RIK_NumAGPR, OutContext)
734 : nullptr,
735 RI.createTotalNumVGPRs(MF, Ctx),
737 MF,
739 Ctx),
740 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
743 getFunctionCodeSize(MF), MFI);
744 return false;
745 }
746
747 OutStreamer->emitRawComment(" Kernel info:", false);
748 emitCommonFunctionComments(
749 CurrentProgramInfo.NumArchVGPR,
750 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
751 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
752 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
753
754 OutStreamer->emitRawComment(
755 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
756 OutStreamer->emitRawComment(
757 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
758 OutStreamer->emitRawComment(
759 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
760 " bytes/workgroup (compile time only)", false);
761
762 OutStreamer->emitRawComment(
763 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
764
765 OutStreamer->emitRawComment(
766 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
767
768 OutStreamer->emitRawComment(
769 " NumSGPRsForWavesPerEU: " +
770 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
771 false);
772 OutStreamer->emitRawComment(
773 " NumVGPRsForWavesPerEU: " +
774 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
775 false);
776
777 if (STM.hasGFX90AInsts()) {
778 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
779 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
780 AdjustedAccum = MCBinaryExpr::createMul(
781 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
782 OutStreamer->emitRawComment(
783 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
784 }
785
786 OutStreamer->emitRawComment(
787 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
788
789 OutStreamer->emitRawComment(
790 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
791
792 OutStreamer->emitRawComment(
793 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
794 getMCExprStr(CurrentProgramInfo.ScratchEnable),
795 false);
796 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
797 Twine(CurrentProgramInfo.UserSGPR),
798 false);
799 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
800 Twine(CurrentProgramInfo.TrapHandlerEnable),
801 false);
802 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
803 Twine(CurrentProgramInfo.TGIdXEnable),
804 false);
805 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
806 Twine(CurrentProgramInfo.TGIdYEnable),
807 false);
808 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
809 Twine(CurrentProgramInfo.TGIdZEnable),
810 false);
811 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
812 Twine(CurrentProgramInfo.TIdIGCompCount),
813 false);
814
815 [[maybe_unused]] int64_t PGMRSrc3;
816 assert(STM.hasGFX90AInsts() ||
817 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
818 PGMRSrc3) &&
819 static_cast<uint64_t>(PGMRSrc3) == 0));
820 if (STM.hasGFX90AInsts()) {
821 OutStreamer->emitRawComment(
822 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
823 getMCExprStr(MCKernelDescriptor::bits_get(
824 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
825 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
826 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
827 false);
828 OutStreamer->emitRawComment(
829 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
830 getMCExprStr(MCKernelDescriptor::bits_get(
831 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
832 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
833 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
834 false);
835 }
836 }
837
838 if (DumpCodeInstEmitter) {
839
840 OutStreamer->switchSection(
841 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
842
843 for (size_t i = 0; i < DisasmLines.size(); ++i) {
844 std::string Comment = "\n";
845 if (!HexLines[i].empty()) {
846 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
847 Comment += " ; " + HexLines[i] + "\n";
848 }
849
850 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
851 OutStreamer->emitBytes(StringRef(Comment));
852 }
853 }
854
855 return false;
856}
857
858// TODO: Fold this into emitFunctionBodyStart.
/// Establish the target streamer's target ID for module \p M.
///
/// The ID is first seeded from the global subtarget's feature string. For a
/// non-empty module, the functions are then scanned in order, and any xnack or
/// sramecc setting that is supported but still 'Any' is replaced by the
/// setting of the function's subtarget target ID. The scan stops as soon as
/// every supported feature has a definite On/Off setting.
859void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
860 // In the beginning all features are either 'Any' or 'NotSupported',
861 // depending on global target features. This will cover empty modules.
// NOTE(review): the start of this statement is not visible in this listing;
// only its final argument remains.
863 getGlobalSTI()->getFeatureString());
864
865 // If module is empty, we are done.
866 if (M.empty())
867 return;
868
869 // If module is not empty, need to find first 'Off' or 'On' feature
870 // setting per feature from functions in module.
871 for (auto &F : M) {
872 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Nothing left to refine: each feature is either unsupported or already
// pinned to On/Off.
873 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
874 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
875 break;
876
// NOTE(review): the declaration of STM is not visible in this listing;
// presumably the subtarget for F - confirm.
878 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only a setting that is still 'Any' is overwritten, so an earlier function's
// value is kept.
879 if (TSTargetID->isXnackSupported())
880 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
881 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
882 if (TSTargetID->isSramEccSupported())
883 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
884 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
885 }
886}
887
/// Compute the code size of \p MF by summing getInstSizeInBytes over every
/// instruction in every basic block, skipping debug instructions.
///
/// \returns the accumulated size as reported by SIInstrInfo.
888uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
// NOTE(review): the declaration of STM is not visible in this listing;
// presumably MF's GCN subtarget - confirm.
890 const SIInstrInfo *TII = STM.getInstrInfo();
891
892 uint64_t CodeSize = 0;
893
894 for (const MachineBasicBlock &MBB : MF) {
895 for (const MachineInstr &MI : MBB) {
896 // TODO: CodeSize should account for multiple functions.
897
898 // TODO: Should we count size of debug info?
// Debug instructions are excluded from the total.
899 if (MI.isDebugInstr())
900 continue;
901
902 CodeSize += TII->getInstSizeInBytes(MI);
903 }
904 }
905
906 return CodeSize;
907}
908
909// AccumOffset computed for the MCExpr equivalent of:
910// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
911static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
912 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
913 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
914
915 // Can't be lower than 1 for subsequent alignTo.
916 const MCExpr *MaximumTaken =
917 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
918
919 // Practically, it's computing divideCeil(MaximumTaken, 4).
920 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
921 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
922 Ctx);
923
924 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
925}
926
927void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
928 const MachineFunction &MF) {
930 MCContext &Ctx = MF.getContext();
931
932 auto CreateExpr = [&Ctx](int64_t Value) {
933 return MCConstantExpr::create(Value, Ctx);
934 };
935
936 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
937 int64_t Val;
938 if (Value->evaluateAsAbsolute(Val)) {
939 Res = Val;
940 return true;
941 }
942 return false;
943 };
944
945 auto GetSymRefExpr =
946 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
948 return MCSymbolRefExpr::create(Sym, Ctx);
949 };
950
952 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
953 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
955 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
956
957 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
958 ProgInfo.TgSplit = STM.isTgSplitEnabled();
959 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
960 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
961 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
962 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
963 ProgInfo.DynamicCallStack =
964 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
965 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
966
968
969 // The calculations related to SGPR/VGPR blocks are
970 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
971 // unified.
972 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
973 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
974 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
975
976 // Check the addressable register limit before we add ExtraSGPRs.
978 !STM.hasSGPRInitBug()) {
979 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
980 uint64_t NumSgpr;
981 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
982 NumSgpr > MaxAddressableNumSGPRs) {
983 // This can happen due to a compiler bug or when using inline asm.
986 MF.getFunction(), "addressable scalar registers", NumSgpr,
987 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
988 Ctx.diagnose(Diag);
989 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
990 }
991 }
992
993 // Account for extra SGPRs and VGPRs reserved for debugger use.
994 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
995
996 const Function &F = MF.getFunction();
997
998 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
999 // dispatch registers are function args.
1000 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1001
1002 if (isShader(F.getCallingConv())) {
1003 bool IsPixelShader =
1004 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1005
1006 // Calculate the number of VGPR registers based on the SPI input registers
1007 uint32_t InputEna = 0;
1008 uint32_t InputAddr = 0;
1009 unsigned LastEna = 0;
1010
1011 if (IsPixelShader) {
1012 // Note for IsPixelShader:
1013 // By this stage, all enabled inputs are tagged in InputAddr as well.
1014 // We will use InputAddr to determine whether the input counts against the
1015 // vgpr total and only use the InputEnable to determine the last input
1016 // that is relevant - if extra arguments are used, then we have to honour
1017 // the InputAddr for any intermediate non-enabled inputs.
1018 InputEna = MFI->getPSInputEnable();
1019 InputAddr = MFI->getPSInputAddr();
1020
1021 // We only need to consider input args up to the last used arg.
1022 assert((InputEna || InputAddr) &&
1023 "PSInputAddr and PSInputEnable should "
1024 "never both be 0 for AMDGPU_PS shaders");
1025 // There are some rare circumstances where InputAddr is non-zero and
1026 // InputEna can be set to 0. In this case we default to setting LastEna
1027 // to 1.
1028 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1029 }
1030
1031 // FIXME: We should be using the number of registers determined during
1032 // calling convention lowering to legalize the types.
1033 const DataLayout &DL = F.getDataLayout();
1034 unsigned PSArgCount = 0;
1035 unsigned IntermediateVGPR = 0;
1036 for (auto &Arg : F.args()) {
1037 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1038 if (Arg.hasAttribute(Attribute::InReg)) {
1039 WaveDispatchNumSGPR += NumRegs;
1040 } else {
1041 // If this is a PS shader and we're processing the PS Input args (first
1042 // 16 VGPR), use the InputEna and InputAddr bits to define how many
1043 // VGPRs are actually used.
1044 // Any extra VGPR arguments are handled as normal arguments (and
1045 // contribute to the VGPR count whether they're used or not).
1046 if (IsPixelShader && PSArgCount < 16) {
1047 if ((1 << PSArgCount) & InputAddr) {
1048 if (PSArgCount < LastEna)
1049 WaveDispatchNumVGPR += NumRegs;
1050 else
1051 IntermediateVGPR += NumRegs;
1052 }
1053 PSArgCount++;
1054 } else {
1055 // If there are extra arguments we have to include the allocation for
1056 // the non-used (but enabled with InputAddr) input arguments
1057 if (IntermediateVGPR) {
1058 WaveDispatchNumVGPR += IntermediateVGPR;
1059 IntermediateVGPR = 0;
1060 }
1061 WaveDispatchNumVGPR += NumRegs;
1062 }
1063 }
1064 }
1066 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1067
1069 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1070
1072 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1073 } else if (isKernel(F.getCallingConv()) &&
1075 // Consider cases where the total number of UserSGPRs with trailing
1076 // allocated preload SGPRs, is greater than the number of explicitly
1077 // referenced SGPRs.
1078 const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1079 CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1080 ProgInfo.NumSGPR =
1081 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
1082 }
1083
1084 // Adjust number of registers used to meet default/requested minimum/maximum
1085 // number of waves per execution unit request.
1086 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1087 ProgInfo.NumSGPRsForWavesPerEU =
1088 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1089 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1090 Ctx);
1091 ProgInfo.NumVGPRsForWavesPerEU =
1092 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1093 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
1094 Ctx);
1095
1097 STM.hasSGPRInitBug()) {
1098 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1099 uint64_t NumSgpr;
1100 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1101 NumSgpr > MaxAddressableNumSGPRs) {
1102 // This can happen due to a compiler bug or when using inline asm to use
1103 // the registers which are usually reserved for vcc etc.
1105 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1106 NumSgpr, MaxAddressableNumSGPRs,
1108 Ctx.diagnose(Diag);
1109 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1110 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1111 }
1112 }
1113
1114 if (STM.hasSGPRInitBug()) {
1115 ProgInfo.NumSGPR =
1117 ProgInfo.NumSGPRsForWavesPerEU =
1119 }
1120
1121 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1123 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1124 MFI->getNumUserSGPRs(),
1126 Ctx.diagnose(Diag);
1127 }
1128
1129 if (MFI->getLDSSize() >
1130 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
1133 MF.getFunction(), "local memory", MFI->getLDSSize(),
1135 Ctx.diagnose(Diag);
1136 }
1137 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1138 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1139 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1140 unsigned Granule) {
1141 const MCExpr *OneConst = CreateExpr(1ul);
1142 const MCExpr *GranuleConst = CreateExpr(Granule);
1143 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1144 const MCExpr *AlignToGPR =
1145 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1146 const MCExpr *DivGPR =
1147 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1148 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1149 return SubGPR;
1150 };
1151
1152 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1154 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1156
1157 const SIModeRegisterDefaults Mode = MFI->getMode();
1158
1159 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1160 // register.
1161 ProgInfo.FloatMode = getFPMode(Mode);
1162
1163 ProgInfo.IEEEMode = Mode.IEEE;
1164
1165 // Make the clamp modifier return 0 on NaN input.
1166 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1167
1168 unsigned LDSAlignShift;
1169 if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1170 // LDS is allocated in 320 dword blocks.
1171 LDSAlignShift = 11;
1172 } else if (STM.getFeatureBits().test(
1173 FeatureAddressableLocalMemorySize65536)) {
1174 // LDS is allocated in 128 dword blocks.
1175 LDSAlignShift = 9;
1176 } else {
1177 // LDS is allocated in 64 dword blocks.
1178 LDSAlignShift = 8;
1179 }
1180
1181 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1182 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1183
1184 ProgInfo.LDSSize = MFI->getLDSSize();
1185 ProgInfo.LDSBlocks =
1186 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1187
1188 // The MCExpr equivalent of divideCeil.
1189 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1190 const MCExpr *Ceil =
1191 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1192 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1193 };
1194
1195 // Scratch is allocated in 64-dword or 256-dword blocks.
1196 unsigned ScratchAlignShift =
1197 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1198 // We need to program the hardware with the amount of scratch memory that
1199 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1200 // scratch memory used per thread.
1201 ProgInfo.ScratchBlocks = DivideCeil(
1203 CreateExpr(STM.getWavefrontSize()), Ctx),
1204 CreateExpr(1ULL << ScratchAlignShift));
1205
1206 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1207 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1208 ProgInfo.MemOrdered = 1;
1209 }
1210
1211 // 0 = X, 1 = XY, 2 = XYZ
1212 unsigned TIDIGCompCnt = 0;
1213 if (MFI->hasWorkItemIDZ())
1214 TIDIGCompCnt = 2;
1215 else if (MFI->hasWorkItemIDY())
1216 TIDIGCompCnt = 1;
1217
1218 // The private segment wave byte offset is the last of the system SGPRs. We
1219 // initially assumed it was allocated, and may have used it. It shouldn't harm
1220 // anything to disable it if we know the stack isn't used here. We may still
1221 // have emitted code reading it to initialize scratch, but if that's unused
1222 // reading garbage should be OK.
1225 MCConstantExpr::create(0, Ctx), Ctx),
1226 ProgInfo.DynamicCallStack, Ctx);
1227
1228 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1229 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1230 ProgInfo.TrapHandlerEnable =
1231 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1232 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1233 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1234 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1235 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1236 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1237 ProgInfo.EXCPEnMSB = 0;
1238 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1239 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1240 ProgInfo.EXCPEnable = 0;
1241
1242 if (STM.hasGFX90AInsts()) {
1243 // return ((Dst & ~Mask) | (Value << Shift))
1244 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1245 uint32_t Shift) {
1246 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1247 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1248 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1250 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1251 return Dst;
1252 };
1253
1254 ProgInfo.ComputePGMRSrc3GFX90A =
1255 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1256 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1257 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1258 ProgInfo.ComputePGMRSrc3GFX90A =
1259 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1260 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1261 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1262 }
1263
1265 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1266 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1267
1268 const auto [MinWEU, MaxWEU] =
1269 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1270 uint64_t Occupancy;
1271 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1273 F, F.getSubprogram(),
1274 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1275 "'" +
1276 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1277 ", final occupancy is " + Twine(Occupancy));
1278 F.getContext().diagnose(Diag);
1279 }
1280}
1281
// Returns the register identifier that EmitProgramInfoSI emits for a function
// with the given calling convention.
// NOTE(review): listing lines 1285-1291 (the case labels and the return
// statements) are absent from this rendering, so the concrete mapping cannot
// be read from here.
1282static unsigned getRsrcReg(CallingConv::ID CallConv) {
1283 switch (CallConv) {
1284 default: [[fallthrough]];
1292 }
1293}
1294
// Writes the program-info words for MF to OutStreamer. Values come from
// CurrentProgramInfo; any MCExpr that already folds to a constant is emitted
// as a plain integer, anything else is emitted as an expression.
// NOTE(review): several lines of this function are absent from this rendering,
// including the condition that opens the first branch and the register tags
// emitted ahead of most values; the comments below describe only what is
// visible.
1295void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1296 const SIProgramInfo &CurrentProgramInfo) {
1298 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1299 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1300 MCContext &Ctx = MF.getContext();
1301
1302 // (((Value) & Mask) << Shift)
1303 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1304 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1305 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1307 shft, Ctx);
1308 };
1309
// Emit Value as a Size-byte integer when it evaluates to an absolute constant;
// otherwise hand the unresolved expression to the streamer.
1310 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1311 int64_t Val;
1312 if (Value->evaluateAsAbsolute(Val))
1313 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1314 else
1315 OutStreamer->emitValue(Value, Size);
1316 };
1317
1320
// First branch: the compute PGM_RSRC1 and PGM_RSRC2 values, 4 bytes each.
1321 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1322 /*Size=*/4);
1323
1325 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1326
1328
1329 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1330 // appropriate generation.
// The ScratchBlocks field is always placed at bit 12; only its mask differs:
// 0x3FFFF for GFX12 and later, 0x7FFF for GFX11, 0x1FFF otherwise.
1331 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1332 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1333 /*Mask=*/0x3FFFF, /*Shift=*/12),
1334 /*Size=*/4);
1335 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1336 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1337 /*Mask=*/0x7FFF, /*Shift=*/12),
1338 /*Size=*/4);
1339 } else {
1340 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1341 /*Mask=*/0x1FFF, /*Shift=*/12),
1342 /*Size=*/4);
1343 }
1344
1345 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1346 // 0" comment but I don't see a corresponding field in the register spec.
1347 } else {
// Second branch: the register selected by getRsrcReg, followed by one word
// that packs VGPRBlocks into bits 0-5 and SGPRBlocks into bits 6-9.
1348 OutStreamer->emitInt32(RsrcReg);
1349
1350 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1351 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1352 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1353 MF.getContext());
1354 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1356
1357 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1358 // appropriate generation.
1359 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1360 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1361 /*Mask=*/0x3FFFF, /*Shift=*/12),
1362 /*Size=*/4);
1363 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1364 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1365 /*Mask=*/0x7FFF, /*Shift=*/12),
1366 /*Size=*/4);
1367 } else {
1368 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1369 /*Mask=*/0x1FFF, /*Shift=*/12),
1370 /*Size=*/4);
1371 }
1372 }
1373
// From GFX11 onwards the LDS block count is halved (rounded up) before it is
// encoded as the extra LDS size; earlier generations use the count unchanged.
// NOTE(review): the condition guarding this section is not visible here.
1376 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1377 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1378 : CurrentProgramInfo.LDSBlocks;
1379 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1381 OutStreamer->emitInt32(MFI->getPSInputEnable());
1383 OutStreamer->emitInt32(MFI->getPSInputAddr());
1384 }
1385
// Finally emit the spill counts, each preceded by its R_SPILLED_* tag.
1386 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1387 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1388 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1389 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1390}
1391
1392// Helper function to add common PAL Metadata 3.0+
// Writes the hardware-stage fields for calling convention CC that both
// EmitPALMetadata and emitPALFunctionMetadata need: .ieee_mode (only when the
// subtarget has an IEEE mode), .wgp_mode, .mem_ordered, .lds_size and, for
// compute calling conventions only, .trap_present and .excp_en.
// NOTE(review): the first line of the signature (listing line 1393) is absent
// from this rendering.
1394 const SIProgramInfo &CurrentProgramInfo,
1395 CallingConv::ID CC, const GCNSubtarget &ST) {
1396 if (ST.hasIEEEMode())
1397 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1398
1399 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1400 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1401
1402 if (AMDGPU::isCompute(CC)) {
1403 MD->setHwStage(CC, ".trap_present",
1404 (bool)CurrentProgramInfo.TrapHandlerEnable);
1405 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1406 }
1407
// LdsSize is scaled by the subtarget's LDS dword granularity and by the size
// of a dword in bytes before being stored.
1408 MD->setHwStage(CC, ".lds_size",
1409 (unsigned)(CurrentProgramInfo.LdsSize *
1410 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1411}
1412
1413// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1414// is AMDPAL. It stores each compute/SPI register setting and other PAL
1415// metadata items into the PALMD::Metadata, combining with any provided by the
1416// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1417// is then written as a single block in the .note section.
//
// PAL metadata with a major version below 3 receives packed RSRC1/RSRC2
// register values; version 3 and above receives individual named fields.
// NOTE(review): listing lines 1420, 1460 and 1467 are absent from this
// rendering, so the declaration of MFI, the condition guarding the extra-LDS /
// PS-input section and the value passed to the second setRsrc2 call are not
// visible here.
1418void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1419 const SIProgramInfo &CurrentProgramInfo) {
1421 auto CC = MF.getFunction().getCallingConv();
1422 auto *MD = getTargetStreamer()->getPALMetadata();
1423 auto &Ctx = MF.getContext();
1424
1425 MD->setEntryPoint(CC, MF.getFunction().getName());
1426 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1427
1428 // Only set AGPRs for supported devices
1429 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1430 if (STM.hasMAIInsts()) {
1431 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1432 }
1433
1434 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
1435 if (MD->getPALMajorVersion() < 3) {
1436 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1437 if (AMDGPU::isCompute(CC)) {
1438 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1439 } else {
// Non-compute stages: RSRC2 carries only the scratch-enable field, set when
// ScratchBlocks is greater than zero.
1440 const MCExpr *HasScratchBlocks =
1441 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1442 MCConstantExpr::create(0, Ctx), Ctx);
1443 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1444 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1445 }
1446 } else {
1447 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1448 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1449 CurrentProgramInfo.ScratchEnable);
1450 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1451 }
1452
1453 // ScratchSize is in bytes, 16 aligned.
1454 MD->setScratchSize(
1455 CC,
1456 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1457 MCConstantExpr::create(16, Ctx), Ctx),
1458 Ctx);
1459
// From GFX11 onwards the LDS block count is halved (rounded up) to obtain the
// extra LDS size; earlier generations use the count unchanged.
1461 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1462 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1463 : CurrentProgramInfo.LDSBlocks;
1464 if (MD->getPALMajorVersion() < 3) {
1465 MD->setRsrc2(
1466 CC,
1468 Ctx);
1469 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1470 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1471 } else {
1472 // Graphics registers
// The extra LDS size is scaled by a granularity of 256 dwords on GFX11 and
// later (128 before) and by the dword size in bytes.
1473 const unsigned ExtraLdsDwGranularity =
1474 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1475 MD->setGraphicsRegisters(
1476 ".ps_extra_lds_size",
1477 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1478
1479 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
// Field names are listed in bit order: the entry at index Idx is set from bit
// Idx of PSInputEna and PSInputAddr respectively.
1480 static StringLiteral const PsInputFields[] = {
1481 ".persp_sample_ena", ".persp_center_ena",
1482 ".persp_centroid_ena", ".persp_pull_model_ena",
1483 ".linear_sample_ena", ".linear_center_ena",
1484 ".linear_centroid_ena", ".line_stipple_tex_ena",
1485 ".pos_x_float_ena", ".pos_y_float_ena",
1486 ".pos_z_float_ena", ".pos_w_float_ena",
1487 ".front_face_ena", ".ancillary_ena",
1488 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1489 unsigned PSInputEna = MFI->getPSInputEnable();
1490 unsigned PSInputAddr = MFI->getPSInputAddr();
1491 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1492 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1493 (bool)((PSInputEna >> Idx) & 1));
1494 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1495 (bool)((PSInputAddr >> Idx) & 1));
1496 }
1497 }
1498 }
1499
1500 // For version 3 and above the wave front size is already set in the metadata
1501 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1502 MD->setWave32(MF.getFunction().getCallingConv());
1503}
1504
// Records per-function PAL metadata for MF under the function's name: its
// scratch size (taken from the frame's stack size), the compute register
// settings, and its LDS size and VGPR/SGPR counts.
// NOTE(review): listing lines 1510 and 1516 are absent from this rendering;
// presumably line 1510 declares the ST and CurrentProgramInfo used below -
// confirm against the full source.
1505void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1506 auto *MD = getTargetStreamer()->getPALMetadata();
1507 const MachineFrameInfo &MFI = MF.getFrameInfo();
1508 StringRef FnName = MF.getFunction().getName();
1509 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1511 MCContext &Ctx = MF.getContext();
1512
// Both paths below use the AMDGPU_CS calling convention. PAL metadata before
// major version 3 takes packed RSRC1/RSRC2 values; later versions take the
// fields written by EmitPALMetadataCommon.
1513 if (MD->getPALMajorVersion() < 3) {
1514 // Set compute registers
1515 MD->setRsrc1(
1517 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1518 MD->setRsrc2(CallingConv::AMDGPU_CS,
1519 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1520 } else {
1521 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1522 }
1523
1524 // Set optional info
1525 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1526 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1527 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1528}
1529
1530// This is supposed to be log2(Size)
// Maps a private element size of 4, 8 or 16 to the matching AMD_ELEMENT_*
// enumerator. Any other size is treated as unreachable.
// NOTE(review): the signature line (listing line 1531) is absent from this
// rendering.
1532 switch (Size) {
1533 case 4:
1534 return AMD_ELEMENT_4_BYTES;
1535 case 8:
1536 return AMD_ELEMENT_8_BYTES;
1537 case 16:
1538 return AMD_ELEMENT_16_BYTES;
1539 default:
1540 llvm_unreachable("invalid private_element_size");
1541 }
1542}
1543
// Fills Out for the kernel MF, starting from Out.initDefault and then
// overriding fields with values from CurrentProgramInfo, the subtarget and the
// function's user-SGPR usage. MF must use the AMDGPU_KERNEL or SPIR_KERNEL
// calling convention (asserted).
// NOTE(review): many lines are absent from this rendering - the declaration of
// MFI, the left-hand sides of the RSRC1/RSRC2 assignments, and the statement
// controlled by each `if` on UserSGPRInfo / isXNACKEnabled - so the fields
// those statements write cannot be read from here.
1544void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1545 const SIProgramInfo &CurrentProgramInfo,
1546 const MachineFunction &MF) const {
1547 const Function &F = MF.getFunction();
1548 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1549 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1550
1552 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1553 MCContext &Ctx = MF.getContext();
1554
1555 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1556
1558 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1560 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1562
1563 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1564
1566 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1567
1568 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1569 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1571 }
1572
1573 if (UserSGPRInfo.hasDispatchPtr())
1575
1576 if (UserSGPRInfo.hasQueuePtr())
1578
1579 if (UserSGPRInfo.hasKernargSegmentPtr())
1581
1582 if (UserSGPRInfo.hasDispatchID())
1584
1585 if (UserSGPRInfo.hasFlatScratchInit())
1587
1588 if (UserSGPRInfo.hasPrivateSegmentSize())
1590
1591 if (STM.isXNACKEnabled())
1593
// getKernArgSegmentSize also reports the largest kernarg alignment through
// MaxKernArgAlign, which feeds the alignment field below.
1594 Align MaxKernArgAlign;
1595 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1596 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1597 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1598 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1599 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1600
1601 // kernarg_segment_alignment is specified as log of the alignment.
1602 // The minimum alignment is 16.
1603 // FIXME: The metadata treats the minimum as 4?
1604 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1605}
1606
1608 const char *ExtraCode, raw_ostream &O) {
// Returns false once the operand has been printed and true when the modifier
// or the operand kind is not handled.
// NOTE(review): the first signature line (listing line 1607), the register
// printing statements (1628-1629) and the condition that selects the decimal
// form for immediates (1634) are absent from this rendering.
1609 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1610 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1611 return false;
1612
// Only a single-character modifier is accepted, and 'r' is the only one
// recognised here.
1613 if (ExtraCode && ExtraCode[0]) {
1614 if (ExtraCode[1] != 0)
1615 return true; // Unknown modifier.
1616
1617 switch (ExtraCode[0]) {
1618 case 'r':
1619 break;
1620 default:
1621 return true;
1622 }
1623 }
1624
1625 // TODO: Should be able to support other operand types like globals.
1626 const MachineOperand &MO = MI->getOperand(OpNo);
1627 if (MO.isReg()) {
1630 return false;
1631 }
1632 if (MO.isImm()) {
1633 int64_t Val = MO.getImm();
// Immediates not printed in decimal are printed in hex using the narrowest of
// the 16-, 32- and 64-bit formats that holds the value as unsigned.
1635 O << Val;
1636 } else if (isUInt<16>(Val)) {
1637 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1638 } else if (isUInt<32>(Val)) {
1639 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1640 } else {
1641 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1642 }
1643 return false;
1644 }
1645 return true;
1646}
1647
1654}
1655
// Emits one "kernel-resource-usage" analysis remark per resource for MF:
// function name, SGPR and VGPR counts, AGPR count (only when hasMAIInsts),
// scratch size, dynamic-stack flag, occupancy, SGPR/VGPR spill counts and,
// for module entry functions only, the LDS size. Does nothing when no remark
// emitter (ORE) is available.
// NOTE(review): listing lines 1666-1667, 1671 and 1685 are absent from this
// rendering, so the conditions of the two further early returns and one
// argument of the remark constructor are not visible here.
1656void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1657 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1658 bool isModuleEntryFunction, bool hasMAIInsts) {
1659 if (!ORE)
1660 return;
1661
1662 const char *Name = "kernel-resource-usage";
1663 const char *Indent = " ";
1664
1665 // If the remark is not specifically enabled, do not output to yaml
1668 return;
1669
1670 // Currently non-kernel functions have no resources to emit.
1672 return;
1673
1674 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1675 StringRef RemarkLabel, auto Argument) {
1676 // Add an indent for every line besides the line with the kernel name. This
1677 // makes it easier to tell which resource usage goes with which kernel since
1678 // the kernel name will always be displayed first.
1679 std::string LabelStr = RemarkLabel.str() + ": ";
1680 if (RemarkName != "FunctionName")
1681 LabelStr = Indent + LabelStr;
1682
1683 ORE->emit([&]() {
1684 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1686 &MF.front())
1687 << LabelStr << ore::NV(RemarkName, Argument);
1688 });
1689 };
1690
1691 // FIXME: Formatting here is pretty nasty because clang does not accept
1692 // newlines from diagnostics. This forces us to emit multiple diagnostic
1693 // remarks to simulate newlines. If and when clang does accept newlines, this
1694 // formatting should be aggregated into one remark with newlines to avoid
1695 // printing multiple diagnostic location and diag opts.
1696 EmitResourceUsageRemark("FunctionName", "Function Name",
1697 MF.getFunction().getName());
1698 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1699 getMCExprStr(CurrentProgramInfo.NumSGPR));
1700 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1701 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1702 if (hasMAIInsts) {
1703 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1704 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1705 }
1706 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1707 getMCExprStr(CurrentProgramInfo.ScratchSize));
// The dynamic-stack flag is reported as "True" only when the expression folds
// to a non-zero constant; an unresolvable expression is reported as "False".
1708 int64_t DynStack;
1709 bool DynStackEvaluatable =
1710 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1711 StringRef DynamicStackStr =
1712 DynStackEvaluatable && DynStack ? "True" : "False";
1713 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1714 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1715 getMCExprStr(CurrentProgramInfo.Occupancy));
1716 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1717 CurrentProgramInfo.SGPRSpill);
1718 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1719 CurrentProgramInfo.VGPRSpill);
1720 if (isModuleEntryFunction)
1721 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1722 CurrentProgramInfo.LDSSize);
1723}
#define Success
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:128
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1089
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1227
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1209
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1125
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1201
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1160
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1222
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1112
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1111
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1120
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1159
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1098
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1220
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1162
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1241
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1208
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1219
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1103
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1242
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1097
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1122
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1096
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:69
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:83
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:86
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:408
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:697
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:719
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:89
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:92
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:104
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:459
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:652
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:450
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:404
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:116
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition: AsmPrinter.h:123
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition: AsmPrinter.h:107
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:96
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:101
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:257
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:692
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1874
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:623
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:627
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:317
bool dumpCode() const
Definition: GCNSubtarget.h:523
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:331
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:130
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:186
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:602
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:572
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:592
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:557
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:547
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Context object for machine code objects.
Definition: MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:416
MCSectionELF * getELFSection(const Twine &Section, unsigned Type, unsigned Flags)
Definition: MCContext.h:551
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1072
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:34
MCSection * getReadOnlySection() const
MCContext & getContext() const
void gatherResourceInfo(const MachineFunction &MF, const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI, MCContext &OutContext)
AMDGPUResourceUsageAnalysis gathers resource usage on a per-function granularity.
MCSymbol * getMaxSGPRSymbol(MCContext &OutContext)
MCSymbol * getMaxAGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx)
void finalize(MCContext &OutContext)
MCSymbol * getMaxVGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack, MCContext &Ctx)
MCSymbol * getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext)
This represents a section on Linux, many Unix variants, and some bare-metal systems.
Definition: MCSectionELF.h:27
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:150
MCContext & getContext() const
Definition: MCStreamer.h:300
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
const MCExpr * getVariableValue(bool SetUsed=true) const
getVariableValue - Get the value for variable symbols.
Definition: MCSymbol.h:305
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:102
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:467
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
This class contains meta information specific to a module.
MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated with IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:229
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
bool isModuleEntryFunctionCC(CallingConv::ID CC)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1098
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1375
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo() const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.