LLVM 23.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to print both assembly string and also binary
12/// code. When passed an MCAsmStreamer it prints assembly and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
44#include "llvm/MC/MCValue.h"
51
52using namespace llvm;
53using namespace llvm::AMDGPU;
54
55// This should get the default rounding mode from the kernel. We just set the
56// default here, but this could change if the OpenCL rounding mode pragmas are
57// used.
58//
59// The denormal mode here should match what is reported by the OpenCL runtime
60// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
61// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
62//
63// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
64// precision, and leaves single precision to flush all and does not report
65// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
66// CL_FP_DENORM for both.
67//
68// FIXME: It seems some instructions do not support single precision denormals
69// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
70// and sin_f32, cos_f32 on most parts).
71
72// We want to use these instructions, and using fp32 denormals also causes
73// instructions to run at the double precision rate for the device so it's
74// probably best to just report no single precision denormals.
81
82static AsmPrinter *
84 std::unique_ptr<MCStreamer> &&Streamer) {
85 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
86}
87
95
97 std::unique_ptr<MCStreamer> Streamer)
98 : AsmPrinter(TM, std::move(Streamer)) {
99 assert(OutStreamer && "AsmPrinter constructed without streamer");
100}
101
103 return "AMDGPU Assembly Printer";
104}
105
107 return TM.getMCSubtargetInfo();
108}
109
111 if (!OutStreamer)
112 return nullptr;
113 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
114}
115
119
120void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
122
123 // TODO: Which one is called first, emitStartOfAsmFile or
124 // emitFunctionBodyStart?
125 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
126 initializeTargetID(M);
127
130 return;
131
133
136 CodeObjectVersion);
137 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
138 }
139
142}
143
145 // Init target streamer if it has not yet happened
147 initTargetStreamer(M);
148
149 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
151
152 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
153 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
154 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
155 HSAMetadataStream->end();
156 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
157 (void)Success;
158 assert(Success && "Malformed HSA Metadata");
159 }
160}
161
163 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
164 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
165 const Function &F = MF->getFunction();
166
167 // TODO: We're checking this late, would be nice to check it earlier.
168 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
170 STM.getCPU() + " is only available on code object version 6 or better");
171 }
172
173 // TODO: Which one is called first, emitStartOfAsmFile or
174 // emitFunctionBodyStart?
175 if (!getTargetStreamer()->getTargetID())
176 initializeTargetID(*F.getParent());
177
178 const auto &FunctionTargetID = STM.getTargetID();
179 // Make sure function's xnack settings are compatible with module's
180 // xnack settings.
181 if (FunctionTargetID.isXnackSupported() &&
182 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
183 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
184 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
185 "' function does not match module xnack setting");
186 return;
187 }
188 // Make sure function's sramecc settings are compatible with module's
189 // sramecc settings.
190 if (FunctionTargetID.isSramEccSupported() &&
191 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
192 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
193 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
194 "' function does not match module sramecc setting");
195 return;
196 }
197
198 if (!MFI.isEntryFunction())
199 return;
200
201 if (STM.isMesaKernel(F) &&
202 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
203 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
204 AMDGPUMCKernelCodeT KernelCode;
205 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
206 KernelCode.validate(&STM, MF->getContext());
208 }
209
210 if (STM.isAmdHsaOS())
211 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
212}
213
215 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
216 if (!MFI.isEntryFunction())
217 return;
218
219 if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
220 return;
221
222 auto &Streamer = getTargetStreamer()->getStreamer();
223 auto &Context = Streamer.getContext();
224 auto &ObjectFileInfo = *Context.getObjectFileInfo();
225 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
226
227 Streamer.pushSection();
228 Streamer.switchSection(&ReadOnlySection);
229
230 // CP microcode requires the kernel descriptor to be allocated on 64 byte
231 // alignment.
232 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
233 ReadOnlySection.ensureMinAlignment(Align(64));
234
235 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
236
237 SmallString<128> KernelName;
238 getNameWithPrefix(KernelName, &MF->getFunction());
240 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
241 CurrentProgramInfo.NumVGPRsForWavesPerEU,
243 CurrentProgramInfo.NumSGPRsForWavesPerEU,
245 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
246 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
247 Context),
248 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
249
250 Streamer.popSection();
251}
252
254 Register RegNo = MI->getOperand(0).getReg();
255
257 raw_svector_ostream OS(Str);
258 OS << "implicit-def: "
259 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
260
261 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
262 OS << " : SGPR spill to VGPR lane";
263
264 OutStreamer->AddComment(OS.str());
265 OutStreamer->addBlankLine();
266}
267
269 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
271 return;
272 }
273
274 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
275 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
276 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
277 SmallString<128> SymbolName;
278 getNameWithPrefix(SymbolName, &MF->getFunction()),
280 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
281 }
282 if (DumpCodeInstEmitter) {
283 // Disassemble function name label to text.
284 DisasmLines.push_back(MF->getName().str() + ":");
285 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
286 HexLines.emplace_back("");
287 }
288
290}
291
293 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
294 // Write a line for the basic block label if it is not only fallthrough.
295 DisasmLines.push_back(
296 (Twine("BB") + Twine(getFunctionNumber())
297 + "_" + Twine(MBB.getNumber()) + ":").str());
298 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
299 HexLines.emplace_back("");
300 }
302}
303
306 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
307 OutContext.reportError({},
308 Twine(GV->getName()) +
309 ": unsupported initializer for address space");
310 return;
311 }
312
313 // LDS variables aren't emitted in HSA or PAL yet.
314 const Triple::OSType OS = TM.getTargetTriple().getOS();
315 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
316 return;
317
318 MCSymbol *GVSym = getSymbol(GV);
319
320 GVSym->redefineIfPossible();
321 if (GVSym->isDefined() || GVSym->isVariable())
322 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
323 "' is already defined");
324
325 const DataLayout &DL = GV->getDataLayout();
327 Align Alignment = GV->getAlign().value_or(Align(4));
328
329 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
330 emitLinkage(GV, GVSym);
331 auto *TS = getTargetStreamer();
332 TS->emitAMDGPULDS(GVSym, Size, Alignment);
333 return;
334 }
335
337}
338
340 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
341
342 if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
343 switch (CodeObjectVersion) {
345 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
346 break;
348 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
349 break;
351 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
352 break;
353 default:
354 reportFatalUsageError("unsupported code object version");
355 }
356 }
357
359}
360
361/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
362///
363/// Remove dependency on GCNSubtarget and depend only only the necessary values
364/// for said occupancy computation. Should match computeOccupancy implementation
365/// without passing \p STM on.
366const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
367 const MCExpr *NumVGPRs,
368 unsigned DynamicVGPRBlockSize,
369 const GCNSubtarget &STM, MCContext &Ctx) {
370 unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
371 unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
372 unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
373 unsigned Generation = STM.getGeneration();
374
375 auto CreateExpr = [&Ctx](unsigned Value) {
376 return MCConstantExpr::create(Value, Ctx);
377 };
378
380 {CreateExpr(MaxWaves), CreateExpr(Granule),
381 CreateExpr(TargetTotalNumVGPRs),
382 CreateExpr(Generation), CreateExpr(InitOcc),
383 NumSGPRs, NumVGPRs},
384 Ctx);
385}
386
387void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
388 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
389 return;
390
392 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
393 MCSymbol *FnSym = TM.getSymbol(&F);
394
395 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
396 int64_t Val;
397 if (Value->evaluateAsAbsolute(Val)) {
398 Res = Val;
399 return true;
400 }
401 return false;
402 };
403
404 const uint64_t MaxScratchPerWorkitem =
406 MCSymbol *ScratchSizeSymbol =
407 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
408 uint64_t ScratchSize;
409 if (ScratchSizeSymbol->isVariable() &&
410 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
411 ScratchSize > MaxScratchPerWorkitem) {
412 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
413 DS_Error);
414 F.getContext().diagnose(DiagStackSize);
415 }
416
417 // Validate addressable scalar registers (i.e., prior to added implicit
418 // SGPRs).
419 MCSymbol *NumSGPRSymbol =
420 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
422 !STM.hasSGPRInitBug()) {
423 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
424 uint64_t NumSgpr;
425 if (NumSGPRSymbol->isVariable() &&
426 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
427 NumSgpr > MaxAddressableNumSGPRs) {
428 F.getContext().diagnose(DiagnosticInfoResourceLimit(
429 F, "addressable scalar registers", NumSgpr, MaxAddressableNumSGPRs,
431 return;
432 }
433 }
434
435 MCSymbol *VCCUsedSymbol =
436 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
437 MCSymbol *FlatUsedSymbol =
438 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
439 uint64_t VCCUsed, FlatUsed, NumSgpr;
440
441 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
442 FlatUsedSymbol->isVariable() &&
443 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
444 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
445 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
446
447 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
448 // resolvable.
449 NumSgpr += IsaInfo::getNumExtraSGPRs(
450 &STM, VCCUsed, FlatUsed,
451 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
453 STM.hasSGPRInitBug()) {
454 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
455 if (NumSgpr > MaxAddressableNumSGPRs) {
456 F.getContext().diagnose(DiagnosticInfoResourceLimit(
457 F, "scalar registers", NumSgpr, MaxAddressableNumSGPRs, DS_Error,
459 return;
460 }
461 }
462
463 MCSymbol *NumVgprSymbol =
464 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
465 MCSymbol *NumAgprSymbol =
466 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
467 uint64_t NumVgpr, NumAgpr;
468
469 MachineModuleInfo &MMI =
471 MachineFunction *MF = MMI.getMachineFunction(F);
472 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
473 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
474 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
475 const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
476 unsigned MaxWaves = MFI.getMaxWavesPerEU();
477 uint64_t TotalNumVgpr =
478 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
479 uint64_t NumVGPRsForWavesPerEU =
480 std::max({TotalNumVgpr, (uint64_t)1,
481 (uint64_t)STM.getMinNumVGPRs(
482 MaxWaves, MFI.getDynamicVGPRBlockSize())});
483 uint64_t NumSGPRsForWavesPerEU = std::max(
484 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
485 const MCExpr *OccupancyExpr = createOccupancy(
486 STM.getOccupancyWithWorkGroupSizes(*MF).second,
487 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
488 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
490 uint64_t Occupancy;
491
492 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
493 F, "amdgpu-waves-per-eu", {0, 0}, true);
494
495 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
496 DiagnosticInfoOptimizationFailure Diag(
497 F, F.getSubprogram(),
498 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
499 "'" +
500 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
501 ", final occupancy is " + Twine(Occupancy));
502 F.getContext().diagnose(Diag);
503 return;
504 }
505 }
506 }
507}
508
510 // Pad with s_code_end to help tools and guard against instruction prefetch
511 // causing stale data in caches. Arguably this should be done by the linker,
512 // which is why this isn't done for Mesa.
513 // Don't do it if there is no code.
514 const MCSubtargetInfo &STI = *getGlobalSTI();
515 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
519 if (TextSect->hasInstructions()) {
520 OutStreamer->switchSection(TextSect);
522 }
523 }
524
525 // Assign expressions which can only be resolved when all other functions are
526 // known.
527 RI.finalize(OutContext);
528
529 // Switch section and emit all GPR maximums within the processed module.
530 OutStreamer->pushSection();
531 MCSectionELF *MaxGPRSection =
532 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
533 OutStreamer->switchSection(MaxGPRSection);
535 RI.getMaxVGPRSymbol(OutContext), RI.getMaxAGPRSymbol(OutContext),
536 RI.getMaxSGPRSymbol(OutContext), RI.getMaxNamedBarrierSymbol(OutContext));
537 OutStreamer->popSection();
538
539 for (Function &F : M.functions())
540 validateMCResourceInfo(F);
541
542 RI.reset();
543
545}
546
547SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
549 raw_svector_ostream OSS(Str);
550 auto &Streamer = getTargetStreamer()->getStreamer();
551 auto &Context = Streamer.getContext();
552 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
553 printAMDGPUMCExpr(New, OSS, MAI);
554 return Str;
555}
556
557// Print comments that apply to both callable functions and entry points.
558void AMDGPUAsmPrinter::emitCommonFunctionComments(
559 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
560 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
561 const AMDGPUMachineFunctionInfo *MFI) {
562 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
563 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
564 false);
565 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
566 if (NumAGPR && TotalNumVGPR) {
567 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
568 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
569 false);
570 }
571 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
572 false);
573 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
574 false);
575}
576
577const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
578 const MachineFunction &MF) const {
579 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
580 MCContext &Ctx = MF.getContext();
581 uint16_t KernelCodeProperties = 0;
582 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
583
584 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
585 KernelCodeProperties |=
586 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
587 }
588 if (UserSGPRInfo.hasDispatchPtr()) {
589 KernelCodeProperties |=
590 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
591 }
592 if (UserSGPRInfo.hasQueuePtr()) {
593 KernelCodeProperties |=
594 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
595 }
596 if (UserSGPRInfo.hasKernargSegmentPtr()) {
597 KernelCodeProperties |=
598 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
599 }
600 if (UserSGPRInfo.hasDispatchID()) {
601 KernelCodeProperties |=
602 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
603 }
604 if (UserSGPRInfo.hasFlatScratchInit()) {
605 KernelCodeProperties |=
606 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
607 }
608 if (UserSGPRInfo.hasPrivateSegmentSize()) {
609 KernelCodeProperties |=
610 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
611 }
612 if (MF.getSubtarget<GCNSubtarget>().isWave32()) {
613 KernelCodeProperties |=
614 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
615 }
616
617 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
618 // un-evaluatable at this point so it cannot be conditionally checked here.
619 // Instead, we'll directly shift the possibly unknown MCExpr into its place
620 // and bitwise-or it into KernelCodeProperties.
621 const MCExpr *KernelCodePropExpr =
622 MCConstantExpr::create(KernelCodeProperties, Ctx);
623 const MCExpr *OrValue = MCConstantExpr::create(
624 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
625 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
626 OrValue, Ctx);
627 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
628
629 return KernelCodePropExpr;
630}
631
632MCKernelDescriptor
633AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
634 const SIProgramInfo &PI) const {
635 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
636 const Function &F = MF.getFunction();
637 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
638 MCContext &Ctx = MF.getContext();
639
640 MCKernelDescriptor KernelDescriptor;
641
642 KernelDescriptor.group_segment_fixed_size =
644 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
645
646 Align MaxKernArgAlign;
647 KernelDescriptor.kernarg_size = MCConstantExpr::create(
648 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
649
650 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
651 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
652 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
653
654 int64_t PGM_Rsrc3 = 1;
655 bool EvaluatableRsrc3 =
656 CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGM_Rsrc3);
657 (void)PGM_Rsrc3;
658 (void)EvaluatableRsrc3;
660 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
661 static_cast<uint64_t>(PGM_Rsrc3) == 0);
662 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
663
664 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
665 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
666 Ctx);
667
668 return KernelDescriptor;
669}
670
672 // Init target streamer lazily on the first function so that previous passes
673 // can set metadata.
675 initTargetStreamer(*MF.getFunction().getParent());
676
677 ResourceUsage =
679 CurrentProgramInfo.reset(MF);
680
681 const AMDGPUMachineFunctionInfo *MFI =
682 MF.getInfo<AMDGPUMachineFunctionInfo>();
683 MCContext &Ctx = MF.getContext();
684
685 // The starting address of all shader programs must be 256 bytes aligned.
686 // Regular functions just need the basic required instruction alignment.
687 MF.ensureAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
688
690
691 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
693 // FIXME: This should be an explicit check for Mesa.
694 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
695 MCSectionELF *ConfigSection =
696 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
697 OutStreamer->switchSection(ConfigSection);
698 }
699
700 RI.gatherResourceInfo(MF, *ResourceUsage, OutContext);
701
702 if (MFI->isModuleEntryFunction()) {
703 getSIProgramInfo(CurrentProgramInfo, MF);
704 }
705
706 if (STM.isAmdPalOS()) {
707 if (MFI->isEntryFunction())
708 EmitPALMetadata(MF, CurrentProgramInfo);
709 else if (MFI->isModuleEntryFunction())
710 emitPALFunctionMetadata(MF);
711 } else if (!STM.isAmdHsaOS()) {
712 EmitProgramInfoSI(MF, CurrentProgramInfo);
713 }
714
715 DumpCodeInstEmitter = nullptr;
716 if (STM.dumpCode()) {
717 // For -dumpcode, get the assembler out of the streamer. This only works
718 // with -filetype=obj.
719 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
720 if (Assembler)
721 DumpCodeInstEmitter = Assembler->getEmitterPtr();
722 }
723
724 DisasmLines.clear();
725 HexLines.clear();
727
729
730 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
731 STM.hasMAIInsts());
732
733 {
736 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
737 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
738 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
739 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumNamedBarrier,
740 OutContext),
741 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
742 OutContext),
743 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
744 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
745 OutContext),
746 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
747 OutContext),
748 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
749 OutContext),
750 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
751 OutContext));
752 }
753
754 // Emit _dvgpr$ symbol when appropriate.
755 emitDVgprSymbol(MF);
756
757 if (isVerbose()) {
758 MCSectionELF *CommentSection =
759 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
760 OutStreamer->switchSection(CommentSection);
761
762 if (!MFI->isEntryFunction()) {
764 OutStreamer->emitRawComment(" Function info:", false);
765
766 emitCommonFunctionComments(
767 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
768 ->getVariableValue(),
769 STM.hasMAIInsts() ? RI.getSymbol(CurrentFnSym->getName(),
770 RIK::RIK_NumAGPR, OutContext)
771 ->getVariableValue()
772 : nullptr,
773 RI.createTotalNumVGPRs(MF, Ctx),
774 RI.createTotalNumSGPRs(
775 MF,
776 MF.getSubtarget<GCNSubtarget>().getTargetID().isXnackOnOrAny(),
777 Ctx),
778 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
780 ->getVariableValue(),
781 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
782 return false;
783 }
784
785 OutStreamer->emitRawComment(" Kernel info:", false);
786 emitCommonFunctionComments(
787 CurrentProgramInfo.NumArchVGPR,
788 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
789 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
790 CurrentProgramInfo.ScratchSize,
791 CurrentProgramInfo.getFunctionCodeSize(MF), MFI);
792
793 OutStreamer->emitRawComment(
794 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
795 OutStreamer->emitRawComment(
796 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
797 OutStreamer->emitRawComment(
798 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
799 " bytes/workgroup (compile time only)", false);
800
801 OutStreamer->emitRawComment(
802 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
803
804 OutStreamer->emitRawComment(
805 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
806
807 OutStreamer->emitRawComment(
808 " NumSGPRsForWavesPerEU: " +
809 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
810 false);
811 OutStreamer->emitRawComment(
812 " NumVGPRsForWavesPerEU: " +
813 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
814 false);
815
816 if (STM.hasGFX90AInsts()) {
817 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
818 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
819 AdjustedAccum = MCBinaryExpr::createMul(
820 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
821 OutStreamer->emitRawComment(
822 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
823 }
824
825 if (STM.hasGFX1250Insts())
826 OutStreamer->emitRawComment(
827 " NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
828 false);
829
830 OutStreamer->emitRawComment(
831 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
832
833 OutStreamer->emitRawComment(
834 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
835
836 OutStreamer->emitRawComment(
837 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
838 getMCExprStr(CurrentProgramInfo.ScratchEnable),
839 false);
840 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
841 Twine(CurrentProgramInfo.UserSGPR),
842 false);
843 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
844 Twine(CurrentProgramInfo.TrapHandlerEnable),
845 false);
846 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
847 Twine(CurrentProgramInfo.TGIdXEnable),
848 false);
849 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
850 Twine(CurrentProgramInfo.TGIdYEnable),
851 false);
852 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
853 Twine(CurrentProgramInfo.TGIdZEnable),
854 false);
855 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
856 Twine(CurrentProgramInfo.TIdIGCompCount),
857 false);
858
859 [[maybe_unused]] int64_t PGMRSrc3;
861 STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
862 (CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
863 static_cast<uint64_t>(PGMRSrc3) == 0));
864 if (STM.hasGFX90AInsts()) {
865 OutStreamer->emitRawComment(
866 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
867 getMCExprStr(MCKernelDescriptor::bits_get(
868 CurrentProgramInfo.ComputePGMRSrc3,
869 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
870 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
871 false);
872 OutStreamer->emitRawComment(
873 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
874 getMCExprStr(MCKernelDescriptor::bits_get(
875 CurrentProgramInfo.ComputePGMRSrc3,
876 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
877 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
878 false);
879 }
880 }
881
882 if (DumpCodeInstEmitter) {
883
884 OutStreamer->switchSection(
885 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
886
887 for (size_t i = 0; i < DisasmLines.size(); ++i) {
888 std::string Comment = "\n";
889 if (!HexLines[i].empty()) {
890 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
891 Comment += " ; " + HexLines[i] + "\n";
892 }
893
894 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
895 OutStreamer->emitBytes(StringRef(Comment));
896 }
897 }
898
899 return false;
900}
901
902// When appropriate, add a _dvgpr$ symbol, with the value of the function
903// symbol, plus an offset encoding one less than the number of VGPR blocks used
904// by the function in bits 5..3 of the symbol value. A "VGPR block" can be
905// either 16 VGPRs (for a max of 128), or 32 VGPRs (for a max of 256). This is
906// used by a front-end to have functions that are chained rather than called,
907// and a dispatcher that dynamically resizes the VGPR count before dispatching
908// to a function.
909void AMDGPUAsmPrinter::emitDVgprSymbol(MachineFunction &MF) {
911 if (MFI.isDynamicVGPREnabled() &&
913 MCContext &Ctx = MF.getContext();
914 unsigned BlockSize = MFI.getDynamicVGPRBlockSize();
915 MCValue NumVGPRs;
916 if (!CurrentProgramInfo.NumVGPRsForWavesPerEU->evaluateAsRelocatable(
917 NumVGPRs, nullptr) ||
918 !NumVGPRs.isAbsolute()) {
919 llvm_unreachable("unable to resolve NumVGPRs for _dvgpr$ symbol");
920 }
921 // Calculate number of VGPR blocks.
922 // Treat 0 VGPRs as 1 VGPR to avoid underflowing.
923 unsigned NumBlocks =
924 divideCeil(std::max(unsigned(NumVGPRs.getConstant()), 1U), BlockSize);
925
926 if (NumBlocks > 8) {
928 "too many DVGPR blocks for _dvgpr$ symbol for '" +
929 Twine(CurrentFnSym->getName()) + "'");
930 return;
931 }
932 unsigned EncodedNumBlocks = (NumBlocks - 1) << 3;
933 // Add to function symbol to create _dvgpr$ symbol.
934 const MCExpr *DVgprFuncVal = MCBinaryExpr::createAdd(
936 MCConstantExpr::create(EncodedNumBlocks, Ctx), Ctx);
937 MCSymbol *DVgprFuncSym =
938 Ctx.getOrCreateSymbol(Twine("_dvgpr$") + CurrentFnSym->getName());
939 OutStreamer->emitAssignment(DVgprFuncSym, DVgprFuncVal);
940 emitVisibility(DVgprFuncSym, MF.getFunction().getVisibility());
941 emitLinkage(&MF.getFunction(), DVgprFuncSym);
942 }
943}
944
945// TODO: Fold this into emitFunctionBodyStart.
// Seeds the target streamer's TargetID (XNACK / SRAM-ECC settings) from the
// global subtarget feature string, then, for a non-empty module, refines any
// 'Any' setting using the first function whose subtarget pins it On or Off.
// NOTE(review): rendered listing — source line 949 (the statement completed
// by the ")" below, presumably the initializeTargetID(...) call on the target
// streamer) is elided here; verify against the full source.
946void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
947 // In the beginning all features are either 'Any' or 'NotSupported',
948 // depending on global target features. This will cover empty modules.
950 getGlobalSTI()->getFeatureString());
951
952 // If module is empty, we are done.
953 if (M.empty())
954 return;
955
956 // If module is not empty, need to find first 'Off' or 'On' feature
957 // setting per feature from functions in module.
958 for (auto &F : M) {
959 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Stop early once both features are either unsupported or already resolved
// to a definite On/Off value.
960 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
961 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
962 break;
963
964 const GCNSubtarget &STM = TM.getSubtarget<GCNSubtarget>(F);
965 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
966 if (TSTargetID->isXnackSupported())
967 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
968 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
969 if (TSTargetID->isSramEccSupported())
970 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
971 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
972 }
973}
974
975// AccumOffset computed for the MCExpr equivalent of:
976// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
977static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
978 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
979 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
980
981 // Can't be lower than 1 for subsequent alignTo.
982 const MCExpr *MaximumTaken =
983 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
984
985 // Practically, it's computing divideCeil(MaximumTaken, 4).
986 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
987 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
988 Ctx);
989
990 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
991}
992
// Populates ProgInfo (the SI program resource descriptor: register counts,
// scratch/LDS sizing, mode bits, RSRC3 fields, occupancy) for MF, mostly as
// late-resolved MCExprs built from per-function resource-info symbols.
// Emits resource-limit diagnostics when resolvable counts exceed hardware
// limits. NOTE(review): rendered listing — several source lines are elided
// (visible as line-number gaps, e.g. 1017, 1020, 1048, 1074, 1082, 1085,
// 1102, 1113, 1121, 1123, 1130, 1137, 1154, 1158, 1161, 1211, 1236-1237,
// 1260, 1284); statements spanning those gaps are incomplete as shown.
993void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
994 const MachineFunction &MF) {
995 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
996 MCContext &Ctx = MF.getContext();
997
// Small helper: wrap an integer in an MCConstantExpr.
998 auto CreateExpr = [&Ctx](int64_t Value) {
999 return MCConstantExpr::create(Value, Ctx);
1000 };
1001
// Small helper: try to fold an MCExpr to an absolute value; true on success.
1002 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
1003 int64_t Val;
1004 if (Value->evaluateAsAbsolute(Val)) {
1005 Res = Val;
1006 return true;
1007 }
1008 return false;
1009 };
1010
// Small helper: reference the per-function resource-info symbol of the given
// kind as an MCExpr (resolved later by the resource-usage analysis).
1011 auto GetSymRefExpr =
1012 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
1013 MCSymbol *Sym = RI.getSymbol(CurrentFnSym->getName(), RIK, OutContext);
1014 return MCSymbolRefExpr::create(Sym, Ctx);
1015 };
1016
1018 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
1019 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
1021 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1022
1023 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
1024 ProgInfo.TgSplit = STM.isTgSplitEnabled();
1025 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
1026 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
1027 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
1028 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
1029 ProgInfo.DynamicCallStack =
1030 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
1031 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
1032
// Named-barrier count is rounded up to a granule of 4.
1033 const MCExpr *BarBlkConst = MCConstantExpr::create(4, Ctx);
1034 const MCExpr *AlignToBlk = AMDGPUMCExpr::createAlignTo(
1035 GetSymRefExpr(RIK::RIK_NumNamedBarrier), BarBlkConst, Ctx);
1036 ProgInfo.NamedBarCnt = MCBinaryExpr::createDiv(AlignToBlk, BarBlkConst, Ctx);
1037
1038 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1039
1040 // The calculations related to SGPR/VGPR blocks are
1041 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
1042 // unified.
1043 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
1044 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
1045 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
1046
1047 // Check the addressable register limit before we add ExtraSGPRs.
1049 !STM.hasSGPRInitBug()) {
1050 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1051 uint64_t NumSgpr;
1052 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1053 NumSgpr > MaxAddressableNumSGPRs) {
1054 // This can happen due to a compiler bug or when using inline asm.
1055 LLVMContext &Ctx = MF.getFunction().getContext();
1056 Ctx.diagnose(DiagnosticInfoResourceLimit(
1057 MF.getFunction(), "addressable scalar registers", NumSgpr,
1058 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit));
1059 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
1060 }
1061 }
1062
1063 // Account for extra SGPRs and VGPRs reserved for debugger use.
1064 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1065
1066 const Function &F = MF.getFunction();
1067
1068 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1069 // dispatch registers as function args.
1070 unsigned WaveDispatchNumSGPR = MFI->getNumWaveDispatchSGPRs(),
1071 WaveDispatchNumVGPR = MFI->getNumWaveDispatchVGPRs();
1072
1073 if (WaveDispatchNumSGPR) {
1075 {ProgInfo.NumSGPR,
1076 MCBinaryExpr::createAdd(CreateExpr(WaveDispatchNumSGPR), ExtraSGPRs,
1077 Ctx)},
1078 Ctx);
1079 }
1080
1081 if (WaveDispatchNumVGPR) {
1083 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1084
1086 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1087 }
1088
1089 // Adjust number of registers used to meet default/requested minimum/maximum
1090 // number of waves per execution unit request.
1091 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1092 ProgInfo.NumSGPRsForWavesPerEU =
1093 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1094 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1095 Ctx);
1096 ProgInfo.NumVGPRsForWavesPerEU =
1097 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1098 CreateExpr(STM.getMinNumVGPRs(
1099 MaxWaves, MFI->getDynamicVGPRBlockSize()))},
1100 Ctx);
1101
// Second limit check, after ExtraSGPRs were added (condition head elided).
1103 STM.hasSGPRInitBug()) {
1104 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1105 uint64_t NumSgpr;
1106 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1107 NumSgpr > MaxAddressableNumSGPRs) {
1108 // This can happen due to a compiler bug or when using inline asm to use
1109 // the registers which are usually reserved for vcc etc.
1110 LLVMContext &Ctx = MF.getFunction().getContext();
1111 Ctx.diagnose(DiagnosticInfoResourceLimit(
1112 MF.getFunction(), "scalar registers", NumSgpr, MaxAddressableNumSGPRs,
1114 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1115 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1116 }
1117 }
1118
// On subtargets with the SGPR-init hardware bug, force fixed SGPR counts
// (right-hand sides elided in this listing).
1119 if (STM.hasSGPRInitBug()) {
1120 ProgInfo.NumSGPR =
1122 ProgInfo.NumSGPRsForWavesPerEU =
1124 }
1125
1126 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1127 LLVMContext &Ctx = MF.getFunction().getContext();
1128 Ctx.diagnose(DiagnosticInfoResourceLimit(
1129 MF.getFunction(), "user SGPRs", MFI->getNumUserSGPRs(),
1131 }
1132
1133 if (MFI->getLDSSize() > STM.getAddressableLocalMemorySize()) {
1134 LLVMContext &Ctx = MF.getFunction().getContext();
1135 Ctx.diagnose(DiagnosticInfoResourceLimit(
1136 MF.getFunction(), "local memory", MFI->getLDSSize(),
1138 }
1139 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1140 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1141 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1142 unsigned Granule) {
1143 const MCExpr *OneConst = CreateExpr(1ul);
1144 const MCExpr *GranuleConst = CreateExpr(Granule);
1145 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1146 const MCExpr *AlignToGPR =
1147 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1148 const MCExpr *DivGPR =
1149 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1150 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1151 return SubGPR;
1152 };
1153 // GFX10+ will always allocate 128 SGPRs and this field must be 0
1155 ProgInfo.SGPRBlocks = CreateExpr(0ul);
1156 } else {
1157 ProgInfo.SGPRBlocks = GetNumGPRBlocks(
1159 }
1160 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1162
1163 const SIModeRegisterDefaults Mode = MFI->getMode();
1164
1165 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1166 // register.
1167 ProgInfo.FloatMode = getFPMode(Mode);
1168
1169 ProgInfo.IEEEMode = Mode.IEEE;
1170
1171 // Make clamp modifier on NaN input returns 0.
1172 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1173
// LDS allocation granularity (in dwords) determines the block-size shift
// used to convert byte sizes into LDS blocks below.
1174 unsigned LDSAlignShift = 8;
1175 switch (getLdsDwGranularity(STM)) {
1176 case 512:
1177 case 320:
1178 LDSAlignShift = 11;
1179 break;
1180 case 128:
1181 LDSAlignShift = 9;
1182 break;
1183 case 64:
1184 LDSAlignShift = 8;
1185 break;
1186 default:
1187 llvm_unreachable("invald LDS block size");
1188 }
1189
1190 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1191 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1192
1193 ProgInfo.LDSSize = MFI->getLDSSize();
1194 ProgInfo.LDSBlocks =
1195 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1196
1197 // The MCExpr equivalent of divideCeil.
1198 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1199 const MCExpr *Ceil =
1200 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1201 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1202 };
1203
1204 // Scratch is allocated in 64-dword or 256-dword blocks.
1205 unsigned ScratchAlignShift =
1206 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1207 // We need to program the hardware with the amount of scratch memory that
1208 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1209 // scratch memory used per thread.
1210 ProgInfo.ScratchBlocks = DivideCeil(
1212 CreateExpr(STM.getWavefrontSize()), Ctx),
1213 CreateExpr(1ULL << ScratchAlignShift));
1214
1215 if (STM.supportsWGP()) {
1216 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1217 }
1218
1219 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1220 ProgInfo.MemOrdered = 1;
1221 ProgInfo.FwdProgress = !F.hasFnAttribute("amdgpu-no-fwd-progress");
1222 }
1223
1224 // 0 = X, 1 = XY, 2 = XYZ
1225 unsigned TIDIGCompCnt = 0;
1226 if (MFI->hasWorkItemIDZ())
1227 TIDIGCompCnt = 2;
1228 else if (MFI->hasWorkItemIDY())
1229 TIDIGCompCnt = 1;
1230
1231 // The private segment wave byte offset is the last of the system SGPRs. We
1232 // initially assumed it was allocated, and may have used it. It shouldn't harm
1233 // anything to disable it if we know the stack isn't used here. We may still
1234 // have emitted code reading it to initialize scratch, but if that's unused
1235 // reading garbage should be OK.
1238 MCConstantExpr::create(0, Ctx), Ctx),
1239 ProgInfo.DynamicCallStack, Ctx);
1240
1241 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1242 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1243 ProgInfo.TrapHandlerEnable = STM.isAmdHsaOS() ? 0 : STM.hasTrapHandler();
1244 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1245 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1246 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1247 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1248 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1249 ProgInfo.EXCPEnMSB = 0;
1250 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1251 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1252 ProgInfo.EXCPEnable = 0;
1253
1254 // return ((Dst & ~Mask) | (Value << Shift))
1255 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1256 uint32_t Shift) {
1257 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1258 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1259 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1261 Ctx);
1262 return Dst;
1263 };
1264
1265 if (STM.hasGFX90AInsts()) {
1266 ProgInfo.ComputePGMRSrc3 =
1267 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.AccumOffset,
1268 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1269 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1270 ProgInfo.ComputePGMRSrc3 =
1271 SetBits(ProgInfo.ComputePGMRSrc3, CreateExpr(ProgInfo.TgSplit),
1272 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1273 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1274 }
1275
1276 if (STM.hasGFX1250Insts())
1277 ProgInfo.ComputePGMRSrc3 =
1278 SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
1279 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
1280 amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
1281
1282 ProgInfo.Occupancy = createOccupancy(
1283 STM.computeOccupancy(F, ProgInfo.LDSSize).second,
1285 MFI->getDynamicVGPRBlockSize(), STM, Ctx);
1286
// Warn (optimization-failure diagnostic) when the resolved occupancy misses
// the user-requested "amdgpu-waves-per-eu" minimum.
1287 const auto [MinWEU, MaxWEU] =
1288 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1289 uint64_t Occupancy;
1290 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1291 DiagnosticInfoOptimizationFailure Diag(
1292 F, F.getSubprogram(),
1293 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1294 "'" +
1295 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1296 ", final occupancy is " + Twine(Occupancy));
1297 F.getContext().diagnose(Diag);
1298 }
1299
// GFX11+: encode an instruction-prefetch size hint (in 128-byte lines,
// clamped to the field width) into COMPUTE_PGM_RSRC3.
1300 if (isGFX11Plus(STM)) {
1301 uint32_t CodeSizeInBytes = (uint32_t)std::min(
1302 ProgInfo.getFunctionCodeSize(MF, true /* IsLowerBound */),
1303 (uint64_t)std::numeric_limits<uint32_t>::max());
1304 uint32_t CodeSizeInLines = divideCeil(CodeSizeInBytes, 128);
1305 uint32_t Field, Shift, Width;
1306 if (isGFX11(STM)) {
1307 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
1308 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
1309 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
1310 } else {
1311 Field = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
1312 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
1313 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
1314 }
1315 uint64_t InstPrefSize = std::min(CodeSizeInLines, (1u << Width) - 1);
1316 ProgInfo.ComputePGMRSrc3 = SetBits(ProgInfo.ComputePGMRSrc3,
1317 CreateExpr(InstPrefSize), Field, Shift);
1318 }
1319}
1320
1333
// Emits the non-HSA (e.g. Mesa) program-info register writes for MF: RSRC1/2,
// scratch sizing, and for compute vs. graphics shaders the appropriate
// register/value pairs, followed by spill-count pseudo-registers.
// NOTE(review): rendered listing — some source lines are elided (e.g. 1345,
// 1358, 1363, 1366, 1394, 1414, 1419, 1421); statements spanning those gaps
// are incomplete as shown.
1334void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1335 const SIProgramInfo &CurrentProgramInfo) {
1336 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1337 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1338 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1339 MCContext &Ctx = MF.getContext();
1340
1341 // (((Value) & Mask) << Shift)
1342 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1343 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1344 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1346 shft, Ctx);
1347 };
1348
// Emit a resolved integer when the expression folds to an absolute value;
// otherwise emit the relocatable expression itself.
1349 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1350 int64_t Val;
1351 if (Value->evaluateAsAbsolute(Val))
1352 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1353 else
1354 OutStreamer->emitValue(Value, Size);
1355 };
1356
1357 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1359
1360 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1361 /*Size=*/4);
1362
1364 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1365
1367
1368 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1369 // appropriate generation.
1370 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1371 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1372 /*Mask=*/0x3FFFF, /*Shift=*/12),
1373 /*Size=*/4);
1374 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1375 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1376 /*Mask=*/0x7FFF, /*Shift=*/12),
1377 /*Size=*/4);
1378 } else {
1379 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1380 /*Mask=*/0x1FFF, /*Shift=*/12),
1381 /*Size=*/4);
1382 }
1383
1384 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1385 // 0" comment but I don't see a corresponding field in the register spec.
1386 } else {
1387 OutStreamer->emitInt32(RsrcReg);
1388
// Graphics path: pack VGPR and SGPR block counts into one RSRC1-style word.
1389 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1390 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1391 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1392 MF.getContext());
1393 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1395
1396 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1397 // appropriate generation.
1398 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1399 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1400 /*Mask=*/0x3FFFF, /*Shift=*/12),
1401 /*Size=*/4);
1402 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1403 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1404 /*Mask=*/0x7FFF, /*Shift=*/12),
1405 /*Size=*/4);
1406 } else {
1407 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1408 /*Mask=*/0x1FFF, /*Shift=*/12),
1409 /*Size=*/4);
1410 }
1411 }
1412
// Pixel shaders additionally report extra LDS size and SPI PS input state.
1413 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1415 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1416 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1417 : CurrentProgramInfo.LDSBlocks;
1418 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1420 OutStreamer->emitInt32(MFI->getPSInputEnable());
1422 OutStreamer->emitInt32(MFI->getPSInputAddr());
1423 }
1424
1425 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1426 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1427 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1428 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1429}
1430
1431// Helper function to add common PAL Metadata 3.0+
// Writes the hardware-stage fields shared by EmitPALMetadata and
// emitPALFunctionMetadata (ieee/wgp/mem_ordered/forward_progress; plus
// trap/exception/dynamic-VGPR and LDS size for compute stages).
// NOTE(review): rendered listing — source line 1432 (the function signature
// head) and line 1452 (presumably the MD->set... call completed by the
// ".lds_size" arguments below) are elided here.
1433 const SIProgramInfo &CurrentProgramInfo,
1434 CallingConv::ID CC, const GCNSubtarget &ST,
1435 unsigned DynamicVGPRBlockSize) {
1436 if (ST.hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1437 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1438
1439 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1440 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1441 MD->setHwStage(CC, ".forward_progress", (bool)CurrentProgramInfo.FwdProgress);
1442
1443 if (AMDGPU::isCompute(CC)) {
1444 MD->setHwStage(CC, ".trap_present",
1445 (bool)CurrentProgramInfo.TrapHandlerEnable);
1446 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1447
1448 if (DynamicVGPRBlockSize != 0)
1449 MD->setComputeRegisters(".dynamic_vgpr_en", true);
1450 }
1451
// LdsSize is in LDS blocks; convert to bytes via the subtarget's dword
// granularity.
1453 CC, ".lds_size",
1454 (unsigned)(CurrentProgramInfo.LdsSize * getLdsDwGranularity(ST) *
1455 sizeof(uint32_t)));
1456}
1457
1458// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1459// is AMDPAL. It stores each compute/SPI register setting and other PAL
1460// metadata items into the PALMD::Metadata, combining with any provided by the
1461// frontend as LLVM metadata. Once all functions are written, the PAL metadata
1462// is then written as a single block in the .note section.
// NOTE(review): rendered listing — some source lines are elided (e.g. 1477,
// 1479, 1503, 1520); statements spanning those gaps are incomplete as shown.
1463void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1464 const SIProgramInfo &CurrentProgramInfo) {
1465 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1466 auto CC = MF.getFunction().getCallingConv();
1467 auto *MD = getTargetStreamer()->getPALMetadata();
1468 auto &Ctx = MF.getContext();
1469
1470 MD->setEntryPoint(CC, MF.getFunction().getName());
1471 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1472
1473 // For targets that support dynamic VGPRs, set the number of saved dynamic
1474 // VGPRs (if any) in the PAL metadata.
1475 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1476 if (MFI->isDynamicVGPREnabled() &&
1478 MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1480
1481 // Only set AGPRs for supported devices
1482 if (STM.hasMAIInsts()) {
1483 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1484 }
1485
1486 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL metadata < 3.0 uses raw RSRC1/RSRC2 register values; 3.0+ uses named
// hardware-stage fields via EmitPALMetadataCommon.
1487 if (MD->getPALMajorVersion() < 3) {
1488 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1489 if (AMDGPU::isCompute(CC)) {
1490 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1491 } else {
1492 const MCExpr *HasScratchBlocks =
1493 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1494 MCConstantExpr::create(0, Ctx), Ctx);
1495 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1496 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1497 }
1498 } else {
1499 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1500 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1501 CurrentProgramInfo.ScratchEnable);
1502 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
1504 }
1505
1506 // ScratchSize is in bytes, 16 aligned.
1507 MD->setScratchSize(
1508 CC,
1509 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1510 MCConstantExpr::create(16, Ctx), Ctx),
1511 Ctx);
1512
1513 if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1514 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1515 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1516 : CurrentProgramInfo.LDSBlocks;
1517 if (MD->getPALMajorVersion() < 3) {
1518 MD->setRsrc2(
1519 CC,
1521 Ctx);
1522 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1523 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1524 } else {
1525 // Graphics registers
1526 const unsigned ExtraLdsDwGranularity =
1527 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1528 MD->setGraphicsRegisters(
1529 ".ps_extra_lds_size",
1530 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1531
1532 // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
1533 static StringLiteral const PsInputFields[] = {
1534 ".persp_sample_ena", ".persp_center_ena",
1535 ".persp_centroid_ena", ".persp_pull_model_ena",
1536 ".linear_sample_ena", ".linear_center_ena",
1537 ".linear_centroid_ena", ".line_stipple_tex_ena",
1538 ".pos_x_float_ena", ".pos_y_float_ena",
1539 ".pos_z_float_ena", ".pos_w_float_ena",
1540 ".front_face_ena", ".ancillary_ena",
1541 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1542 unsigned PSInputEna = MFI->getPSInputEnable();
1543 unsigned PSInputAddr = MFI->getPSInputAddr();
// Mirror each bit of the enable/addr masks into its named boolean field.
1544 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1545 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1546 (bool)((PSInputEna >> Idx) & 1));
1547 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1548 (bool)((PSInputAddr >> Idx) & 1));
1549 }
1550 }
1551 }
1552
1553 // For version 3 and above the wave front size is already set in the metadata
1554 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1555 MD->setWave32(MF.getFunction().getCallingConv());
1556}
1557
// Records per-function (non-entry) PAL metadata: stack/scratch size, LDS
// size, and register usage, plus compute RSRC registers (pre-3.0) or common
// hardware-stage fields (3.0+).
// NOTE(review): rendered listing — source lines 1569 and 1574 are elided;
// the setRsrc1/EmitPALMetadataCommon calls are incomplete as shown.
1558void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1559 auto *MD = getTargetStreamer()->getPALMetadata();
1560 const MachineFrameInfo &MFI = MF.getFrameInfo();
1561 StringRef FnName = MF.getFunction().getName();
1562 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1563 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1564 MCContext &Ctx = MF.getContext();
1565
1566 if (MD->getPALMajorVersion() < 3) {
1567 // Set compute registers
1568 MD->setRsrc1(
1570 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1571 MD->setRsrc2(CallingConv::AMDGPU_CS,
1572 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1573 } else {
1575 MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
1576 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
1577 }
1578
1579 // Set optional info
1580 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1581 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1582 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1583}
1584
1585// This is supposed to be log2(Size)
// Maps a private element byte size (4/8/16) to its amd_element_byte_size_t
// enumerator; any other value is a compiler invariant violation.
// NOTE(review): rendered listing — source line 1586 (the signature,
// 'static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)')
// is elided here.
1587 switch (Size) {
1588 case 4:
1589 return AMD_ELEMENT_4_BYTES;
1590 case 8:
1591 return AMD_ELEMENT_8_BYTES;
1592 case 16:
1593 return AMD_ELEMENT_16_BYTES;
1594 default:
1595 llvm_unreachable("invalid private_element_size");
1596 }
1597}
1598
// Fills the amd_kernel_code_t (Out) for an AMDGPU/SPIR kernel from the
// computed program info: RSRC registers, code properties for enabled user
// SGPRs, register/scratch/LDS byte counts, and kernarg segment layout.
// NOTE(review): rendered listing — several source lines are elided (e.g.
// 1612, 1614, 1616, 1620, 1625, 1629, 1632, 1635, 1638, 1641, 1644, 1647);
// the left-hand sides / AMD_HSA_BITS_SET calls completed by the fragments
// below are incomplete as shown.
1599void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1600 const SIProgramInfo &CurrentProgramInfo,
1601 const MachineFunction &MF) const {
1602 const Function &F = MF.getFunction();
1603 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1604 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1605
1606 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1607 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1608 MCContext &Ctx = MF.getContext();
1609
1610 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1611
1613 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1615 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1617
1618 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1619
1621 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1622
// Set one code-property flag per enabled user-SGPR input.
1623 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1624 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1626 }
1627
1628 if (UserSGPRInfo.hasDispatchPtr())
1630
1631 if (UserSGPRInfo.hasQueuePtr())
1633
1634 if (UserSGPRInfo.hasKernargSegmentPtr())
1636
1637 if (UserSGPRInfo.hasDispatchID())
1639
1640 if (UserSGPRInfo.hasFlatScratchInit())
1642
1643 if (UserSGPRInfo.hasPrivateSegmentSize())
1645
1646 if (STM.isXNACKEnabled())
1648
1649 Align MaxKernArgAlign;
1650 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1651 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1652 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1653 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1654 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1655
1656 // kernarg_segment_alignment is specified as log of the alignment.
1657 // The minimum alignment is 16.
1658 // FIXME: The metadata treats the minimum as 4?
1659 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1660}
1661
// Inline-asm operand printer: defers to the generic AsmPrinter first, then
// handles the 'r' modifier for register operands and prints immediates in
// decimal or the narrowest hex form. Returns false on success (AsmPrinter
// convention), true for unsupported operands/modifiers.
// NOTE(review): rendered listing — source line 1662 (the signature head,
// 'bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned
// OpNo,') and line 1689 (the condition guarding the decimal print,
// presumably a small-value check) are elided here.
1663 const char *ExtraCode, raw_ostream &O) {
1664 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1665 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1666 return false;
1667
1668 if (ExtraCode && ExtraCode[0]) {
1669 if (ExtraCode[1] != 0)
1670 return true; // Unknown modifier.
1671
1672 switch (ExtraCode[0]) {
1673 case 'r':
1674 break;
1675 default:
1676 return true;
1677 }
1678 }
1679
1680 // TODO: Should be able to support other operand types like globals.
1681 const MachineOperand &MO = MI->getOperand(OpNo);
1682 if (MO.isReg()) {
1684 *MF->getSubtarget().getRegisterInfo());
1685 return false;
1686 }
1687 if (MO.isImm()) {
1688 int64_t Val = MO.getImm();
1690 O << Val;
1691 } else if (isUInt<16>(Val)) {
1692 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1693 } else if (isUInt<32>(Val)) {
1694 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1695 } else {
1696 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1697 }
1698 return false;
1699 }
1700 return true;
1701}
1702
1710
// Emits the "kernel-resource-usage" optimization-analysis remarks (SGPRs,
// VGPRs, AGPRs, scratch, dynamic stack, occupancy, spills, LDS) for MF, one
// remark per line since clang rejects newlines in diagnostics.
// NOTE(review): rendered listing — source lines 1721, 1726, and 1740 are
// elided; the guarded conditions/constructor arguments they carry are
// incomplete as shown.
1711void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1712 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1713 bool isModuleEntryFunction, bool hasMAIInsts) {
1714 if (!ORE)
1715 return;
1716
1717 const char *Name = "kernel-resource-usage";
1718 const char *Indent = " ";
1719
1720 // If the remark is not specifically enabled, do not output to yaml
1722 if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
1723 return;
1724
1725 // Currently non-kernel functions have no resources to emit.
1727 return;
1728
1729 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1730 StringRef RemarkLabel, auto Argument) {
1731 // Add an indent for every line besides the line with the kernel name. This
1732 // makes it easier to tell which resource usage go with which kernel since
1733 // the kernel name will always be displayed first.
1734 std::string LabelStr = RemarkLabel.str() + ": ";
1735 if (RemarkName != "FunctionName")
1736 LabelStr = Indent + LabelStr;
1737
1738 ORE->emit([&]() {
1739 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1741 &MF.front())
1742 << LabelStr << ore::NV(RemarkName, Argument);
1743 });
1744 };
1745
1746 // FIXME: Formatting here is pretty nasty because clang does not accept
1747 // newlines from diagnostics. This forces us to emit multiple diagnostic
1748 // remarks to simulate newlines. If and when clang does accept newlines, this
1749 // formatting should be aggregated into one remark with newlines to avoid
1750 // printing multiple diagnostic location and diag opts.
1751 EmitResourceUsageRemark("FunctionName", "Function Name",
1752 MF.getFunction().getName());
1753 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1754 getMCExprStr(CurrentProgramInfo.NumSGPR));
1755 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1756 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1757 if (hasMAIInsts) {
1758 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1759 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1760 }
1761 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1762 getMCExprStr(CurrentProgramInfo.ScratchSize));
// Dynamic stack is reported "True" only when the expression resolves to a
// non-zero absolute value.
1763 int64_t DynStack;
1764 bool DynStackEvaluatable =
1765 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1766 StringRef DynamicStackStr =
1767 DynStackEvaluatable && DynStack ? "True" : "False";
1768 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1769 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1770 getMCExprStr(CurrentProgramInfo.Occupancy));
1771 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1772 CurrentProgramInfo.SGPRSpill);
1773 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1774 CurrentProgramInfo.VGPRSpill);
1775 if (isModuleEntryFunction)
1776 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1777 CurrentProgramInfo.LDSSize);
1778}
1779
// Legacy pass-manager identity token and pass registration for the AMDGPU
// assembly printer.
1780char AMDGPUAsmPrinter::ID = 0;
1781
1782INITIALIZE_PASS(AMDGPUAsmPrinter, "amdgpu-asm-printer",
1783 "AMDGPU Assembly Printer", false, false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST, unsigned DynamicVGPRBlockSize)
const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static unsigned getRsrcReg(CallingConv::ID CallConv)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
OptimizedStructLayoutField Field
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
R600 Assembly printer class.
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition SIDefines.h:1144
#define R_0286E8_SPI_TMPRING_SIZE
Definition SIDefines.h:1282
#define FP_ROUND_MODE_DP(x)
Definition SIDefines.h:1264
#define C_00B84C_SCRATCH_EN
Definition SIDefines.h:1180
#define FP_ROUND_ROUND_TO_NEAREST
Definition SIDefines.h:1256
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition SIDefines.h:1215
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition SIDefines.h:1277
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition SIDefines.h:1167
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition SIDefines.h:1166
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition SIDefines.h:1175
#define R_0286CC_SPI_PS_INPUT_ENA
Definition SIDefines.h:1214
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition SIDefines.h:1153
#define FP_DENORM_MODE_DP(x)
Definition SIDefines.h:1275
#define R_00B848_COMPUTE_PGM_RSRC1
Definition SIDefines.h:1217
#define R_SPILLED_SGPRS
Definition SIDefines.h:1296
#define FP_ROUND_MODE_SP(x)
Definition SIDefines.h:1263
#define FP_DENORM_MODE_SP(x)
Definition SIDefines.h:1274
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition SIDefines.h:1158
#define R_SPILLED_VGPRS
Definition SIDefines.h:1297
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition SIDefines.h:1152
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition SIDefines.h:1177
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition SIDefines.h:1151
static const int BlockSize
Definition TarWriter.cpp:33
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
doFinalization - Virtual method overriden by subclasses to do any necessary clean up after all passes...
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
doInitialization - Virtual method overridden by subclasses to do any necessary initialization before ...
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
AMDGPU target specific MCExpr operations.
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * create(VariantKind Kind, ArrayRef< const MCExpr * > Args, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
void setHwStage(unsigned CC, StringRef field, unsigned Val)
void updateHwStageMaximum(unsigned CC, StringRef field, unsigned Val)
void setComputeRegisters(StringRef field, unsigned Val)
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *NumNamedBarrier, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR, const MCSymbol *MaxNamedBarrier)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
This class is intended to be used as a driving class for all asm writers.
Definition AsmPrinter.h:91
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
TargetMachine & TM
Target machine description.
Definition AsmPrinter.h:94
const MCAsmInfo * MAI
Target Asm Printer information.
Definition AsmPrinter.h:97
MachineFunction * MF
The current machine function.
Definition AsmPrinter.h:109
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
unsigned getFunctionNumber() const
Return a unique ID for the current function.
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition AsmPrinter.h:121
AsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer, char &ID=AsmPrinter::ID)
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition AsmPrinter.h:128
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition AsmPrinter.h:112
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition AsmPrinter.h:101
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition AsmPrinter.h:106
bool isVerbose() const
Return true if assembly output should contain comments.
Definition AsmPrinter.h:310
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
DISubprogram * getSubprogram() const
Get the attached subprogram.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isTgSplitEnabled() const
bool isCuModeEnabled() const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool isWave32() const
bool supportsWGP() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
bool hasPrivateSegmentBuffer() const
VisibilityTypes getVisibility() const
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:337
unsigned getAddressSpace() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
MaybeAlign getAlign() const
Returns the alignment of the given variable.
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
MCCodeEmitter * getEmitterPtr() const
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:343
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:348
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:408
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:378
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:398
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:363
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:353
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:413
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition MCExpr.h:428
static LLVM_ABI const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition MCExpr.cpp:212
Context object for machine code objects.
Definition MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition MCContext.h:413
LLVM_ABI void reportError(SMLoc L, const Twine &Msg)
LLVM_ABI MCSymbol * getOrCreateSymbol(const Twine &Name)
Lookup the symbol inside with the specified Name.
Base class for the full range of assembler expressions which are needed for parsing.
Definition MCExpr.h:34
LLVM_ABI bool evaluateAsRelocatable(MCValue &Res, const MCAssembler *Asm) const
Try to evaluate the expression to a relocatable value, i.e.
Definition MCExpr.cpp:450
MCSection * getReadOnlySection() const
MCSection * getTextSection() const
MCContext & getContext() const
This represents a section on linux, lots of unix variants and some bare metal systems.
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:569
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition MCSection.h:657
bool hasInstructions() const
Definition MCSection.h:665
MCContext & getContext() const
Definition MCStreamer.h:323
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:214
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition MCSymbol.h:42
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition MCSymbol.h:233
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition MCSymbol.h:267
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition MCSymbol.h:212
const MCExpr * getVariableValue() const
Get the expression of the variable symbol.
Definition MCSymbol.h:270
MCStreamer & getStreamer()
Definition MCStreamer.h:103
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition MCExpr.h:273
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getScratchReservedForDynamicVGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:436
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
LLVM_READNONE constexpr bool isModuleEntryFunctionCC(CallingConv::ID CC)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isGFX11Plus(const MCSubtargetInfo &STI)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ SHT_PROGBITS
Definition ELF.h:1148
@ STT_AMDGPU_HSA_KERNEL
Definition ELF.h:1431
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
@ Success
The lock was released successfully.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
Target & getTheGCNTarget()
The target for GCN GPUs.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Track resource usage for kernels / entry functions.
const MCExpr * NumSGPR
const MCExpr * NumArchVGPR
uint64_t getFunctionCodeSize(const MachineFunction &MF, bool IsLowerBound=false)
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
const MCExpr * ScratchBlocks
const MCExpr * ComputePGMRSrc3
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
const MCExpr * FlatUsed
const MCExpr * NamedBarCnt
const MCExpr * ScratchEnable
const MCExpr * AccumOffset
const MCExpr * NumAccVGPR
const MCExpr * DynamicCallStack
const MCExpr * SGPRBlocks
const MCExpr * NumVGPRsForWavesPerEU
const MCExpr * NumVGPR
const MCExpr * Occupancy
const MCExpr * ScratchSize
const MCExpr * NumSGPRsForWavesPerEU
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.