LLVM 20.0.0git
AMDGPUAsmPrinter.cpp
Go to the documentation of this file.
1//===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10///
11/// The AMDGPUAsmPrinter is used to emit both textual assembly and binary
12/// code. When passed an MCAsmStreamer it prints assembly, and when passed
13/// an MCObjectStreamer it outputs binary code.
14//
15//===----------------------------------------------------------------------===//
16//
17
18#include "AMDGPUAsmPrinter.h"
19#include "AMDGPU.h"
23#include "GCNSubtarget.h"
28#include "R600AsmPrinter.h"
40#include "llvm/MC/MCAssembler.h"
41#include "llvm/MC/MCContext.h"
43#include "llvm/MC/MCStreamer.h"
49
50using namespace llvm;
51using namespace llvm::AMDGPU;
52
53// This should get the default rounding mode from the kernel. We just set the
54// default here, but this could change if the OpenCL rounding mode pragmas are
55// used.
56//
57// The denormal mode here should match what is reported by the OpenCL runtime
58// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
59// can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
60//
61// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
62// precision, and leaves single precision to flush all and does not report
63// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
64// CL_FP_DENORM for both.
65//
66// FIXME: It seems some instructions do not support single precision denormals
67// regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
68// and sin_f32, cos_f32 on most parts).
69
70// We want to use these instructions, and using fp32 denormals also causes
71// instructions to run at the double precision rate for the device so it's
72// probably best to just report no single precision denormals.
76 FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
77 FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
78}
79
80static AsmPrinter *
82 std::unique_ptr<MCStreamer> &&Streamer) {
83 return new AMDGPUAsmPrinter(tm, std::move(Streamer));
84}
85
91}
92
94 std::unique_ptr<MCStreamer> Streamer)
95 : AsmPrinter(TM, std::move(Streamer)) {
96 assert(OutStreamer && "AsmPrinter constructed without streamer");
97}
98
100 return "AMDGPU Assembly Printer";
101}
102
104 return TM.getMCSubtargetInfo();
105}
106
108 if (!OutStreamer)
109 return nullptr;
110 return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
111}
112
115}
116
/// Prepares the target streamer for module \p M.
///
/// Visible behavior: when a target streamer exists but has no target ID yet,
/// initializeTargetID(M) is called; further down, the HSA metadata stream is
/// started with the streamer's target ID.
// NOTE(review): the embedded listing numbers skip 118, 125-126, 129, 131-132
// and 137-138, so several statements -- including the condition that guards
// the early return and the opening of the block closed at 135 -- are not
// visible here. Confirm against the full file before relying on this summary.
117void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
119
120 // TODO: Which one is called first, emitStartOfAsmFile or
121 // emitFunctionBodyStart?
// Only initialize the target ID once; a null target streamer is tolerated.
122 if (getTargetStreamer() && !getTargetStreamer()->getTargetID())
123 initializeTargetID(M);
124
127 return;
128
130
133 CodeObjectVersion);
134 HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
135 }
136
139}
140
142 // Init target streamer if it has not yet happened
144 initTargetStreamer(M);
145
148
149 // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
150 // Emit HSA Metadata (NT_AMD_HSA_METADATA).
152 HSAMetadataStream->end();
153 bool Success = HSAMetadataStream->emitTo(*getTargetStreamer());
154 (void)Success;
155 assert(Success && "Malformed HSA Metadata");
156 }
157}
158
161 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
162 const Function &F = MF->getFunction();
163
164 // TODO: We're checking this late, would be nice to check it earlier.
165 if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
167 STM.getCPU() + " is only available on code object version 6 or better",
168 /*gen_crash_diag*/ false);
169 }
170
171 // TODO: Which one is called first, emitStartOfAsmFile or
172 // emitFunctionBodyStart?
173 if (!getTargetStreamer()->getTargetID())
174 initializeTargetID(*F.getParent());
175
176 const auto &FunctionTargetID = STM.getTargetID();
177 // Make sure function's xnack settings are compatible with module's
178 // xnack settings.
179 if (FunctionTargetID.isXnackSupported() &&
180 FunctionTargetID.getXnackSetting() != IsaInfo::TargetIDSetting::Any &&
181 FunctionTargetID.getXnackSetting() != getTargetStreamer()->getTargetID()->getXnackSetting()) {
182 OutContext.reportError({}, "xnack setting of '" + Twine(MF->getName()) +
183 "' function does not match module xnack setting");
184 return;
185 }
186 // Make sure function's sramecc settings are compatible with module's
187 // sramecc settings.
188 if (FunctionTargetID.isSramEccSupported() &&
189 FunctionTargetID.getSramEccSetting() != IsaInfo::TargetIDSetting::Any &&
190 FunctionTargetID.getSramEccSetting() != getTargetStreamer()->getTargetID()->getSramEccSetting()) {
191 OutContext.reportError({}, "sramecc setting of '" + Twine(MF->getName()) +
192 "' function does not match module sramecc setting");
193 return;
194 }
195
196 if (!MFI.isEntryFunction())
197 return;
198
199 if (STM.isMesaKernel(F) &&
200 (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
201 F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
202 AMDGPUMCKernelCodeT KernelCode;
203 getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
204 KernelCode.validate(&STM, MF->getContext());
206 }
207
208 if (STM.isAmdHsaOS())
209 HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
210
211 if (MFI.getNumKernargPreloadedSGPRs() > 0) {
214 STM.isAmdHsaOS());
215 }
216}
217
220 if (!MFI.isEntryFunction())
221 return;
222
224 return;
225
226 auto &Streamer = getTargetStreamer()->getStreamer();
227 auto &Context = Streamer.getContext();
228 auto &ObjectFileInfo = *Context.getObjectFileInfo();
229 auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
230
231 Streamer.pushSection();
232 Streamer.switchSection(&ReadOnlySection);
233
234 // CP microcode requires the kernel descriptor to be allocated on 64 byte
235 // alignment.
236 Streamer.emitValueToAlignment(Align(64), 0, 1, 0);
237 ReadOnlySection.ensureMinAlignment(Align(64));
238
239 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
240
241 SmallString<128> KernelName;
242 getNameWithPrefix(KernelName, &MF->getFunction());
244 STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
245 CurrentProgramInfo.NumVGPRsForWavesPerEU,
247 CurrentProgramInfo.NumSGPRsForWavesPerEU,
249 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
250 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Context),
251 Context),
252 CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
253
254 Streamer.popSection();
255}
256
258 Register RegNo = MI->getOperand(0).getReg();
259
262 OS << "implicit-def: "
263 << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
264
265 if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
266 OS << " : SGPR spill to VGPR lane";
267
268 OutStreamer->AddComment(OS.str());
269 OutStreamer->addBlankLine();
270}
271
275 return;
276 }
277
279 const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
280 if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
281 SmallString<128> SymbolName;
282 getNameWithPrefix(SymbolName, &MF->getFunction()),
284 SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
285 }
286 if (DumpCodeInstEmitter) {
287 // Disassemble function name label to text.
288 DisasmLines.push_back(MF->getName().str() + ":");
289 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
290 HexLines.emplace_back("");
291 }
292
294}
295
297 if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
298 // Write a line for the basic block label if it is not only fallthrough.
299 DisasmLines.push_back(
300 (Twine("BB") + Twine(getFunctionNumber())
301 + "_" + Twine(MBB.getNumber()) + ":").str());
302 DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
303 HexLines.emplace_back("");
304 }
306}
307
310 if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
312 Twine(GV->getName()) +
313 ": unsupported initializer for address space");
314 return;
315 }
316
317 // LDS variables aren't emitted in HSA or PAL yet.
319 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
320 return;
321
322 MCSymbol *GVSym = getSymbol(GV);
323
324 GVSym->redefineIfPossible();
325 if (GVSym->isDefined() || GVSym->isVariable())
326 report_fatal_error("symbol '" + Twine(GVSym->getName()) +
327 "' is already defined");
328
329 const DataLayout &DL = GV->getDataLayout();
330 uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
331 Align Alignment = GV->getAlign().value_or(Align(4));
332
333 emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
334 emitLinkage(GV, GVSym);
335 auto *TS = getTargetStreamer();
336 TS->emitAMDGPULDS(GVSym, Size, Alignment);
337 return;
338 }
339
341}
342
344 CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
345
347 switch (CodeObjectVersion) {
349 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV4>();
350 break;
352 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV5>();
353 break;
355 HSAMetadataStream = std::make_unique<HSAMD::MetadataStreamerMsgPackV6>();
356 break;
357 default:
358 report_fatal_error("Unexpected code object version");
359 }
360 }
361
363}
364
/// Checks the resolved resource-info symbols of \p F against subtarget limits
/// and reports violations through F.getContext().diagnose().
///
/// The checks read the per-function symbols for private segment size, SGPR
/// count, VCC / flat-scratch use and VGPR / AGPR counts. A check is only
/// performed when the corresponding symbol is a variable whose value
/// evaluates to an absolute constant; otherwise it is silently skipped.
///
/// \param F function to validate; declarations and functions without a
///          module-entry calling convention are ignored.
// NOTE(review): the embedded listing numbers skip several lines in this
// function (369-370, 383, 399, 408, 431, 437, 449, 451, 455, 474), so the
// definitions of STM, RI, MF, MFI and parts of some conditions are not
// visible in this listing.
365void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
366 if (F.isDeclaration() || !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
367 return;
368
371 MCSymbol *FnSym = TM.getSymbol(&F);
372
// Writes the value to Res and returns true only if Value evaluates to an
// absolute constant; Res is left untouched otherwise.
373 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
374 int64_t Val;
375 if (Value->evaluateAsAbsolute(Val)) {
376 Res = Val;
377 return true;
378 }
379 return false;
380 };
381
382 const uint64_t MaxScratchPerWorkitem =
384 MCSymbol *ScratchSizeSymbol =
385 RI.getSymbol(FnSym->getName(), RIK::RIK_PrivateSegSize, OutContext);
386 uint64_t ScratchSize;
// Diagnose an oversized private segment. Unlike the register checks below,
// this diagnostic does not end validation of the function.
387 if (ScratchSizeSymbol->isVariable() &&
388 TryGetMCExprValue(ScratchSizeSymbol->getVariableValue(), ScratchSize) &&
389 ScratchSize > MaxScratchPerWorkitem) {
390 DiagnosticInfoStackSize DiagStackSize(F, ScratchSize, MaxScratchPerWorkitem,
391 DS_Error);
392 F.getContext().diagnose(DiagStackSize);
393 }
394
395 // Validate addressable scalar registers (i.e., prior to added implicit
396 // SGPRs).
397 MCSymbol *NumSGPRSymbol =
398 RI.getSymbol(FnSym->getName(), RIK::RIK_NumSGPR, OutContext);
400 !STM.hasSGPRInitBug()) {
401 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
402 uint64_t NumSgpr;
403 if (NumSGPRSymbol->isVariable() &&
404 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
405 NumSgpr > MaxAddressableNumSGPRs) {
406 DiagnosticInfoResourceLimit Diag(F, "addressable scalar registers",
407 NumSgpr, MaxAddressableNumSGPRs,
409 F.getContext().diagnose(Diag);
// Stop after the first register-limit diagnostic for this function.
410 return;
411 }
412 }
413
414 MCSymbol *VCCUsedSymbol =
415 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesVCC, OutContext);
416 MCSymbol *FlatUsedSymbol =
417 RI.getSymbol(FnSym->getName(), RIK::RIK_UsesFlatScratch, OutContext);
418 uint64_t VCCUsed, FlatUsed, NumSgpr;
419
// The remaining checks need SGPR count, VCC use and flat-scratch use to all
// be resolvable; if any is not, nothing further is validated.
420 if (NumSGPRSymbol->isVariable() && VCCUsedSymbol->isVariable() &&
421 FlatUsedSymbol->isVariable() &&
422 TryGetMCExprValue(NumSGPRSymbol->getVariableValue(), NumSgpr) &&
423 TryGetMCExprValue(VCCUsedSymbol->getVariableValue(), VCCUsed) &&
424 TryGetMCExprValue(FlatUsedSymbol->getVariableValue(), FlatUsed)) {
425
426 // Recomputes NumSgprs + implicit SGPRs but all symbols should now be
427 // resolvable.
428 NumSgpr += IsaInfo::getNumExtraSGPRs(
429 &STM, VCCUsed, FlatUsed,
430 getTargetStreamer()->getTargetID()->isXnackOnOrAny());
432 STM.hasSGPRInitBug()) {
433 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
434 if (NumSgpr > MaxAddressableNumSGPRs) {
435 DiagnosticInfoResourceLimit Diag(F, "scalar registers", NumSgpr,
436 MaxAddressableNumSGPRs, DS_Error,
438 F.getContext().diagnose(Diag);
439 return;
440 }
441 }
442
443 MCSymbol *NumVgprSymbol =
444 RI.getSymbol(FnSym->getName(), RIK::RIK_NumVGPR, OutContext);
445 MCSymbol *NumAgprSymbol =
446 RI.getSymbol(FnSym->getName(), RIK::RIK_NumAGPR, OutContext);
447 uint64_t NumVgpr, NumAgpr;
448
450 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
// Occupancy is only validated when MF is non-null and both the VGPR and AGPR
// counts are resolvable.
452 if (MF && NumVgprSymbol->isVariable() && NumAgprSymbol->isVariable() &&
453 TryGetMCExprValue(NumVgprSymbol->getVariableValue(), NumVgpr) &&
454 TryGetMCExprValue(NumAgprSymbol->getVariableValue(), NumAgpr)) {
456 unsigned MaxWaves = MFI.getMaxWavesPerEU();
457 uint64_t TotalNumVgpr =
458 getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
// Register counts used for the occupancy estimate are clamped from below by
// 1 and by the subtarget minimum for MaxWaves.
459 uint64_t NumVGPRsForWavesPerEU = std::max(
460 {TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
461 uint64_t NumSGPRsForWavesPerEU = std::max(
462 {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
463 const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
464 STM.computeOccupancy(F, MFI.getLDSSize()),
465 MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
466 MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
467 OutContext);
468 uint64_t Occupancy;
469
// Only the minimum of the "amdgpu-waves-per-eu" pair is compared below;
// {0, 0} is the default when the attribute is absent.
470 const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
471 F, "amdgpu-waves-per-eu", {0, 0}, true);
472
473 if (TryGetMCExprValue(OccupancyExpr, Occupancy) && Occupancy < MinWEU) {
475 F, F.getSubprogram(),
476 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
477 "'" +
478 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
479 ", final occupancy is " + Twine(Occupancy));
480 F.getContext().diagnose(Diag);
481 return;
482 }
483 }
484 }
485}
486
488 // Pad with s_code_end to help tools and guard against instruction prefetch
489 // causing stale data in caches. Arguably this should be done by the linker,
490 // which is why this isn't done for Mesa.
491 const MCSubtargetInfo &STI = *getGlobalSTI();
492 if ((AMDGPU::isGFX10Plus(STI) || AMDGPU::isGFX90A(STI)) &&
495 OutStreamer->switchSection(getObjFileLowering().getTextSection());
497 }
498
499 // Assign expressions which can only be resolved when all other functions are
500 // known.
502
503 // Switch section and emit all GPR maximums within the processed module.
504 OutStreamer->pushSection();
505 MCSectionELF *MaxGPRSection =
506 OutContext.getELFSection(".AMDGPU.gpr_maximums", ELF::SHT_PROGBITS, 0);
507 OutStreamer->switchSection(MaxGPRSection);
511 OutStreamer->popSection();
512
513 for (Function &F : M.functions())
514 validateMCResourceInfo(F);
515
516 RI.reset();
517
519}
520
521SmallString<128> AMDGPUAsmPrinter::getMCExprStr(const MCExpr *Value) {
523 raw_svector_ostream OSS(Str);
524 auto &Streamer = getTargetStreamer()->getStreamer();
525 auto &Context = Streamer.getContext();
526 const MCExpr *New = foldAMDGPUMCExpr(Value, Context);
527 printAMDGPUMCExpr(New, OSS, MAI);
528 return Str;
529}
530
531// Print comments that apply to both callable functions and entry points.
532void AMDGPUAsmPrinter::emitCommonFunctionComments(
533 const MCExpr *NumVGPR, const MCExpr *NumAGPR, const MCExpr *TotalNumVGPR,
534 const MCExpr *NumSGPR, const MCExpr *ScratchSize, uint64_t CodeSize,
535 const AMDGPUMachineFunction *MFI) {
536 OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
537 OutStreamer->emitRawComment(" TotalNumSgprs: " + getMCExprStr(NumSGPR),
538 false);
539 OutStreamer->emitRawComment(" NumVgprs: " + getMCExprStr(NumVGPR), false);
540 if (NumAGPR && TotalNumVGPR) {
541 OutStreamer->emitRawComment(" NumAgprs: " + getMCExprStr(NumAGPR), false);
542 OutStreamer->emitRawComment(" TotalNumVgprs: " + getMCExprStr(TotalNumVGPR),
543 false);
544 }
545 OutStreamer->emitRawComment(" ScratchSize: " + getMCExprStr(ScratchSize),
546 false);
547 OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
548 false);
549}
550
/// Builds the kernel_code_properties expression for \p MF.
///
/// A 16-bit constant is assembled from the ENABLE_SGPR_* bits, one per user
/// SGPR reported by the function's GCNUserSGPRUsageInfo, plus the
/// ENABLE_WAVEFRONT_SIZE32 bit. That constant is then OR'ed with
/// CurrentProgramInfo.DynamicCallStack shifted left by
/// KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT.
///
/// \returns an MCExpr; it is a binary OR expression rather than a plain
///          constant because DynamicCallStack may not be evaluatable yet.
// NOTE(review): the embedded listing numbers skip 553 and 586, so the
// declaration of MFI and the condition guarding the WAVEFRONT_SIZE32 bit are
// not visible in this listing.
551const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
552 const MachineFunction &MF) const {
554 MCContext &Ctx = MF.getContext();
555 uint16_t KernelCodeProperties = 0;
556 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
557
// One property bit per user SGPR that the function requests.
558 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
559 KernelCodeProperties |=
560 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
561 }
562 if (UserSGPRInfo.hasDispatchPtr()) {
563 KernelCodeProperties |=
564 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
565 }
566 if (UserSGPRInfo.hasQueuePtr()) {
567 KernelCodeProperties |=
568 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
569 }
570 if (UserSGPRInfo.hasKernargSegmentPtr()) {
571 KernelCodeProperties |=
572 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
573 }
574 if (UserSGPRInfo.hasDispatchID()) {
575 KernelCodeProperties |=
576 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
577 }
578 if (UserSGPRInfo.hasFlatScratchInit()) {
579 KernelCodeProperties |=
580 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
581 }
582 if (UserSGPRInfo.hasPrivateSegmentSize()) {
583 KernelCodeProperties |=
584 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE;
585 }
587 KernelCodeProperties |=
588 amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
589 }
590
591 // CurrentProgramInfo.DynamicCallStack is a MCExpr and could be
592 // un-evaluatable at this point so it cannot be conditionally checked here.
593 // Instead, we'll directly shift the possibly unknown MCExpr into its place
594 // and bitwise-or it into KernelCodeProperties.
595 const MCExpr *KernelCodePropExpr =
596 MCConstantExpr::create(KernelCodeProperties, Ctx);
// OrValue starts as the shift amount and is then reused for the shifted
// DynamicCallStack expression.
597 const MCExpr *OrValue = MCConstantExpr::create(
598 amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK_SHIFT, Ctx);
599 OrValue = MCBinaryExpr::createShl(CurrentProgramInfo.DynamicCallStack,
600 OrValue, Ctx);
601 KernelCodePropExpr = MCBinaryExpr::createOr(KernelCodePropExpr, OrValue, Ctx);
602
603 return KernelCodePropExpr;
604}
605
// Fills in and returns an MCKernelDescriptor for MF.
//
// Fields set here: group_segment_fixed_size, private_segment_fixed_size
// (PI.ScratchSize), kernarg_size, compute_pgm_rsrc1/2/3,
// kernel_code_properties and kernarg_preload.
//
// NOTE(review): the embedded listing numbers skip 606, 609, 611 and 617, so
// the return-type line, the declarations of STM and Info, and the value
// assigned to group_segment_fixed_size are not visible in this listing.
// NOTE(review): compute_pgm_rsrc3 is read from CurrentProgramInfo rather than
// from the PI parameter. The visible caller passes CurrentProgramInfo as PI,
// so the two agree there -- confirm no other caller passes a different object.
607AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
608 const SIProgramInfo &PI) const {
610 const Function &F = MF.getFunction();
612 MCContext &Ctx = MF.getContext();
613
614 MCKernelDescriptor KernelDescriptor;
615
616 KernelDescriptor.group_segment_fixed_size =
618 KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
619
// MaxKernArgAlign is only an output argument of getKernArgSegmentSize; its
// value is not used afterwards.
620 Align MaxKernArgAlign;
621 KernelDescriptor.kernarg_size = MCConstantExpr::create(
622 STM.getKernArgSegmentSize(F, MaxKernArgAlign), Ctx);
623
624 KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM, Ctx);
625 KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2(Ctx);
626 KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
627
// Sanity check: when rsrc3 can be evaluated and the subtarget lacks GFX90A
// instructions, it must be zero. Both locals are consumed only by the assert;
// the (void) casts reference them so they are not otherwise unused.
628 int64_t PGRM_Rsrc3 = 1;
629 bool EvaluatableRsrc3 =
630 CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(PGRM_Rsrc3);
631 (void)PGRM_Rsrc3;
632 (void)EvaluatableRsrc3;
633 assert(STM.hasGFX90AInsts() || !EvaluatableRsrc3 ||
634 static_cast<uint64_t>(PGRM_Rsrc3) == 0);
635 KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3GFX90A;
636
// The preload count is reported as 0 when AMDGPU::hasKernargPreload(STM) is
// false.
637 KernelDescriptor.kernarg_preload = MCConstantExpr::create(
638 AMDGPU::hasKernargPreload(STM) ? Info->getNumKernargPreloadedSGPRs() : 0,
639 Ctx);
640
641 return KernelDescriptor;
642}
643
645 // Init target streamer lazily on the first function so that previous passes
646 // can set metadata.
648 initTargetStreamer(*MF.getFunction().getParent());
649
650 ResourceUsage = &getAnalysis<AMDGPUResourceUsageAnalysis>();
651 CurrentProgramInfo.reset(MF);
652
654 MCContext &Ctx = MF.getContext();
655
656 // The starting address of all shader programs must be 256 bytes aligned.
657 // Regular functions just need the basic required instruction alignment.
658 MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
659
661
664 // FIXME: This should be an explicit check for Mesa.
665 if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
666 MCSectionELF *ConfigSection =
667 Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
668 OutStreamer->switchSection(ConfigSection);
669 }
670
672 ResourceUsage->getResourceInfo();
674
675 if (MFI->isModuleEntryFunction()) {
676 getSIProgramInfo(CurrentProgramInfo, MF);
677 }
678
679 if (STM.isAmdPalOS()) {
680 if (MFI->isEntryFunction())
681 EmitPALMetadata(MF, CurrentProgramInfo);
682 else if (MFI->isModuleEntryFunction())
683 emitPALFunctionMetadata(MF);
684 } else if (!STM.isAmdHsaOS()) {
685 EmitProgramInfoSI(MF, CurrentProgramInfo);
686 }
687
688 DumpCodeInstEmitter = nullptr;
689 if (STM.dumpCode()) {
690 // For -dumpcode, get the assembler out of the streamer. This only works
691 // with -filetype=obj.
692 MCAssembler *Assembler = OutStreamer->getAssemblerPtr();
693 if (Assembler)
694 DumpCodeInstEmitter = Assembler->getEmitterPtr();
695 }
696
697 DisasmLines.clear();
698 HexLines.clear();
700
702
703 emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
704 STM.hasMAIInsts());
705
706 {
709 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext),
710 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumAGPR, OutContext),
711 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumSGPR, OutContext),
712 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
713 OutContext),
714 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesVCC, OutContext),
715 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_UsesFlatScratch,
716 OutContext),
717 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasDynSizedStack,
718 OutContext),
719 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasRecursion,
720 OutContext),
721 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_HasIndirectCall,
722 OutContext));
723 }
724
725 if (isVerbose()) {
726 MCSectionELF *CommentSection =
727 Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
728 OutStreamer->switchSection(CommentSection);
729
730 if (!MFI->isEntryFunction()) {
732 OutStreamer->emitRawComment(" Function info:", false);
733
734 emitCommonFunctionComments(
735 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_NumVGPR, OutContext)
738 RIK::RIK_NumAGPR, OutContext)
740 : nullptr,
741 RI.createTotalNumVGPRs(MF, Ctx),
743 MF,
745 Ctx),
746 RI.getSymbol(CurrentFnSym->getName(), RIK::RIK_PrivateSegSize,
749 getFunctionCodeSize(MF), MFI);
750 return false;
751 }
752
753 OutStreamer->emitRawComment(" Kernel info:", false);
754 emitCommonFunctionComments(
755 CurrentProgramInfo.NumArchVGPR,
756 STM.hasMAIInsts() ? CurrentProgramInfo.NumAccVGPR : nullptr,
757 CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR,
758 CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF), MFI);
759
760 OutStreamer->emitRawComment(
761 " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
762 OutStreamer->emitRawComment(
763 " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
764 OutStreamer->emitRawComment(
765 " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
766 " bytes/workgroup (compile time only)", false);
767
768 OutStreamer->emitRawComment(
769 " SGPRBlocks: " + getMCExprStr(CurrentProgramInfo.SGPRBlocks), false);
770
771 OutStreamer->emitRawComment(
772 " VGPRBlocks: " + getMCExprStr(CurrentProgramInfo.VGPRBlocks), false);
773
774 OutStreamer->emitRawComment(
775 " NumSGPRsForWavesPerEU: " +
776 getMCExprStr(CurrentProgramInfo.NumSGPRsForWavesPerEU),
777 false);
778 OutStreamer->emitRawComment(
779 " NumVGPRsForWavesPerEU: " +
780 getMCExprStr(CurrentProgramInfo.NumVGPRsForWavesPerEU),
781 false);
782
783 if (STM.hasGFX90AInsts()) {
784 const MCExpr *AdjustedAccum = MCBinaryExpr::createAdd(
785 CurrentProgramInfo.AccumOffset, MCConstantExpr::create(1, Ctx), Ctx);
786 AdjustedAccum = MCBinaryExpr::createMul(
787 AdjustedAccum, MCConstantExpr::create(4, Ctx), Ctx);
788 OutStreamer->emitRawComment(
789 " AccumOffset: " + getMCExprStr(AdjustedAccum), false);
790 }
791
792 OutStreamer->emitRawComment(
793 " Occupancy: " + getMCExprStr(CurrentProgramInfo.Occupancy), false);
794
795 OutStreamer->emitRawComment(
796 " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
797
798 OutStreamer->emitRawComment(
799 " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
800 getMCExprStr(CurrentProgramInfo.ScratchEnable),
801 false);
802 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
803 Twine(CurrentProgramInfo.UserSGPR),
804 false);
805 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
806 Twine(CurrentProgramInfo.TrapHandlerEnable),
807 false);
808 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
809 Twine(CurrentProgramInfo.TGIdXEnable),
810 false);
811 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
812 Twine(CurrentProgramInfo.TGIdYEnable),
813 false);
814 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
815 Twine(CurrentProgramInfo.TGIdZEnable),
816 false);
817 OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
818 Twine(CurrentProgramInfo.TIdIGCompCount),
819 false);
820
821 [[maybe_unused]] int64_t PGMRSrc3;
822 assert(STM.hasGFX90AInsts() ||
823 (CurrentProgramInfo.ComputePGMRSrc3GFX90A->evaluateAsAbsolute(
824 PGMRSrc3) &&
825 static_cast<uint64_t>(PGMRSrc3) == 0));
826 if (STM.hasGFX90AInsts()) {
827 OutStreamer->emitRawComment(
828 " COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: " +
829 getMCExprStr(MCKernelDescriptor::bits_get(
830 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
831 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT,
832 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET, Ctx)),
833 false);
834 OutStreamer->emitRawComment(
835 " COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: " +
836 getMCExprStr(MCKernelDescriptor::bits_get(
837 CurrentProgramInfo.ComputePGMRSrc3GFX90A,
838 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT,
839 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT, Ctx)),
840 false);
841 }
842 }
843
844 if (DumpCodeInstEmitter) {
845
846 OutStreamer->switchSection(
847 Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
848
849 for (size_t i = 0; i < DisasmLines.size(); ++i) {
850 std::string Comment = "\n";
851 if (!HexLines[i].empty()) {
852 Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
853 Comment += " ; " + HexLines[i] + "\n";
854 }
855
856 OutStreamer->emitBytes(StringRef(DisasmLines[i]));
857 OutStreamer->emitBytes(StringRef(Comment));
858 }
859 }
860
861 return false;
862}
863
864// TODO: Fold this into emitFunctionBodyStart.
/// Establishes the target streamer's target ID for module \p M.
///
/// For a non-empty module, the functions are walked in order and, for each
/// supported feature (xnack, sramecc) whose setting on the streamer's target
/// ID is still 'Any', the setting from that function's subtarget target ID is
/// copied over. The walk stops as soon as every supported feature is On or
/// Off.
// NOTE(review): the embedded listing numbers skip 868 and 883, so the call
// that performs the initial setup from the global feature string and the
// declaration of STM are not visible in this listing.
865void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
866 // In the beginning all features are either 'Any' or 'NotSupported',
867 // depending on global target features. This will cover empty modules.
869 getGlobalSTI()->getFeatureString());
870
871 // If module is empty, we are done.
872 if (M.empty())
873 return;
874
875 // If module is not empty, need to find first 'Off' or 'On' feature
876 // setting per feature from functions in module.
877 for (auto &F : M) {
878 auto &TSTargetID = getTargetStreamer()->getTargetID();
// Nothing left to decide: each feature is either unsupported or already
// resolved to On/Off.
879 if ((!TSTargetID->isXnackSupported() || TSTargetID->isXnackOnOrOff()) &&
880 (!TSTargetID->isSramEccSupported() || TSTargetID->isSramEccOnOrOff()))
881 break;
882
884 const IsaInfo::AMDGPUTargetID &STMTargetID = STM.getTargetID();
// Only a setting that is still 'Any' is overwritten, so an earlier function's
// setting takes precedence over later ones.
885 if (TSTargetID->isXnackSupported())
886 if (TSTargetID->getXnackSetting() == IsaInfo::TargetIDSetting::Any)
887 TSTargetID->setXnackSetting(STMTargetID.getXnackSetting());
888 if (TSTargetID->isSramEccSupported())
889 if (TSTargetID->getSramEccSetting() == IsaInfo::TargetIDSetting::Any)
890 TSTargetID->setSramEccSetting(STMTargetID.getSramEccSetting());
891 }
892}
893
894uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
896 const SIInstrInfo *TII = STM.getInstrInfo();
897
898 uint64_t CodeSize = 0;
899
900 for (const MachineBasicBlock &MBB : MF) {
901 for (const MachineInstr &MI : MBB) {
902 // TODO: CodeSize should account for multiple functions.
903
904 // TODO: Should we count size of debug info?
905 if (MI.isDebugInstr())
906 continue;
907
908 CodeSize += TII->getInstSizeInBytes(MI);
909 }
910 }
911
912 return CodeSize;
913}
914
915// AccumOffset computed for the MCExpr equivalent of:
916// alignTo(std::max(1, NumVGPR), 4) / 4 - 1;
917static const MCExpr *computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx) {
918 const MCExpr *ConstFour = MCConstantExpr::create(4, Ctx);
919 const MCExpr *ConstOne = MCConstantExpr::create(1, Ctx);
920
921 // Can't be lower than 1 for subsequent alignTo.
922 const MCExpr *MaximumTaken =
923 AMDGPUMCExpr::createMax({ConstOne, NumVGPR}, Ctx);
924
925 // Practically, it's computing divideCeil(MaximumTaken, 4).
926 const MCExpr *DivCeil = MCBinaryExpr::createDiv(
927 AMDGPUMCExpr::createAlignTo(MaximumTaken, ConstFour, Ctx), ConstFour,
928 Ctx);
929
930 return MCBinaryExpr::createSub(DivCeil, ConstOne, Ctx);
931}
932
933void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
934 const MachineFunction &MF) {
936 MCContext &Ctx = MF.getContext();
937
938 auto CreateExpr = [&Ctx](int64_t Value) {
939 return MCConstantExpr::create(Value, Ctx);
940 };
941
942 auto TryGetMCExprValue = [](const MCExpr *Value, uint64_t &Res) -> bool {
943 int64_t Val;
944 if (Value->evaluateAsAbsolute(Val)) {
945 Res = Val;
946 return true;
947 }
948 return false;
949 };
950
951 auto GetSymRefExpr =
952 [&](MCResourceInfo::ResourceInfoKind RIK) -> const MCExpr * {
954 return MCSymbolRefExpr::create(Sym, Ctx);
955 };
956
958 ProgInfo.NumArchVGPR = GetSymRefExpr(RIK::RIK_NumVGPR);
959 ProgInfo.NumAccVGPR = GetSymRefExpr(RIK::RIK_NumAGPR);
961 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
962
963 ProgInfo.AccumOffset = computeAccumOffset(ProgInfo.NumArchVGPR, Ctx);
964 ProgInfo.TgSplit = STM.isTgSplitEnabled();
965 ProgInfo.NumSGPR = GetSymRefExpr(RIK::RIK_NumSGPR);
966 ProgInfo.ScratchSize = GetSymRefExpr(RIK::RIK_PrivateSegSize);
967 ProgInfo.VCCUsed = GetSymRefExpr(RIK::RIK_UsesVCC);
968 ProgInfo.FlatUsed = GetSymRefExpr(RIK::RIK_UsesFlatScratch);
969 ProgInfo.DynamicCallStack =
970 MCBinaryExpr::createOr(GetSymRefExpr(RIK::RIK_HasDynSizedStack),
971 GetSymRefExpr(RIK::RIK_HasRecursion), Ctx);
972
974
975 // The calculations related to SGPR/VGPR blocks are
976 // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
977 // unified.
978 const MCExpr *ExtraSGPRs = AMDGPUMCExpr::createExtraSGPRs(
979 ProgInfo.VCCUsed, ProgInfo.FlatUsed,
980 getTargetStreamer()->getTargetID()->isXnackOnOrAny(), Ctx);
981
982 // Check the addressable register limit before we add ExtraSGPRs.
984 !STM.hasSGPRInitBug()) {
985 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
986 uint64_t NumSgpr;
987 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
988 NumSgpr > MaxAddressableNumSGPRs) {
989 // This can happen due to a compiler bug or when using inline asm.
992 MF.getFunction(), "addressable scalar registers", NumSgpr,
993 MaxAddressableNumSGPRs, DS_Error, DK_ResourceLimit);
994 Ctx.diagnose(Diag);
995 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs - 1);
996 }
997 }
998
999 // Account for extra SGPRs and VGPRs reserved for debugger use.
1000 ProgInfo.NumSGPR = MCBinaryExpr::createAdd(ProgInfo.NumSGPR, ExtraSGPRs, Ctx);
1001
1002 const Function &F = MF.getFunction();
1003
1004 // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
1005 // dispatch registers are function args.
1006 unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
1007
1008 if (isShader(F.getCallingConv())) {
1009 bool IsPixelShader =
1010 F.getCallingConv() == CallingConv::AMDGPU_PS && !STM.isAmdHsaOS();
1011
1012 // Calculate the number of VGPR registers based on the SPI input registers
1013 uint32_t InputEna = 0;
1014 uint32_t InputAddr = 0;
1015 unsigned LastEna = 0;
1016
1017 if (IsPixelShader) {
1018 // Note for IsPixelShader:
1019 // By this stage, all enabled inputs are tagged in InputAddr as well.
1020 // We will use InputAddr to determine whether the input counts against the
1021 // vgpr total and only use the InputEnable to determine the last input
1022 // that is relevant - if extra arguments are used, then we have to honour
1023 // the InputAddr for any intermediate non-enabled inputs.
1024 InputEna = MFI->getPSInputEnable();
1025 InputAddr = MFI->getPSInputAddr();
1026
1027 // We only need to consider input args up to the last used arg.
1028 assert((InputEna || InputAddr) &&
1029 "PSInputAddr and PSInputEnable should "
1030 "never both be 0 for AMDGPU_PS shaders");
1031 // There are some rare circumstances where InputAddr is non-zero and
1032 // InputEna can be set to 0. In this case we default to setting LastEna
1033 // to 1.
1034 LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
1035 }
1036
1037 // FIXME: We should be using the number of registers determined during
1038 // calling convention lowering to legalize the types.
1039 const DataLayout &DL = F.getDataLayout();
1040 unsigned PSArgCount = 0;
1041 unsigned IntermediateVGPR = 0;
1042 for (auto &Arg : F.args()) {
1043 unsigned NumRegs = (DL.getTypeSizeInBits(Arg.getType()) + 31) / 32;
1044 if (Arg.hasAttribute(Attribute::InReg)) {
1045 WaveDispatchNumSGPR += NumRegs;
1046 } else {
1047 // If this is a PS shader and we're processing the PS Input args (first
1048 // 16 VGPR), use the InputEna and InputAddr bits to define how many
1049 // VGPRs are actually used.
1050 // Any extra VGPR arguments are handled as normal arguments (and
1051 // contribute to the VGPR count whether they're used or not).
1052 if (IsPixelShader && PSArgCount < 16) {
1053 if ((1 << PSArgCount) & InputAddr) {
1054 if (PSArgCount < LastEna)
1055 WaveDispatchNumVGPR += NumRegs;
1056 else
1057 IntermediateVGPR += NumRegs;
1058 }
1059 PSArgCount++;
1060 } else {
1061 // If there are extra arguments we have to include the allocation for
1062 // the non-used (but enabled with InputAddr) input arguments
1063 if (IntermediateVGPR) {
1064 WaveDispatchNumVGPR += IntermediateVGPR;
1065 IntermediateVGPR = 0;
1066 }
1067 WaveDispatchNumVGPR += NumRegs;
1068 }
1069 }
1070 }
1072 {ProgInfo.NumSGPR, CreateExpr(WaveDispatchNumSGPR)}, Ctx);
1073
1075 {ProgInfo.NumVGPR, CreateExpr(WaveDispatchNumVGPR)}, Ctx);
1076
1078 ProgInfo.NumAccVGPR, ProgInfo.NumArchVGPR, Ctx);
1079 } else if (isKernel(F.getCallingConv()) &&
1081 // Consider cases where the total number of UserSGPRs with trailing
1082 // allocated preload SGPRs, is greater than the number of explicitly
1083 // referenced SGPRs.
1084 const MCExpr *UserPlusExtraSGPRs = MCBinaryExpr::createAdd(
1085 CreateExpr(MFI->getNumUserSGPRs()), ExtraSGPRs, Ctx);
1086 ProgInfo.NumSGPR =
1087 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, UserPlusExtraSGPRs}, Ctx);
1088 }
1089
1090 // Adjust number of registers used to meet default/requested minimum/maximum
1091 // number of waves per execution unit request.
1092 unsigned MaxWaves = MFI->getMaxWavesPerEU();
1093 ProgInfo.NumSGPRsForWavesPerEU =
1094 AMDGPUMCExpr::createMax({ProgInfo.NumSGPR, CreateExpr(1ul),
1095 CreateExpr(STM.getMinNumSGPRs(MaxWaves))},
1096 Ctx);
1097 ProgInfo.NumVGPRsForWavesPerEU =
1098 AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
1099 CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
1100 Ctx);
1101
1103 STM.hasSGPRInitBug()) {
1104 unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
1105 uint64_t NumSgpr;
1106 if (TryGetMCExprValue(ProgInfo.NumSGPR, NumSgpr) &&
1107 NumSgpr > MaxAddressableNumSGPRs) {
1108 // This can happen due to a compiler bug or when using inline asm to use
1109 // the registers which are usually reserved for vcc etc.
1111 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "scalar registers",
1112 NumSgpr, MaxAddressableNumSGPRs,
1114 Ctx.diagnose(Diag);
1115 ProgInfo.NumSGPR = CreateExpr(MaxAddressableNumSGPRs);
1116 ProgInfo.NumSGPRsForWavesPerEU = CreateExpr(MaxAddressableNumSGPRs);
1117 }
1118 }
1119
1120 if (STM.hasSGPRInitBug()) {
1121 ProgInfo.NumSGPR =
1123 ProgInfo.NumSGPRsForWavesPerEU =
1125 }
1126
1127 if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
1129 DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
1130 MFI->getNumUserSGPRs(),
1132 Ctx.diagnose(Diag);
1133 }
1134
1135 if (MFI->getLDSSize() >
1136 static_cast<unsigned>(STM.getAddressableLocalMemorySize())) {
1139 MF.getFunction(), "local memory", MFI->getLDSSize(),
1141 Ctx.diagnose(Diag);
1142 }
1143 // The MCExpr equivalent of getNumSGPRBlocks/getNumVGPRBlocks:
1144 // (alignTo(max(1u, NumGPR), GPREncodingGranule) / GPREncodingGranule) - 1
1145 auto GetNumGPRBlocks = [&CreateExpr, &Ctx](const MCExpr *NumGPR,
1146 unsigned Granule) {
1147 const MCExpr *OneConst = CreateExpr(1ul);
1148 const MCExpr *GranuleConst = CreateExpr(Granule);
1149 const MCExpr *MaxNumGPR = AMDGPUMCExpr::createMax({NumGPR, OneConst}, Ctx);
1150 const MCExpr *AlignToGPR =
1151 AMDGPUMCExpr::createAlignTo(MaxNumGPR, GranuleConst, Ctx);
1152 const MCExpr *DivGPR =
1153 MCBinaryExpr::createDiv(AlignToGPR, GranuleConst, Ctx);
1154 const MCExpr *SubGPR = MCBinaryExpr::createSub(DivGPR, OneConst, Ctx);
1155 return SubGPR;
1156 };
1157
1158 ProgInfo.SGPRBlocks = GetNumGPRBlocks(ProgInfo.NumSGPRsForWavesPerEU,
1160 ProgInfo.VGPRBlocks = GetNumGPRBlocks(ProgInfo.NumVGPRsForWavesPerEU,
1162
1163 const SIModeRegisterDefaults Mode = MFI->getMode();
1164
1165 // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
1166 // register.
1167 ProgInfo.FloatMode = getFPMode(Mode);
1168
1169 ProgInfo.IEEEMode = Mode.IEEE;
1170
1171 // Make the clamp modifier return 0 on NaN input.
1172 ProgInfo.DX10Clamp = Mode.DX10Clamp;
1173
1174 unsigned LDSAlignShift;
1175 if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
1176 // LDS is allocated in 320 dword blocks.
1177 LDSAlignShift = 11;
1178 } else if (STM.getFeatureBits().test(
1179 FeatureAddressableLocalMemorySize65536)) {
1180 // LDS is allocated in 128 dword blocks.
1181 LDSAlignShift = 9;
1182 } else {
1183 // LDS is allocated in 64 dword blocks.
1184 LDSAlignShift = 8;
1185 }
1186
1187 ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
1188 ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
1189
1190 ProgInfo.LDSSize = MFI->getLDSSize();
1191 ProgInfo.LDSBlocks =
1192 alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
1193
1194 // The MCExpr equivalent of divideCeil.
1195 auto DivideCeil = [&Ctx](const MCExpr *Numerator, const MCExpr *Denominator) {
1196 const MCExpr *Ceil =
1197 AMDGPUMCExpr::createAlignTo(Numerator, Denominator, Ctx);
1198 return MCBinaryExpr::createDiv(Ceil, Denominator, Ctx);
1199 };
1200
1201 // Scratch is allocated in 64-dword or 256-dword blocks.
1202 unsigned ScratchAlignShift =
1203 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
1204 // We need to program the hardware with the amount of scratch memory that
1205 // is used by the entire wave. ProgInfo.ScratchSize is the amount of
1206 // scratch memory used per thread.
1207 ProgInfo.ScratchBlocks = DivideCeil(
1209 CreateExpr(STM.getWavefrontSize()), Ctx),
1210 CreateExpr(1ULL << ScratchAlignShift));
1211
1212 if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
1213 ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
1214 ProgInfo.MemOrdered = 1;
1215 }
1216
1217 // 0 = X, 1 = XY, 2 = XYZ
1218 unsigned TIDIGCompCnt = 0;
1219 if (MFI->hasWorkItemIDZ())
1220 TIDIGCompCnt = 2;
1221 else if (MFI->hasWorkItemIDY())
1222 TIDIGCompCnt = 1;
1223
1224 // The private segment wave byte offset is the last of the system SGPRs. We
1225 // initially assumed it was allocated, and may have used it. It shouldn't harm
1226 // anything to disable it if we know the stack isn't used here. We may still
1227 // have emitted code reading it to initialize scratch, but if that's unused
1228 // reading garbage should be OK.
1231 MCConstantExpr::create(0, Ctx), Ctx),
1232 ProgInfo.DynamicCallStack, Ctx);
1233
1234 ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
1235 // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
1236 ProgInfo.TrapHandlerEnable =
1237 STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
1238 ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
1239 ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
1240 ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
1241 ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
1242 ProgInfo.TIdIGCompCount = TIDIGCompCnt;
1243 ProgInfo.EXCPEnMSB = 0;
1244 // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
1245 ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
1246 ProgInfo.EXCPEnable = 0;
1247
1248 if (STM.hasGFX90AInsts()) {
1249 // return ((Dst & ~Mask) | (Value << Shift))
1250 auto SetBits = [&Ctx](const MCExpr *Dst, const MCExpr *Value, uint32_t Mask,
1251 uint32_t Shift) {
1252 const auto *Shft = MCConstantExpr::create(Shift, Ctx);
1253 const auto *Msk = MCConstantExpr::create(Mask, Ctx);
1254 Dst = MCBinaryExpr::createAnd(Dst, MCUnaryExpr::createNot(Msk, Ctx), Ctx);
1256 Dst, MCBinaryExpr::createShl(Value, Shft, Ctx), Ctx);
1257 return Dst;
1258 };
1259
1260 ProgInfo.ComputePGMRSrc3GFX90A =
1261 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, ProgInfo.AccumOffset,
1262 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET,
1263 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET_SHIFT);
1264 ProgInfo.ComputePGMRSrc3GFX90A =
1265 SetBits(ProgInfo.ComputePGMRSrc3GFX90A, CreateExpr(ProgInfo.TgSplit),
1266 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT,
1267 amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
1268 }
1269
1271 STM.computeOccupancy(F, ProgInfo.LDSSize), ProgInfo.NumSGPRsForWavesPerEU,
1272 ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
1273
1274 const auto [MinWEU, MaxWEU] =
1275 AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
1276 uint64_t Occupancy;
1277 if (TryGetMCExprValue(ProgInfo.Occupancy, Occupancy) && Occupancy < MinWEU) {
1279 F, F.getSubprogram(),
1280 "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
1281 "'" +
1282 F.getName() + "': desired occupancy was " + Twine(MinWEU) +
1283 ", final occupancy is " + Twine(Occupancy));
1284 F.getContext().diagnose(Diag);
1285 }
1286}
1287
// Selects an unsigned value for the given calling convention; the caller in
// EmitProgramInfoSI stores it as RsrcReg and emits it as a 32-bit word.
// NOTE(review): the individual case labels and return statements (listing
// lines 1291-1297) are not visible here, so the concrete mapping cannot be
// documented from this view.
1288static unsigned getRsrcReg(CallingConv::ID CallConv) {
1289 switch (CallConv) {
// The default label deliberately falls through into the cases that follow.
1290 default: [[fallthrough]];
1298 }
1299}
1300
// Writes the program-info words for MF to OutStreamer as a sequence of 32-bit
// values taken from CurrentProgramInfo and the subtarget.
//
// MF                 - function whose program info is emitted.
// CurrentProgramInfo - previously computed program info (MCExpr-valued fields
//                      are emitted either resolved or as expressions).
//
// NOTE(review): several statements of this function are absent from this
// listing (including the declaration of MFI and the conditions that open the
// branches below); comments here only describe what is visible.
1301void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1302 const SIProgramInfo &CurrentProgramInfo) {
1304 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1305 unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1306 MCContext &Ctx = MF.getContext();
1307
1308 // (((Value) & Mask) << Shift)
1309 auto SetBits = [&Ctx](const MCExpr *Value, uint32_t Mask, uint32_t Shift) {
1310 const MCExpr *msk = MCConstantExpr::create(Mask, Ctx);
1311 const MCExpr *shft = MCConstantExpr::create(Shift, Ctx);
1313 shft, Ctx);
1314 };
1315
// Emits Value as a plain integer of Size bytes when it can be evaluated to an
// absolute value now; otherwise emits the expression itself.
1316 auto EmitResolvedOrExpr = [this](const MCExpr *Value, unsigned Size) {
1317 int64_t Val;
1318 if (Value->evaluateAsAbsolute(Val))
1319 OutStreamer->emitIntValue(static_cast<uint64_t>(Val), Size);
1320 else
1321 OutStreamer->emitValue(Value, Size);
1322 };
1323
// NOTE(review): the condition opening this first branch is not visible; the
// branch emits the compute PGM_RSRC1 and PGM_RSRC2 values.
1326
1327 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx),
1328 /*Size=*/4);
1329
1331 EmitResolvedOrExpr(CurrentProgramInfo.getComputePGMRSrc2(Ctx), /*Size=*/4);
1332
1334
1335 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1336 // appropriate generation.
// The shift is always 12; only the mask width differs per generation.
1337 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1338 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1339 /*Mask=*/0x3FFFF, /*Shift=*/12),
1340 /*Size=*/4);
1341 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1342 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1343 /*Mask=*/0x7FFF, /*Shift=*/12),
1344 /*Size=*/4);
1345 } else {
1346 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1347 /*Mask=*/0x1FFF, /*Shift=*/12),
1348 /*Size=*/4);
1349 }
1350
1351 // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1352 // 0" comment but I don't see a corresponding field in the register spec.
1353 } else {
// Other branch: emit the register id selected by getRsrcReg, followed by the
// packed VGPR/SGPR block counts.
1354 OutStreamer->emitInt32(RsrcReg);
1355
// VGPRBlocks is masked with 0x3F at shift 0, SGPRBlocks with 0x0F at shift 6,
// and the two are OR-ed into one 32-bit word.
1356 const MCExpr *GPRBlocks = MCBinaryExpr::createOr(
1357 SetBits(CurrentProgramInfo.VGPRBlocks, /*Mask=*/0x3F, /*Shift=*/0),
1358 SetBits(CurrentProgramInfo.SGPRBlocks, /*Mask=*/0x0F, /*Shift=*/6),
1359 MF.getContext());
1360 EmitResolvedOrExpr(GPRBlocks, /*Size=*/4);
1362
1363 // Sets bits according to S_0286E8_WAVESIZE_* mask and shift values for the
1364 // appropriate generation.
1365 if (STM.getGeneration() >= AMDGPUSubtarget::GFX12) {
1366 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1367 /*Mask=*/0x3FFFF, /*Shift=*/12),
1368 /*Size=*/4);
1369 } else if (STM.getGeneration() == AMDGPUSubtarget::GFX11) {
1370 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1371 /*Mask=*/0x7FFF, /*Shift=*/12),
1372 /*Size=*/4);
1373 } else {
1374 EmitResolvedOrExpr(SetBits(CurrentProgramInfo.ScratchBlocks,
1375 /*Mask=*/0x1FFF, /*Shift=*/12),
1376 /*Size=*/4);
1377 }
1378 }
1379
// NOTE(review): the condition guarding the block below is not visible in this
// listing. The block emits the extra LDS size and the PS input enable/address
// values; on GFX11 and later the LDS block count is halved (rounded up) first.
1382 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1383 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1384 : CurrentProgramInfo.LDSBlocks;
1385 OutStreamer->emitInt32(S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
1387 OutStreamer->emitInt32(MFI->getPSInputEnable());
1389 OutStreamer->emitInt32(MFI->getPSInputAddr());
1390 }
1391
// Always emitted: spilled SGPR and VGPR counts, each preceded by its id word.
1392 OutStreamer->emitInt32(R_SPILLED_SGPRS);
1393 OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
1394 OutStreamer->emitInt32(R_SPILLED_VGPRS);
1395 OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
1396}
1397
1398 // Helper function to add common PAL Metadata 3.0+
// Records hardware-stage fields for calling convention CC on MD, reading the
// values from CurrentProgramInfo; ST supplies the subtarget queries.
// NOTE(review): the first line of the signature is not visible in this
// listing.
1400 const SIProgramInfo &CurrentProgramInfo,
1401 CallingConv::ID CC, const GCNSubtarget &ST) {
// .ieee_mode is only written when the subtarget reports hasIEEEMode().
1402 if (ST.hasIEEEMode())
1403 MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
1404
1405 MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
1406 MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
1407
// Trap and exception fields are only written for compute calling conventions.
1408 if (AMDGPU::isCompute(CC)) {
1409 MD->setHwStage(CC, ".trap_present",
1410 (bool)CurrentProgramInfo.TrapHandlerEnable);
1411 MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
1412 }
1413
// LdsSize is scaled by the LDS dword granularity and by sizeof(uint32_t);
// presumably this converts a block count to bytes - TODO confirm.
1414 MD->setHwStage(CC, ".lds_size",
1415 (unsigned)(CurrentProgramInfo.LdsSize *
1416 getLdsDwGranularity(ST) * sizeof(uint32_t)));
1417}
1418
1419 // This is the equivalent of EmitProgramInfoSI above, but for when the OS type
1420 // is AMDPAL. It stores each compute/SPI register setting and other PAL
1421 // metadata items into the PALMD::Metadata, combining with any provided by the
1422 // frontend as LLVM metadata. Once all functions are written, the PAL metadata
1423 // is then written as a single block in the .note section.
//
// MF                 - function whose metadata is recorded.
// CurrentProgramInfo - previously computed program info for MF.
//
// NOTE(review): a few statements (including the declaration of MFI and the
// condition that opens the pixel-shader-specific section) are absent from
// this listing.
1424void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
1425 const SIProgramInfo &CurrentProgramInfo) {
1427 auto CC = MF.getFunction().getCallingConv();
1428 auto *MD = getTargetStreamer()->getPALMetadata();
1429 auto &Ctx = MF.getContext();
1430
1431 MD->setEntryPoint(CC, MF.getFunction().getName());
1432 MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
1433
1434 // Only set AGPRs for supported devices
1435 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1436 if (STM.hasMAIInsts()) {
1437 MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
1438 }
1439
1440 MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU, Ctx);
// PAL major versions below 3 take raw RSRC1/RSRC2 register values; later
// versions take named hardware-stage fields instead.
1441 if (MD->getPALMajorVersion() < 3) {
1442 MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM, Ctx), Ctx);
1443 if (AMDGPU::isCompute(CC)) {
1444 MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1445 } else {
// Non-compute: RSRC2 only gets the scratch-enable bit, set when
// ScratchBlocks > 0.
1446 const MCExpr *HasScratchBlocks =
1447 MCBinaryExpr::createGT(CurrentProgramInfo.ScratchBlocks,
1448 MCConstantExpr::create(0, Ctx), Ctx);
1449 auto [Shift, Mask] = getShiftMask(C_00B84C_SCRATCH_EN);
1450 MD->setRsrc2(CC, maskShiftSet(HasScratchBlocks, Mask, Shift, Ctx), Ctx);
1451 }
1452 } else {
1453 MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
1454 MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
1455 CurrentProgramInfo.ScratchEnable);
1456 EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
1457 }
1458
1459 // ScratchSize is in bytes, 16 aligned.
1460 MD->setScratchSize(
1461 CC,
1462 AMDGPUMCExpr::createAlignTo(CurrentProgramInfo.ScratchSize,
1463 MCConstantExpr::create(16, Ctx), Ctx),
1464 Ctx);
1465
// NOTE(review): the condition opening this section is not visible here. The
// section records the extra LDS size and PS input enable/address settings; on
// GFX11 and later the LDS block count is halved (rounded up) first.
1467 unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
1468 ? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
1469 : CurrentProgramInfo.LDSBlocks;
1470 if (MD->getPALMajorVersion() < 3) {
1471 MD->setRsrc2(
1472 CC,
1474 Ctx);
1475 MD->setSpiPsInputEna(MFI->getPSInputEnable());
1476 MD->setSpiPsInputAddr(MFI->getPSInputAddr());
1477 } else {
1478 // Graphics registers
// The granularity constant is 256 on GFX11 and later, 128 otherwise.
1479 const unsigned ExtraLdsDwGranularity =
1480 STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
1481 MD->setGraphicsRegisters(
1482 ".ps_extra_lds_size",
1483 (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
1484
1485 // Set each .spi_ps_input_ena / .spi_ps_input_addr field from the matching bit of PSInputEna / PSInputAddr.
// The table is indexed by bit position: entry Idx names bit Idx of both masks.
1486 static StringLiteral const PsInputFields[] = {
1487 ".persp_sample_ena", ".persp_center_ena",
1488 ".persp_centroid_ena", ".persp_pull_model_ena",
1489 ".linear_sample_ena", ".linear_center_ena",
1490 ".linear_centroid_ena", ".line_stipple_tex_ena",
1491 ".pos_x_float_ena", ".pos_y_float_ena",
1492 ".pos_z_float_ena", ".pos_w_float_ena",
1493 ".front_face_ena", ".ancillary_ena",
1494 ".sample_coverage_ena", ".pos_fixed_pt_ena"};
1495 unsigned PSInputEna = MFI->getPSInputEnable();
1496 unsigned PSInputAddr = MFI->getPSInputAddr();
1497 for (auto [Idx, Field] : enumerate(PsInputFields)) {
1498 MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
1499 (bool)((PSInputEna >> Idx) & 1));
1500 MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
1501 (bool)((PSInputAddr >> Idx) & 1));
1502 }
1503 }
1504 }
1505
1506 // For version 3 and above the wave front size is already set in the metadata
1507 if (MD->getPALMajorVersion() < 3 && STM.isWave32())
1508 MD->setWave32(MF.getFunction().getCallingConv());
1509}
1510
// Records per-function PAL metadata for MF, keyed by the function's name:
// scratch (stack) size, compute register or hardware-stage settings, LDS size
// and used VGPR/SGPR counts.
// NOTE(review): the declaration of ST and the first argument of setRsrc1 are
// not visible in this listing; CurrentProgramInfo is not declared in this
// function and is presumably a class member - confirm against the header.
1511void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
1512 auto *MD = getTargetStreamer()->getPALMetadata();
1513 const MachineFrameInfo &MFI = MF.getFrameInfo();
1514 StringRef FnName = MF.getFunction().getName();
1515 MD->setFunctionScratchSize(FnName, MFI.getStackSize());
1517 MCContext &Ctx = MF.getContext();
1518
// As in EmitPALMetadata: PAL major versions below 3 take raw register values,
// later versions take named hardware-stage fields. The settings are recorded
// under CallingConv::AMDGPU_CS in both cases.
1519 if (MD->getPALMajorVersion() < 3) {
1520 // Set compute registers
1521 MD->setRsrc1(
1523 CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST, Ctx), Ctx);
1524 MD->setRsrc2(CallingConv::AMDGPU_CS,
1525 CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
1526 } else {
1527 EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
1528 }
1529
1530 // Set optional info
1531 MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
1532 MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
1533 MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
1534}
1535
1536 // This is supposed to be log2(Size)
// Maps a private element size of 4, 8 or 16 to the corresponding
// AMD_ELEMENT_*_BYTES enumerator. Any other size is treated as a programming
// error and reaches llvm_unreachable.
// NOTE(review): the signature line is not visible in this listing.
1538 switch (Size) {
1539 case 4:
1540 return AMD_ELEMENT_4_BYTES;
1541 case 8:
1542 return AMD_ELEMENT_8_BYTES;
1543 case 16:
1544 return AMD_ELEMENT_16_BYTES;
1545 default:
1546 llvm_unreachable("invalid private_element_size");
1547 }
1548}
1549
// Fills Out with the kernel code header fields for the kernel function in MF.
//
// Out                - header structure to populate; reset via initDefault
//                      first.
// CurrentProgramInfo - previously computed program info for MF.
// MF                 - must be an AMDGPU_KERNEL or SPIR_KERNEL function
//                      (asserted).
//
// NOTE(review): most assignment statements into Out (the targets of the
// PGM_RSRC expressions and the bodies of the UserSGPRInfo checks) are absent
// from this listing, as is the declaration of MFI.
1550void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out,
1551 const SIProgramInfo &CurrentProgramInfo,
1552 const MachineFunction &MF) const {
1553 const Function &F = MF.getFunction();
1554 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1555 F.getCallingConv() == CallingConv::SPIR_KERNEL);
1556
1558 const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1559 MCContext &Ctx = MF.getContext();
1560
1561 Out.initDefault(&STM, Ctx, /*InitMCExpr=*/false);
1562
1564 CurrentProgramInfo.getComputePGMRSrc1(STM, Ctx);
1566 CurrentProgramInfo.getComputePGMRSrc2(Ctx);
1568
1569 Out.is_dynamic_callstack = CurrentProgramInfo.DynamicCallStack;
1570
1572 getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
1573
// One check per user SGPR kind; the statement executed for each check is not
// visible in this listing.
1574 const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
1575 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
1577 }
1578
1579 if (UserSGPRInfo.hasDispatchPtr())
1581
1582 if (UserSGPRInfo.hasQueuePtr())
1584
1585 if (UserSGPRInfo.hasKernargSegmentPtr())
1587
1588 if (UserSGPRInfo.hasDispatchID())
1590
1591 if (UserSGPRInfo.hasFlatScratchInit())
1593
1594 if (UserSGPRInfo.hasPrivateSegmentSize())
1596
1597 if (STM.isXNACKEnabled())
1599
// getKernArgSegmentSize also reports the maximum kernarg alignment through
// MaxKernArgAlign, which is used for kernarg_segment_alignment below.
1600 Align MaxKernArgAlign;
1601 Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1602 Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1603 Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1604 Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1605 Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1606
1607 // kernarg_segment_alignment is specified as log of the alignment.
1608 // The minimum alignment is 16.
1609 // FIXME: The metadata treats the minimum as 4?
1610 Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
1611}
1612
// Prints operand OpNo of the inline-asm instruction MI to O.
//
// ExtraCode - optional operand modifier string; only the single-character
//             modifier 'r' is accepted here beyond what the generic
//             AsmPrinter handles.
//
// Returns false when the operand was printed and true when it could not be
// handled (unknown modifier or unsupported operand kind).
// NOTE(review): the first line of the signature and a few body lines
// (register printing, the first immediate check) are not visible in this
// listing.
1614 const char *ExtraCode, raw_ostream &O) {
1615 // First try the generic code, which knows about modifiers like 'c' and 'n'.
1616 if (!AsmPrinter::PrintAsmOperand(MI, OpNo, ExtraCode, O))
1617 return false;
1618
1619 if (ExtraCode && ExtraCode[0]) {
1620 if (ExtraCode[1] != 0)
1621 return true; // Unknown modifier.
1622
1623 switch (ExtraCode[0]) {
1624 case 'r':
1625 break;
1626 default:
1627 return true;
1628 }
1629 }
1630
1631 // TODO: Should be able to support other operand types like globals.
1632 const MachineOperand &MO = MI->getOperand(OpNo);
1633 if (MO.isReg()) {
1636 return false;
1637 }
1638 if (MO.isImm()) {
1639 int64_t Val = MO.getImm();
// Immediates that do not take the first (not visible) branch are printed in
// hex using the narrowest of 16, 32 or 64 bits that holds the value.
1641 O << Val;
1642 } else if (isUInt<16>(Val)) {
1643 O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
1644 } else if (isUInt<32>(Val)) {
1645 O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
1646 } else {
1647 O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
1648 }
1649 return false;
1650 }
1651 return true;
1652}
1653
1660}
1661
// Emits one optimization remark per resource-usage figure of MF through ORE
// under the remark name "kernel-resource-usage".
//
// MF                    - function being reported.
// CurrentProgramInfo    - source of the register, scratch, occupancy, spill
//                         and LDS figures.
// isModuleEntryFunction - when true, the LDS size remark is also emitted.
// hasMAIInsts           - when true, the AGPR count remark is also emitted.
//
// Does nothing when ORE is null.
// NOTE(review): the conditions of the two early returns after the ORE check,
// and one argument of the remark constructor, are not visible in this
// listing.
1662void AMDGPUAsmPrinter::emitResourceUsageRemarks(
1663 const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
1664 bool isModuleEntryFunction, bool hasMAIInsts) {
1665 if (!ORE)
1666 return;
1667
1668 const char *Name = "kernel-resource-usage";
1669 const char *Indent = " ";
1670
1671 // If the remark is not specifically enabled, do not output to yaml
1674 return;
1675
1676 // Currently non-kernel functions have no resources to emit.
1678 return;
1679
// Helper: emits a single analysis remark "<label>: <value>". Argument is a
// generic parameter, so strings and integers are both accepted.
1680 auto EmitResourceUsageRemark = [&](StringRef RemarkName,
1681 StringRef RemarkLabel, auto Argument) {
1682 // Add an indent for every line besides the line with the kernel name. This
1683 // makes it easier to tell which resource usage go with which kernel since
1684 // the kernel name will always be displayed first.
1685 std::string LabelStr = RemarkLabel.str() + ": ";
1686 if (RemarkName != "FunctionName")
1687 LabelStr = Indent + LabelStr;
1688
1689 ORE->emit([&]() {
1690 return MachineOptimizationRemarkAnalysis(Name, RemarkName,
1692 &MF.front())
1693 << LabelStr << ore::NV(RemarkName, Argument);
1694 });
1695 };
1696
1697 // FIXME: Formatting here is pretty nasty because clang does not accept
1698 // newlines from diagnostics. This forces us to emit multiple diagnostic
1699 // remarks to simulate newlines. If and when clang does accept newlines, this
1700 // formatting should be aggregated into one remark with newlines to avoid
1701 // printing multiple diagnostic location and diag opts.
1702 EmitResourceUsageRemark("FunctionName", "Function Name",
1703 MF.getFunction().getName());
// MCExpr-valued fields are rendered to strings with getMCExprStr.
1704 EmitResourceUsageRemark("NumSGPR", "TotalSGPRs",
1705 getMCExprStr(CurrentProgramInfo.NumSGPR));
1706 EmitResourceUsageRemark("NumVGPR", "VGPRs",
1707 getMCExprStr(CurrentProgramInfo.NumArchVGPR));
1708 if (hasMAIInsts) {
1709 EmitResourceUsageRemark("NumAGPR", "AGPRs",
1710 getMCExprStr(CurrentProgramInfo.NumAccVGPR));
1711 }
1712 EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
1713 getMCExprStr(CurrentProgramInfo.ScratchSize));
// "True" is reported only when DynamicCallStack evaluates to a non-zero
// absolute value; an expression that cannot be evaluated reports "False".
1714 int64_t DynStack;
1715 bool DynStackEvaluatable =
1716 CurrentProgramInfo.DynamicCallStack->evaluateAsAbsolute(DynStack);
1717 StringRef DynamicStackStr =
1718 DynStackEvaluatable && DynStack ? "True" : "False";
1719 EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
1720 EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
1721 getMCExprStr(CurrentProgramInfo.Occupancy));
1722 EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
1723 CurrentProgramInfo.SGPRSpill);
1724 EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
1725 CurrentProgramInfo.VGPRSpill);
1726 if (isModuleEntryFunction)
1727 EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
1728 CurrentProgramInfo.LDSSize);
1729}
#define Success
LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter()
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD, const SIProgramInfo &CurrentProgramInfo, CallingConv::ID CC, const GCNSubtarget &ST)
static unsigned getRsrcReg(CallingConv::ID CallConv)
static amd_element_byte_size_t getElementByteSizeValue(unsigned Size)
static uint32_t getFPMode(SIModeRegisterDefaults Mode)
static const MCExpr * computeAccumOffset(const MCExpr *NumVGPR, MCContext &Ctx)
static AsmPrinter * createAMDGPUAsmPrinterPass(TargetMachine &tm, std::unique_ptr< MCStreamer > &&Streamer)
AMDGPU Assembly printer class.
AMDGPU HSA Metadata Streamer.
AMDHSA kernel descriptor MCExpr struct for use in MC layer.
MC infrastructure to propagate the function level resource usage info.
Analyzes how many registers and other resources are used by functions.
AMDHSA kernel descriptor definitions.
MC layer struct for AMDGPUMCKernelCodeT, provides MCExpr functionality where required.
amd_element_byte_size_t
The values used to define the number of bytes to use for the swizzle element size.
@ AMD_ELEMENT_8_BYTES
@ AMD_ELEMENT_16_BYTES
@ AMD_ELEMENT_4_BYTES
#define AMD_HSA_BITS_SET(dst, mask, val)
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID
@ AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE
@ AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
@ AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR
@ AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED
@ AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT
@ AMD_CODE_PROPERTY_IS_PTR64
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_EXTERNAL_VISIBILITY
Definition: Compiler.h:128
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
std::string Name
uint64_t Size
Symbol * Sym
Definition: ELF_riscv.cpp:479
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition: MD5.cpp:55
===- MachineOptimizationRemarkEmitter.h - Opt Diagnostics -*- C++ -*-—===//
R600 Assembly printer class.
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS
Definition: SIDefines.h:1089
#define R_0286E8_SPI_TMPRING_SIZE
Definition: SIDefines.h:1227
#define FP_ROUND_MODE_DP(x)
Definition: SIDefines.h:1209
#define C_00B84C_SCRATCH_EN
Definition: SIDefines.h:1125
#define FP_ROUND_ROUND_TO_NEAREST
Definition: SIDefines.h:1201
#define R_0286D0_SPI_PS_INPUT_ADDR
Definition: SIDefines.h:1160
#define R_00B860_COMPUTE_TMPRING_SIZE
Definition: SIDefines.h:1222
#define R_00B428_SPI_SHADER_PGM_RSRC1_HS
Definition: SIDefines.h:1112
#define R_00B328_SPI_SHADER_PGM_RSRC1_ES
Definition: SIDefines.h:1111
#define R_00B528_SPI_SHADER_PGM_RSRC1_LS
Definition: SIDefines.h:1120
#define R_0286CC_SPI_PS_INPUT_ENA
Definition: SIDefines.h:1159
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS
Definition: SIDefines.h:1098
#define FP_DENORM_MODE_DP(x)
Definition: SIDefines.h:1220
#define R_00B848_COMPUTE_PGM_RSRC1
Definition: SIDefines.h:1162
#define R_SPILLED_SGPRS
Definition: SIDefines.h:1241
#define FP_ROUND_MODE_SP(x)
Definition: SIDefines.h:1208
#define FP_DENORM_MODE_SP(x)
Definition: SIDefines.h:1219
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS
Definition: SIDefines.h:1103
#define R_SPILLED_VGPRS
Definition: SIDefines.h:1242
#define S_00B02C_EXTRA_LDS_SIZE(x)
Definition: SIDefines.h:1097
#define R_00B84C_COMPUTE_PGM_RSRC2
Definition: SIDefines.h:1122
#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS
Definition: SIDefines.h:1096
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
void emitFunctionEntryLabel() override
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
const MCSubtargetInfo * getGlobalSTI() const
void emitImplicitDef(const MachineInstr *MI) const override
Targets can override this to customize the output of IMPLICIT_DEF instructions in verbose mode.
std::vector< std::string > DisasmLines
void emitStartOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the start of their fi...
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
std::vector< std::string > HexLines
void emitGlobalVariable(const GlobalVariable *GV) override
Emit the specified global variable to the .s file.
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &O) override
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
bool runOnMachineFunction(MachineFunction &MF) override
Emit the specified function out to the OutStreamer.
void emitFunctionBodyEnd() override
Targets can override this to emit stuff after the last basic block in the function.
bool doFinalization(Module &M) override
Shut down the asmprinter.
void emitEndOfAsmFile(Module &M) override
This virtual method can be overridden by targets that want to emit something at the end of their file...
AMDGPUAsmPrinter(TargetMachine &TM, std::unique_ptr< MCStreamer > Streamer)
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
void emitFunctionBodyStart() override
Targets can override this to emit stuff before the first basic block in the function.
void emitBasicBlockStart(const MachineBasicBlock &MBB) override
Targets can override this to emit stuff at the start of a basic block.
AMDGPUTargetStreamer * getTargetStreamer() const
static void printRegOperand(MCRegister Reg, raw_ostream &O, const MCRegisterInfo &MRI)
static const AMDGPUMCExpr * createMax(ArrayRef< const MCExpr * > Args, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:69
static const AMDGPUMCExpr * createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs, const GCNSubtarget &STM, MCContext &Ctx)
Mimics GCNSubtarget::computeOccupancy for MCExpr.
static const AMDGPUMCExpr * createTotalNumVGPR(const MCExpr *NumAGPR, const MCExpr *NumVGPR, MCContext &Ctx)
static const AMDGPUMCExpr * createExtraSGPRs(const MCExpr *VCCUsed, const MCExpr *FlatScrUsed, bool XNACKUsed, MCContext &Ctx)
Allow delayed MCExpr resolve of ExtraSGPRs (in case VCCUsed or FlatScrUsed are unresolvable but neede...
static const AMDGPUMCExpr * createAlignTo(const MCExpr *Value, const MCExpr *Align, MCContext &Ctx)
Definition: AMDGPUMCExpr.h:83
void setHwStage(unsigned CC, StringRef field, unsigned Val)
unsigned getAddressableLocalMemorySize() const
Return the maximum number of bytes of LDS that can be allocated to a single workgroup.
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
unsigned getWavefrontSize() const
virtual void EmitAmdhsaKernelDescriptor(const MCSubtargetInfo &STI, StringRef KernelName, const AMDGPU::MCKernelDescriptor &KernelDescriptor, const MCExpr *NextVGPR, const MCExpr *NextSGPR, const MCExpr *ReserveVCC, const MCExpr *ReserveFlatScr)
AMDGPUPALMetadata * getPALMetadata()
virtual void EmitMCResourceInfo(const MCSymbol *NumVGPR, const MCSymbol *NumAGPR, const MCSymbol *NumExplicitSGPR, const MCSymbol *PrivateSegmentSize, const MCSymbol *UsesVCC, const MCSymbol *UsesFlatScratch, const MCSymbol *HasDynamicallySizedStack, const MCSymbol *HasRecursion, const MCSymbol *HasIndirectCall)
virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV)
void initializeTargetID(const MCSubtargetInfo &STI)
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI)
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type)
virtual void EmitMCResourceMaximums(const MCSymbol *MaxVGPR, const MCSymbol *MaxAGPR, const MCSymbol *MaxSGPR)
virtual void EmitDirectiveAMDGCNTarget()
virtual void EmitAMDKernelCodeT(AMDGPU::AMDGPUMCKernelCodeT &Header)
const std::optional< AMDGPU::IsaInfo::AMDGPUTargetID > & getTargetID() const
virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI, bool TrapEnabled)
void setXnackSetting(TargetIDSetting NewXnackSetting)
Sets xnack setting to NewXnackSetting.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
This class is intended to be used as a driving class for all asm writers.
Definition: AsmPrinter.h:86
const TargetLoweringObjectFile & getObjFileLowering() const
Return information about object file lowering.
Definition: AsmPrinter.cpp:408
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition: AsmPrinter.cpp:697
virtual void emitGlobalVariable(const GlobalVariable *GV)
Emit the specified global variable to the .s file.
Definition: AsmPrinter.cpp:719
TargetMachine & TM
Target machine description.
Definition: AsmPrinter.h:89
const MCAsmInfo * MAI
Target Asm Printer information.
Definition: AsmPrinter.h:92
MachineFunction * MF
The current machine function.
Definition: AsmPrinter.h:104
virtual void SetupMachineFunction(MachineFunction &MF)
This should be called when a new MachineFunction is being processed from runOnMachineFunction.
void emitFunctionBody()
This method emits the body and trailer for a function.
virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const
Return true if the basic block has exactly one predecessor and the control transfer mechanism between...
bool doInitialization(Module &M) override
Set up the AsmPrinter when we are working on a new module.
Definition: AsmPrinter.cpp:459
virtual void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const
This emits linkage information about GVSym based on GV, if this is supported by the target.
Definition: AsmPrinter.cpp:652
void getAnalysisUsage(AnalysisUsage &AU) const override
Record analysis usage.
Definition: AsmPrinter.cpp:450
unsigned getFunctionNumber() const
Return a unique ID for the current function.
Definition: AsmPrinter.cpp:404
MachineOptimizationRemarkEmitter * ORE
Optimization remark emitter.
Definition: AsmPrinter.h:116
MCSymbol * CurrentFnSym
The symbol for the current function.
Definition: AsmPrinter.h:123
MachineModuleInfo * MMI
This is a pointer to the current MachineModuleInfo.
Definition: AsmPrinter.h:107
MCContext & OutContext
This is the context for the output file that we are streaming.
Definition: AsmPrinter.h:96
bool doFinalization(Module &M) override
Shut down the asmprinter.
virtual void emitBasicBlockStart(const MachineBasicBlock &MBB)
Targets can override this to emit stuff at the start of a basic block.
void emitVisibility(MCSymbol *Sym, unsigned Visibility, bool IsDefinition=true) const
This emits visibility information about symbol, if this is supported by the target.
std::unique_ptr< MCStreamer > OutStreamer
This is the MCStreamer object for the file we are generating.
Definition: AsmPrinter.h:101
bool isVerbose() const
Return true if assembly output should contain comments.
Definition: AsmPrinter.h:257
void getNameWithPrefix(SmallVectorImpl< char > &Name, const GlobalValue *GV) const
Definition: AsmPrinter.cpp:692
virtual void emitFunctionEntryLabel()
EmitFunctionEntryLabel - Emit the label that is the entrypoint for the function.
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, const char *ExtraCode, raw_ostream &OS)
Print the specified operand of MI, an INLINEASM instruction, using the specified assembler variant.
A parsed version of the target data layout string, and methods for querying it.
Definition: DataLayout.h:63
Diagnostic information for optimization failures.
Diagnostic information for stack size etc.
DISubprogram * getSubprogram() const
Get the attached subprogram.
Definition: Metadata.cpp:1874
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function...
Definition: Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:369
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasGFX90AInsts() const
unsigned computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Return occupancy for the given function.
bool hasMAIInsts() const
Definition: GCNSubtarget.h:837
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:279
bool hasSGPRInitBug() const
bool isTgSplitEnabled() const
Definition: GCNSubtarget.h:623
unsigned getMinNumVGPRs(unsigned WavesPerEU) const
bool isCuModeEnabled() const
Definition: GCNSubtarget.h:627
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
Definition: GCNSubtarget.h:317
bool dumpCode() const
Definition: GCNSubtarget.h:523
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:615
bool isWave32() const
unsigned getMaxNumUserSGPRs() const
Generation getGeneration() const
Definition: GCNSubtarget.h:327
unsigned getAddressableNumSGPRs() const
unsigned getMaxWaveScratchSize() const
Definition: GCNSubtarget.h:331
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasPrivateSegmentSize() const
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:79
VisibilityTypes getVisibility() const
Definition: GlobalValue.h:248
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition: Globals.cpp:296
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:130
Type * getValueType() const
Definition: GlobalValue.h:296
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
const DiagnosticHandler * getDiagHandlerPtr() const
getDiagHandlerPtr - Returns const raw pointer of DiagnosticHandler set by setDiagnosticHandler.
MCCodeEmitter * getEmitterPtr() const
Definition: MCAssembler.h:186
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:542
static const MCBinaryExpr * createAdd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:602
static const MCBinaryExpr * createLOr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:572
static const MCBinaryExpr * createMul(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:592
static const MCBinaryExpr * createGT(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:557
static const MCBinaryExpr * createDiv(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:547
static const MCBinaryExpr * createShl(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:622
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:222
Context object for machine code objects.
Definition: MCContext.h:83
const MCObjectFileInfo * getObjectFileInfo() const
Definition: MCContext.h:416
MCSectionELF * getELFSection(const Twine &Section, unsigned Type, unsigned Flags)
Definition: MCContext.h:551
void reportError(SMLoc L, const Twine &Msg)
Definition: MCContext.cpp:1072
Base class for the full range of assembler expressions which are needed for parsing.
Definition: MCExpr.h:34
MCSection * getReadOnlySection() const
MCContext & getContext() const
void gatherResourceInfo(const MachineFunction &MF, const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &FRI, MCContext &OutContext)
AMDGPUResourceUsageAnalysis gathers resource usage on a per-function granularity.
MCSymbol * getMaxSGPRSymbol(MCContext &OutContext)
MCSymbol * getMaxAGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumVGPRs(const MachineFunction &MF, MCContext &Ctx)
void finalize(MCContext &OutContext)
MCSymbol * getMaxVGPRSymbol(MCContext &OutContext)
const MCExpr * createTotalNumSGPRs(const MachineFunction &MF, bool hasXnack, MCContext &Ctx)
MCSymbol * getSymbol(StringRef FuncName, ResourceInfoKind RIK, MCContext &OutContext)
This represents a section on Linux, lots of Unix variants, and some bare metal systems.
Definition: MCSectionELF.h:27
void ensureMinAlignment(Align MinAlignment)
Makes sure that Alignment is at least MinAlignment.
Definition: MCSection.h:150
MCContext & getContext() const
Definition: MCStreamer.h:300
Generic base class for all target subtargets.
const Triple & getTargetTriple() const
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:398
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
const MCExpr * getVariableValue(bool SetUsed=true) const
getVariableValue - Get the value for variable symbols.
Definition: MCSymbol.h:305
bool isDefined() const
isDefined - Check if this symbol is defined (i.e., it has an address).
Definition: MCSymbol.h:250
StringRef getName() const
getName - Get the symbol name.
Definition: MCSymbol.h:205
bool isVariable() const
isVariable - Check if this is a variable symbol.
Definition: MCSymbol.h:300
void redefineIfPossible()
Prepare this symbol to be redefined.
Definition: MCSymbol.h:232
MCStreamer & getStreamer()
Definition: MCStreamer.h:102
static const MCUnaryExpr * createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc=SMLoc())
Definition: MCExpr.h:467
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
void setAlignment(Align A)
setAlignment - Set the alignment of the function.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
Representation of each machine instruction.
Definition: MachineInstr.h:69
This class contains meta information specific to a module.
MachineFunction * getMachineFunction(const Function &F) const
Returns the MachineFunction associated to IR function F if there is one, otherwise nullptr.
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
Diagnostic information for optimization analysis remarks.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Emit an optimization remark.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
GCNUserSGPRUsageInfo & getUserSGPRInfo()
SIModeRegisterDefaults getMode() const
unsigned getNumKernargPreloadedSGPRs() const
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:853
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
std::string str() const
str - Get the contents as an std::string.
Definition: StringRef.h:229
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
const Triple & getTargetTriple() const
const MCSubtargetInfo * getMCSubtargetInfo() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
MCSymbol * getSymbol(const GlobalValue *GV) const
virtual const TargetRegisterInfo * getRegisterInfo() const
getRegisterInfo - If register information is available, return it.
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:392
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
LLVM Value Representation.
Definition: Value.h:74
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to a SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ LOCAL_ADDRESS
Address space for local memory.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getNumExtraSGPRs(const MCSubtargetInfo *STI, bool VCCUsed, bool FlatScrUsed, bool XNACKUsed)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR)
void printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, const MCAsmInfo *MAI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
IsaVersion getIsaVersion(StringRef GPU)
bool isCompute(CallingConv::ID cc)
const MCExpr * maskShiftSet(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
Provided with the MCExpr * Val, uint32 Mask and Shift, will return the masked and left shifted,...
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX90A(const MCSubtargetInfo &STI)
bool hasMAIInsts(const MCSubtargetInfo &STI)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isShader(CallingConv::ID cc)
const MCExpr * foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx)
bool isGFX10Plus(const MCSubtargetInfo &STI)
constexpr std::pair< unsigned, unsigned > getShiftMask(unsigned Value)
Deduce the least significant bit aligned shift and mask values for a binary Complement Value (as they...
unsigned hasKernargPreload(const MCSubtargetInfo &STI)
bool isModuleEntryFunctionCC(CallingConv::ID CC)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ SPIR_KERNEL
Used for SPIR kernel functions.
Definition: CallingConv.h:144
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ SHT_PROGBITS
Definition: ELF.h:1095
@ STT_AMDGPU_HSA_KERNEL
Definition: ELF.h:1372
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
Target & getTheR600Target()
The target for R600 GPUs.
@ DK_ResourceLimit
AsmPrinter * createR600AsmPrinterPass(TargetMachine &TM, std::unique_ptr< MCStreamer > &&Streamer)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
Target & getTheGCNTarget()
The target for GCN GPUs.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
@ DS_Error
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:208
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
const SIFunctionResourceInfo & getResourceInfo() const
void validate(const MCSubtargetInfo *STI, MCContext &Ctx)
void initDefault(const MCSubtargetInfo *STI, MCContext &Ctx, bool InitMCExpr=true)
static const MCExpr * bits_get(const MCExpr *Src, uint32_t Shift, uint32_t Mask, MCContext &Ctx)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
virtual bool isAnalysisRemarkEnabled(StringRef PassName) const
Return true if analysis remarks are enabled, override to provide different implementation.
Track resource usage for kernels / entry functions.
Definition: SIProgramInfo.h:31
const MCExpr * NumSGPR
Definition: SIProgramInfo.h:70
const MCExpr * ComputePGMRSrc3GFX90A
Definition: SIProgramInfo.h:63
const MCExpr * NumArchVGPR
Definition: SIProgramInfo.h:66
const MCExpr * getComputePGMRSrc2(MCContext &Ctx) const
Compute the value of the ComputePGMRsrc2 register.
const MCExpr * VGPRBlocks
Definition: SIProgramInfo.h:33
const MCExpr * ScratchBlocks
Definition: SIProgramInfo.h:48
const MCExpr * getComputePGMRSrc1(const GCNSubtarget &ST, MCContext &Ctx) const
Compute the value of the ComputePGMRsrc1 register.
const MCExpr * VCCUsed
Definition: SIProgramInfo.h:90
const MCExpr * FlatUsed
Definition: SIProgramInfo.h:74
uint32_t TrapHandlerEnable
Definition: SIProgramInfo.h:53
const MCExpr * ScratchEnable
Definition: SIProgramInfo.h:51
const MCExpr * AccumOffset
Definition: SIProgramInfo.h:68
const MCExpr * NumAccVGPR
Definition: SIProgramInfo.h:67
const MCExpr * DynamicCallStack
Definition: SIProgramInfo.h:87
const MCExpr * SGPRBlocks
Definition: SIProgramInfo.h:34
const MCExpr * NumVGPRsForWavesPerEU
Definition: SIProgramInfo.h:80
const MCExpr * NumVGPR
Definition: SIProgramInfo.h:65
const MCExpr * getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST, MCContext &Ctx) const
const MCExpr * Occupancy
Definition: SIProgramInfo.h:83
const MCExpr * ScratchSize
Definition: SIProgramInfo.h:44
const MCExpr * NumSGPRsForWavesPerEU
Definition: SIProgramInfo.h:77
void reset(const MachineFunction &MF)
static void RegisterAsmPrinter(Target &T, Target::AsmPrinterCtorTy Fn)
RegisterAsmPrinter - Register an AsmPrinter implementation for the given target.