Line data Source code
1 : //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer -------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : ///
12 : /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
13 : /// code. When passed an MCAsmStreamer it prints assembly and when passed
14 : /// an MCObjectStreamer it outputs binary code.
15 : //
16 : //===----------------------------------------------------------------------===//
17 : //
18 :
19 : #include "AMDGPUAsmPrinter.h"
20 : #include "AMDGPU.h"
21 : #include "AMDGPUSubtarget.h"
22 : #include "AMDGPUTargetMachine.h"
23 : #include "InstPrinter/AMDGPUInstPrinter.h"
24 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
25 : #include "MCTargetDesc/AMDGPUTargetStreamer.h"
26 : #include "R600AsmPrinter.h"
27 : #include "R600Defines.h"
28 : #include "R600MachineFunctionInfo.h"
29 : #include "R600RegisterInfo.h"
30 : #include "SIDefines.h"
31 : #include "SIInstrInfo.h"
32 : #include "SIMachineFunctionInfo.h"
33 : #include "SIRegisterInfo.h"
34 : #include "Utils/AMDGPUBaseInfo.h"
35 : #include "llvm/BinaryFormat/ELF.h"
36 : #include "llvm/CodeGen/MachineFrameInfo.h"
37 : #include "llvm/IR/DiagnosticInfo.h"
38 : #include "llvm/MC/MCContext.h"
39 : #include "llvm/MC/MCSectionELF.h"
40 : #include "llvm/MC/MCStreamer.h"
41 : #include "llvm/Support/AMDGPUMetadata.h"
42 : #include "llvm/Support/MathExtras.h"
43 : #include "llvm/Support/TargetParser.h"
44 : #include "llvm/Support/TargetRegistry.h"
45 : #include "llvm/Target/TargetLoweringObjectFile.h"
46 :
47 : using namespace llvm;
48 : using namespace llvm::AMDGPU;
49 :
50 : // TODO: This should get the default rounding mode from the kernel. We just set
51 : // the default here, but this could change if the OpenCL rounding mode pragmas
52 : // are used.
53 : //
54 : // The denormal mode here should match what is reported by the OpenCL runtime
55 : // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
56 : // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
57 : //
58 : // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
59 : // precision, and leaves single precision to flush all and does not report
60 : // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
61 : // CL_FP_DENORM for both.
62 : //
63 : // FIXME: It seems some instructions do not support single precision denormals
64 : // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
65 : // and sin_f32, cos_f32 on most parts).
66 :
67 : // We want to use these instructions, and using fp32 denormals also causes
68 : // instructions to run at the double precision rate for the device so it's
69 : // probably best to just report no single precision denormals.
70 : static uint32_t getFPMode(const MachineFunction &F) {
71 17960 : const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
72 : // TODO: Is there any real use for the flush in only / flush out only modes?
73 :
74 : uint32_t FP32Denormals =
75 17960 : ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
76 :
77 : uint32_t FP64Denormals =
78 17960 : ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
79 :
80 : return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
81 : FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
82 17960 : FP_DENORM_MODE_SP(FP32Denormals) |
83 17960 : FP_DENORM_MODE_DP(FP64Denormals);
84 : }
85 :
86 : static AsmPrinter *
87 1950 : createAMDGPUAsmPrinterPass(TargetMachine &tm,
88 : std::unique_ptr<MCStreamer> &&Streamer) {
89 1950 : return new AMDGPUAsmPrinter(tm, std::move(Streamer));
90 : }
91 :
92 65841 : extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
93 65841 : TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
94 : llvm::createR600AsmPrinterPass);
95 65841 : TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
96 : createAMDGPUAsmPrinterPass);
97 65841 : }
98 :
// Construct the printer. All interesting state lives in the AsmPrinter base
// class, which takes ownership of the output streamer.
AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                   std::unique_ptr<MCStreamer> Streamer)
  : AsmPrinter(TM, std::move(Streamer)) {
}
103 :
104 0 : StringRef AMDGPUAsmPrinter::getPassName() const {
105 0 : return "AMDGPU Assembly Printer";
106 : }
107 :
108 100767 : const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
109 201534 : return TM.getMCSubtargetInfo();
110 : }
111 :
112 10035 : AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
113 10035 : if (!OutStreamer)
114 : return nullptr;
115 10035 : return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
116 : }
117 :
118 1949 : void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
119 1949 : if (IsaInfo::hasCodeObjectV3(getSTI())) {
120 : std::string ExpectedTarget;
121 0 : raw_string_ostream ExpectedTargetOS(ExpectedTarget);
122 36 : IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
123 :
124 36 : getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
125 :
126 36 : if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
127 36 : return;
128 : }
129 :
130 1913 : if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
131 : TM.getTargetTriple().getOS() != Triple::AMDPAL)
132 : return;
133 :
134 369 : if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
135 324 : HSAMetadataStream.begin(M);
136 :
137 369 : if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
138 45 : readPALMetadata(M);
139 :
140 : // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
141 369 : if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
142 324 : getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
143 :
144 : // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
145 369 : IsaVersion Version = getIsaVersion(getSTI()->getCPU());
146 369 : getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
147 369 : Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
148 : }
149 :
// Emit the per-module epilogue notes: ISA version, HSA metadata, and PAL
// metadata, depending on the OS type. Skipped entirely for code object v3
// on HSA (see TODO).
void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
  // TODO: Add metadata to code object v3.
  if (IsaInfo::hasCodeObjectV3(getSTI()) &&
      TM.getTargetTriple().getOS() == Triple::AMDHSA)
    return;

  // Following code requires TargetStreamer to be present.
  if (!getTargetStreamer())
    return;

  // Emit ISA Version (NT_AMD_AMDGPU_ISA).
  std::string ISAVersionString;
  raw_string_ostream ISAVersionStream(ISAVersionString);
  IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
  getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());

  // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA). end() finalizes the
  // metadata collected per-kernel during the run.
  if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
    HSAMetadataStream.end();
    getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
  }

  // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
  if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
    // Copy the PAL metadata from the map where we collected it into a vector,
    // then write it as a .note. Each map entry becomes a key,value pair in
    // the flat vector.
    PALMD::Metadata PALMetadataVector;
    for (auto i : PALMetadataMap) {
      PALMetadataVector.push_back(i.first);
      PALMetadataVector.push_back(i.second);
    }
    getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
  }
}
184 :
// A block is only reachable by fallthrough if the generic AsmPrinter says so
// AND it is not the target of an S_SETPC_B64 long branch, whose label must
// still be emitted.
bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
  const MachineBasicBlock *MBB) const {
  if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
    return false;

  if (MBB->empty())
    return true;

  // If this is a block implementing a long branch, an expression relative to
  // the start of the block is needed, so the block's label must be kept.
  // XXX - Is there a smarter way to check this?
  return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
198 :
// Before the body of an entry function (kernel), emit the code-object-v2
// amd_kernel_code_t header and, on HSA, the per-kernel HSA metadata.
// Non-entry (callable) functions need nothing here.
void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;

  const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
  const Function &F = MF->getFunction();
  // Code object v2 kernels on HSA/Mesa carry an amd_kernel_code_t header;
  // v3 uses the kernel descriptor emitted in EmitFunctionBodyEnd instead.
  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
      (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
       F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
    amd_kernel_code_t KernelCode;
    getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
    getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
  }

  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  if (!STM.hasCodeObjectV3() && STM.isAmdHsaOS())
    HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
}
220 :
// After the body of a code-object-v3 HSA kernel, emit its kernel descriptor
// into the read-only section, preserving the current section around it.
void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
  if (!MFI.isEntryFunction())
    return;
  // Kernel descriptors only exist for code object v3 on HSA.
  if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
      TM.getTargetTriple().getOS() != Triple::AMDHSA)
    return;

  auto &Streamer = getTargetStreamer()->getStreamer();
  auto &Context = Streamer.getContext();
  auto &ObjectFileInfo = *Context.getObjectFileInfo();
  auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();

  // Save the current section; the descriptor goes into .rodata.
  Streamer.PushSection();
  Streamer.SwitchSection(&ReadOnlySection);

  // CP microcode requires the kernel descriptor to be allocated on 64 byte
  // alignment.
  Streamer.EmitValueToAlignment(64, 0, 1, 0);
  if (ReadOnlySection.getAlignment() < 64)
    ReadOnlySection.setAlignment(64);

  SmallString<128> KernelName;
  getNameWithPrefix(KernelName, &MF->getFunction());
  // NumSGPRsForWavesPerEU includes the extra SGPRs (VCC/flat scratch/xnack);
  // the streamer wants the count without them, so subtract them back out.
  getTargetStreamer()->EmitAmdhsaKernelDescriptor(
      *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
      CurrentProgramInfo.NumVGPRsForWavesPerEU,
      CurrentProgramInfo.NumSGPRsForWavesPerEU -
          IsaInfo::getNumExtraSGPRs(getSTI(),
                                    CurrentProgramInfo.VCCUsed,
                                    CurrentProgramInfo.FlatUsed),
      CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
      hasXNACK(*getSTI()));

  // Restore whatever section the function body was being emitted into.
  Streamer.PopSection();
}
257 :
258 19722 : void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
259 19722 : if (IsaInfo::hasCodeObjectV3(getSTI()) &&
260 38 : TM.getTargetTriple().getOS() == Triple::AMDHSA) {
261 38 : AsmPrinter::EmitFunctionEntryLabel();
262 38 : return;
263 : }
264 :
265 19684 : const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
266 19684 : const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
267 19684 : if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
268 : SmallString<128> SymbolName;
269 2529 : getNameWithPrefix(SymbolName, &MF->getFunction()),
270 2529 : getTargetStreamer()->EmitAMDGPUSymbolType(
271 2529 : SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
272 : }
273 19684 : const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
274 19684 : if (STI.dumpCode()) {
275 : // Disassemble function name label to text.
276 6 : DisasmLines.push_back(MF->getName().str() + ":");
277 2 : DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
278 4 : HexLines.push_back("");
279 : }
280 :
281 19684 : AsmPrinter::EmitFunctionEntryLabel();
282 : }
283 :
// At the start of each basic block, record a "BB<fn>_<bb>:" label line for
// the -dump-code disassembly section, but only for blocks that can be
// reached by an explicit branch (fallthrough-only blocks get no label).
void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
  const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
  if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
    // Write a line for the basic block label if it is not only fallthrough.
    DisasmLines.push_back(
        (Twine("BB") + Twine(getFunctionNumber())
         + "_" + Twine(MBB.getNumber()) + ":").str());
    // Track the widest line so hex bytes can be column-aligned later.
    DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
    // Label lines have no encoding bytes; keep HexLines parallel to
    // DisasmLines with an empty entry.
    HexLines.push_back("");
  }
  AsmPrinter::EmitBasicBlockStart(MBB);
}
296 :
297 319 : void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
298 :
299 : // Group segment variables aren't emitted in HSA.
300 319 : if (AMDGPU::isGroupSegment(GV))
301 : return;
302 :
303 96 : AsmPrinter::EmitGlobalVariable(GV);
304 : }
305 :
306 1940 : bool AMDGPUAsmPrinter::doFinalization(Module &M) {
307 1940 : CallGraphResourceInfo.clear();
308 1940 : return AsmPrinter::doFinalization(M);
309 : }
310 :
311 : // For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
312 : // frontend into our PALMetadataMap, ready for per-function modification. It
313 : // is a NamedMD containing an MDTuple containing a number of MDNodes each of
314 : // which is an integer value, and each two integer values forms a key=value
315 : // pair that we store as PALMetadataMap[key]=value in the map.
316 45 : void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
317 45 : auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
318 45 : if (!NamedMD || !NamedMD->getNumOperands())
319 42 : return;
320 3 : auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
321 : if (!Tuple)
322 : return;
323 9 : for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
324 6 : auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
325 6 : auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
326 6 : if (!Key || !Val)
327 : continue;
328 6 : PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
329 : }
330 : }
331 :
// Print comments that apply to both callable functions and entry points:
// code size, register counts, scratch usage, and the memory-bound hint.
// Emitted into the .AMDGPU.csdata comment section by the caller.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
  uint32_t NumVGPR,
  uint32_t NumSGPR,
  uint64_t ScratchSize,
  uint64_t CodeSize,
  const AMDGPUMachineFunction *MFI) {
  OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
  OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
  OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
  OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
                              false);
}
346 :
347 38 : uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
348 : const MachineFunction &MF) const {
349 : const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
350 : uint16_t KernelCodeProperties = 0;
351 :
352 38 : if (MFI.hasPrivateSegmentBuffer()) {
353 : KernelCodeProperties |=
354 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
355 : }
356 38 : if (MFI.hasDispatchPtr()) {
357 0 : KernelCodeProperties |=
358 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
359 : }
360 38 : if (MFI.hasQueuePtr()) {
361 0 : KernelCodeProperties |=
362 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
363 : }
364 38 : if (MFI.hasKernargSegmentPtr()) {
365 4 : KernelCodeProperties |=
366 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
367 : }
368 38 : if (MFI.hasDispatchID()) {
369 0 : KernelCodeProperties |=
370 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
371 : }
372 38 : if (MFI.hasFlatScratchInit()) {
373 0 : KernelCodeProperties |=
374 : amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
375 : }
376 :
377 38 : return KernelCodeProperties;
378 : }
379 :
// Build the code-object-v3 kernel descriptor for \p MF from the computed
// program info. The descriptor is later emitted byte-for-byte, so memset is
// used (rather than value-init) to guarantee the reserved/padding bytes are
// zero as well.
amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
    const MachineFunction &MF,
    const SIProgramInfo &PI) const {
  amdhsa::kernel_descriptor_t KernelDescriptor;
  memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));

  // The descriptor fields are 32-bit; the program info values must fit.
  assert(isUInt<32>(PI.ScratchSize));
  assert(isUInt<32>(PI.ComputePGMRSrc1));
  assert(isUInt<32>(PI.ComputePGMRSrc2));

  KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
  KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
  KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
  KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
  KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);

  return KernelDescriptor;
}
398 :
// Main per-function driver: computes resource usage, emits the config
// section (non-HSA/PAL), the function body, verbose resource comments into
// .AMDGPU.csdata, and the optional -dump-code disassembly section.
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  // Reset per-function program info before recomputing it below.
  CurrentProgramInfo = SIProgramInfo();

  const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

  // The starting address of all shader programs must be 256 bytes aligned.
  // Regular functions just need the basic required instruction alignment.
  MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);

  SetupMachineFunction(MF);

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  MCContext &Context = getObjFileLowering().getContext();
  // FIXME: This should be an explicit check for Mesa.
  if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
    MCSectionELF *ConfigSection =
        Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(ConfigSection);
  }

  if (MFI->isEntryFunction()) {
    getSIProgramInfo(CurrentProgramInfo, MF);
  } else {
    // Callable functions: record resource usage in the call-graph map so
    // callers can accumulate it.
    auto I = CallGraphResourceInfo.insert(
      std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = I.first->second;
    assert(I.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF);
  }

  if (STM.isAmdPalOS())
    EmitPALMetadata(MF, CurrentProgramInfo);
  else if (!STM.isAmdHsaOS()) {
    EmitProgramInfoSI(MF, CurrentProgramInfo);
  }

  // Reset -dump-code state; EmitFunctionBody repopulates it via the
  // entry-label/basic-block hooks.
  DisasmLines.clear();
  HexLines.clear();
  DisasmLineMaxLen = 0;

  EmitFunctionBody();

  if (isVerbose()) {
    MCSectionELF *CommentSection =
        Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
    OutStreamer->SwitchSection(CommentSection);

    if (!MFI->isEntryFunction()) {
      // Callable function: only the common comments, from the call-graph map.
      OutStreamer->emitRawComment(" Function info:", false);
      SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
      emitCommonFunctionComments(
        Info.NumVGPR,
        Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
        Info.PrivateSegmentSize,
        getFunctionCodeSize(MF), MFI);
      return false;
    }

    // Entry function (kernel): common comments plus the full program info.
    OutStreamer->emitRawComment(" Kernel info:", false);
    emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                               CurrentProgramInfo.NumSGPR,
                               CurrentProgramInfo.ScratchSize,
                               getFunctionCodeSize(MF), MFI);

    OutStreamer->emitRawComment(
      " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
    OutStreamer->emitRawComment(
      " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
    OutStreamer->emitRawComment(
      " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
      " bytes/workgroup (compile time only)", false);

    OutStreamer->emitRawComment(
      " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
    OutStreamer->emitRawComment(
      " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

    OutStreamer->emitRawComment(
      " NumSGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
    OutStreamer->emitRawComment(
      " NumVGPRsForWavesPerEU: " +
      Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

    OutStreamer->emitRawComment(
      " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);

    if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
      OutStreamer->emitRawComment(
        " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
      OutStreamer->emitRawComment(
        " DebuggerPrivateSegmentBufferSGPR: s" +
        Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
    }

    // Decoded COMPUTE_PGM_RSRC2 fields, for debugging register setup.
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:USER_SGPR: " +
      Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
      Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
      Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
      Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
      Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
    OutStreamer->emitRawComment(
      " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
      Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
      false);
  }

  if (STM.dumpCode()) {

    OutStreamer->SwitchSection(
        Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));

    // Emit each recorded disassembly line, with its encoding bytes
    // right-aligned past the widest text line.
    for (size_t i = 0; i < DisasmLines.size(); ++i) {
      std::string Comment = "\n";
      if (!HexLines[i].empty()) {
        Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
        Comment += " ; " + HexLines[i] + "\n";
      }

      OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
      OutStreamer->EmitBytes(StringRef(Comment));
    }
  }

  return false;
}
535 :
536 19361 : uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
537 19361 : const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
538 19361 : const SIInstrInfo *TII = STM.getInstrInfo();
539 :
540 : uint64_t CodeSize = 0;
541 :
542 41080 : for (const MachineBasicBlock &MBB : MF) {
543 377725 : for (const MachineInstr &MI : MBB) {
544 : // TODO: CodeSize should account for multiple functions.
545 :
546 : // TODO: Should we count size of debug info?
547 : if (MI.isDebugInstr())
548 : continue;
549 :
550 355985 : CodeSize += TII->getInstSizeInBytes(MI);
551 : }
552 : }
553 :
554 19361 : return CodeSize;
555 : }
556 :
557 0 : static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
558 : const SIInstrInfo &TII,
559 : unsigned Reg) {
560 23082 : for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
561 10179 : if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
562 0 : return true;
563 : }
564 :
565 : return false;
566 : }
567 :
568 1756 : int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
569 : const GCNSubtarget &ST) const {
570 7024 : return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
571 1756 : UsesVCC, UsesFlatScratch);
572 : }
573 :
// Compute register / scratch / VCC / flat-scratch usage for \p MF. For leaf
// functions this reads MachineRegisterInfo directly; for functions with
// calls it scans every operand and folds in each callee's recorded usage
// (or a conservative guess for external callees).
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
  const MachineFunction &MF) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  Info.PrivateSegmentSize = FrameInfo.getStackSize();
  // Realignment may need up to the max alignment of extra scratch.
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();


  Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
                 MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    // Scan from the top so the first used register is the highest one.
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get the
    // number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
      TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  // Function with calls: scan every operand to find the highest register
  // touched, and accumulate callee usage at each call site.
  int32_t MaxVGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;

        if (!MO.isReg())
          continue;

        unsigned Reg = MO.getReg();
        switch (Reg) {
        // Always-available status/mode registers don't count toward the
        // allocated SGPR/VGPR totals.
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
          continue;

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr());
          continue;

        // VCC and flat scratch are tracked as flags, not as counted SGPRs.
        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        default:
          break;
        }

        // Classify the register as SGPR or VGPR and record how many
        // consecutive 32-bit registers it spans.
        if (AMDGPU::SReg_32RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else {
          llvm_unreachable("Unknown register class");
        }
        // Track the highest 32-bit lane index each operand reaches.
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp
          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
        if (Callee->isDeclaration()) {
          // If this is a call to an external function, we can't do much. Make
          // conservative guesses.

          // 48 SGPRs - vcc, - flat_scr, -xnack
          int MaxSGPRGuess =
            47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
                                           ST.hasFlatAddressSpace());
          MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
          MaxVGPR = std::max(MaxVGPR, 23);

          CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          auto I = CallGraphResourceInfo.find(Callee);
          assert(I != CallGraphResourceInfo.end() &&
                 "callee should have been handled before caller");

          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          CalleeFrameSize
            = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
        }

        // Conservatively assume recursion unless the callee is proven
        // non-recursive.
        if (!Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  // Highest index + 1 gives the count; callee frames stack on top of ours.
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
805 :
/// Fill \p ProgInfo with the program resource requirements (register counts,
/// scratch/LDS sizes and the packed COMPUTE_PGM_RSRC1/2 hardware register
/// values) for machine function \p MF, diagnosing any exceeded hardware
/// resource limits along the way.
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
                                        const MachineFunction &MF) {
  // Cumulative usage over MF and (via CallGraphResourceInfo) its callees.
  SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

  ProgInfo.NumVGPR = Info.NumVGPR;
  ProgInfo.NumSGPR = Info.NumExplicitSGPR;
  ProgInfo.ScratchSize = Info.PrivateSegmentSize;
  ProgInfo.VCCUsed = Info.UsesVCC;
  ProgInfo.FlatUsed = Info.UsesFlatScratch;
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  // The hardware scratch-size field is 32 bits; diagnose overflow rather
  // than silently truncating.
  if (!isUInt<32>(ProgInfo.ScratchSize)) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize, DS_Error);
    MF.getFunction().getContext().diagnose(DiagStackSize);
  }

  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = STM.getInstrInfo();
  const SIRegisterInfo *RI = &TII->getRegisterInfo();

  // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
  // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
  // unified.
  unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
      getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);

  // Check the addressable register limit before we add ExtraSGPRs.
  if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      !STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "addressable scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      // Clamp so the rest of the calculation stays in range.
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
    }
  }

  // Account for extra SGPRs and VGPRs reserved for debugger use.
  ProgInfo.NumSGPR += ExtraSGPRs;

  // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
  // dispatch registers are function args.
  unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
  for (auto &Arg : MF.getFunction().args()) {
    // Round the argument's bit width up to whole 32-bit registers.
    // NOTE(review): getPrimitiveSizeInBits() is 0 for non-primitive types
    // such as pointers, so pointer args contribute no registers here —
    // presumably an undercount; confirm whether DataLayout-based sizing is
    // intended.
    unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
    // 'inreg' arguments arrive in SGPRs; everything else in VGPRs.
    if (Arg.hasAttribute(Attribute::InReg))
      WaveDispatchNumSGPR += NumRegs;
    else
      WaveDispatchNumVGPR += NumRegs;
  }
  ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
  ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);

  // Adjust number of registers used to meet default/requested minimum/maximum
  // number of waves per execution unit request.
  ProgInfo.NumSGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
  ProgInfo.NumVGPRsForWavesPerEU = std::max(
    std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));

  // On older generations (or with the SGPR-init bug) re-check the limit
  // after ExtraSGPRs were added.
  if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
      STM.hasSGPRInitBug()) {
    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
      // This can happen due to a compiler bug or when using inline asm to use
      // the registers which are usually reserved for vcc etc.
      LLVMContext &Ctx = MF.getFunction().getContext();
      DiagnosticInfoResourceLimit Diag(MF.getFunction(),
                                       "scalar registers",
                                       ProgInfo.NumSGPR, DS_Error,
                                       DK_ResourceLimit,
                                       MaxAddressableNumSGPRs);
      Ctx.diagnose(Diag);
      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
    }
  }

  // Hardware with the SGPR-init bug must always report the fixed count.
  if (STM.hasSGPRInitBug()) {
    ProgInfo.NumSGPR =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
    ProgInfo.NumSGPRsForWavesPerEU =
        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
  }

  if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
                                     MFI->getNumUserSGPRs(), DS_Error);
    Ctx.diagnose(Diag);
  }

  if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
    LLVMContext &Ctx = MF.getFunction().getContext();
    DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
                                     MFI->getLDSSize(), DS_Error);
    Ctx.diagnose(Diag);
  }

  // Convert register counts to the block granularity the hardware encodes.
  ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
      getSTI(), ProgInfo.NumSGPRsForWavesPerEU);
  ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
      getSTI(), ProgInfo.NumVGPRsForWavesPerEU);

  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
  // attribute was requested.
  if (STM.debuggerEmitPrologue()) {
    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
        RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
        RI->getHWRegIndex(MFI->getScratchRSrcReg());
  }

  // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
  // register.
  ProgInfo.FloatMode = getFPMode(MF);

  ProgInfo.IEEEMode = STM.enableIEEEBit(MF);

  // Make clamp modifier on NaN input returns 0.
  ProgInfo.DX10Clamp = STM.enableDX10Clamp();

  unsigned LDSAlignShift;
  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
    // LDS is allocated in 64 dword blocks.
    LDSAlignShift = 8;
  } else {
    // LDS is allocated in 128 dword blocks.
    LDSAlignShift = 9;
  }

  // LDS spill space is per-wave; scale by the maximum work-group size.
  unsigned LDSSpillSize =
    MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();

  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is allocated in 256 dword blocks.
  unsigned ScratchAlignShift = 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave. ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
  ProgInfo.ScratchBlocks =
      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
              1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  // Pack the COMPUTE_PGM_RSRC1 hardware register fields.
  ProgInfo.ComputePGMRSrc1 =
      S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
      S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
      S_00B848_PRIORITY(ProgInfo.Priority) |
      S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
      S_00B848_PRIV(ProgInfo.Priv) |
      S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
      S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
      S_00B848_IEEE_MODE(ProgInfo.IEEEMode);

  // 0 = X, 1 = XY, 2 = XYZ
  unsigned TIDIGCompCnt = 0;
  if (MFI->hasWorkItemIDZ())
    TIDIGCompCnt = 2;
  else if (MFI->hasWorkItemIDY())
    TIDIGCompCnt = 1;

  // Pack the COMPUTE_PGM_RSRC2 hardware register fields.
  ProgInfo.ComputePGMRSrc2 =
      S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
      S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
      // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
      S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
      S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
      S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
      S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
      S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
      S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
      S_00B84C_EXCP_EN_MSB(0) |
      // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
      S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
      S_00B84C_EXCP_EN(0);
}
995 :
996 : static unsigned getRsrcReg(CallingConv::ID CallConv) {
997 : switch (CallConv) {
998 : default: LLVM_FALLTHROUGH;
999 : case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
1000 : case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
1001 : case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
1002 : case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
1003 : case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
1004 : case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
1005 : case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
1006 : }
1007 : }
1008 :
1009 16802 : void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
1010 : const SIProgramInfo &CurrentProgramInfo) {
1011 16802 : const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1012 : const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1013 16802 : unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
1014 :
1015 16802 : if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
1016 15142 : OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
1017 :
1018 15142 : OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
1019 :
1020 15142 : OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
1021 15142 : OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
1022 :
1023 15142 : OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
1024 15142 : OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
1025 :
1026 : // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
1027 : // 0" comment but I don't see a corresponding field in the register spec.
1028 : } else {
1029 1660 : OutStreamer->EmitIntValue(RsrcReg, 4);
1030 3320 : OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
1031 1660 : S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
1032 1660 : if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
1033 34 : OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
1034 34 : OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
1035 : }
1036 : }
1037 :
1038 33604 : if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
1039 1416 : OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
1040 1416 : OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
1041 1416 : OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
1042 1416 : OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
1043 1416 : OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
1044 1416 : OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
1045 : }
1046 :
1047 16802 : OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
1048 16802 : OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
1049 16802 : OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
1050 16802 : OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
1051 16802 : }
1052 :
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
// is AMDPAL. It stores each compute/SPI register setting and other PAL
// metadata items into the PALMetadataMap, combining with any provided by the
// frontend as LLVM metadata. Once all functions are written, PALMetadataMap is
// then written as a single block in the .note section.
void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
       const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Given the calling convention, calculate the register number for rsrc1. In
  // principle the register number could change in future hardware, but we know
  // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
  // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
  // that we use a register number rather than a byte offset, so we need to
  // divide by 4.
  unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
  unsigned Rsrc2Reg = Rsrc1Reg + 1;
  // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
  // with a constant offset to access any non-register shader-specific PAL
  // metadata key.
  // The default (no matching case below) is the compute key.
  unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_VS:
    ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_GS:
    ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_ES:
    ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_HS:
    ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
    break;
  case CallingConv::AMDGPU_LS:
    ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
    break;
  }
  // Derive the per-shader-stage NUM_USED_* keys from the scratch-size key by
  // applying the same constant offset used in the VS key block.
  unsigned NumUsedVgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  unsigned NumUsedSgprsKey = ScratchSizeKey +
      PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
  PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
  PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
  // Values are OR-ed into the map so frontend-provided metadata is combined
  // rather than overwritten.
  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  } else {
    PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
        S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    if (CurrentProgramInfo.ScratchBlocks > 0)
      PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
    // ScratchSize is in bytes, 16 aligned.
    PALMetadataMap[ScratchSizeKey] |=
        alignTo(CurrentProgramInfo.ScratchSize, 16);
  }
  // Pixel shaders additionally record LDS size and input masks.
  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    PALMetadataMap[Rsrc2Reg] |=
        S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
  }
}
1121 :
1122 : // This is supposed to be log2(Size)
1123 : static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
1124 2526 : switch (Size) {
1125 : case 4:
1126 : return AMD_ELEMENT_4_BYTES;
1127 5 : case 8:
1128 : return AMD_ELEMENT_8_BYTES;
1129 67 : case 16:
1130 : return AMD_ELEMENT_16_BYTES;
1131 0 : default:
1132 0 : llvm_unreachable("invalid private_element_size");
1133 : }
1134 : }
1135 :
1136 2526 : void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
1137 : const SIProgramInfo &CurrentProgramInfo,
1138 : const MachineFunction &MF) const {
1139 2526 : const Function &F = MF.getFunction();
1140 : assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
1141 : F.getCallingConv() == CallingConv::SPIR_KERNEL);
1142 :
1143 : const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1144 2526 : const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1145 :
1146 2526 : AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
1147 :
1148 2526 : Out.compute_pgm_resource_registers =
1149 5052 : CurrentProgramInfo.ComputePGMRSrc1 |
1150 2526 : (CurrentProgramInfo.ComputePGMRSrc2 << 32);
1151 2526 : Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
1152 :
1153 2526 : if (CurrentProgramInfo.DynamicCallStack)
1154 241 : Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
1155 :
1156 2598 : AMD_HSA_BITS_SET(Out.code_properties,
1157 : AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1158 : getElementByteSizeValue(STM.getMaxPrivateElementSize()));
1159 :
1160 2526 : if (MFI->hasPrivateSegmentBuffer()) {
1161 2526 : Out.code_properties |=
1162 : AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
1163 : }
1164 :
1165 2526 : if (MFI->hasDispatchPtr())
1166 42 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1167 :
1168 2526 : if (MFI->hasQueuePtr())
1169 57 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
1170 :
1171 2526 : if (MFI->hasKernargSegmentPtr())
1172 2161 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
1173 :
1174 2526 : if (MFI->hasDispatchID())
1175 5 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
1176 :
1177 2526 : if (MFI->hasFlatScratchInit())
1178 381 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
1179 :
1180 2526 : if (MFI->hasDispatchPtr())
1181 42 : Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
1182 :
1183 : if (STM.debuggerSupported())
1184 3 : Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
1185 :
1186 2526 : if (STM.isXNACKEnabled())
1187 42 : Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
1188 :
1189 : unsigned MaxKernArgAlign;
1190 2526 : Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
1191 2526 : Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
1192 2526 : Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
1193 2526 : Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
1194 2526 : Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
1195 :
1196 : // These alignment values are specified in powers of two, so alignment =
1197 : // 2^n. The minimum alignment is 2^4 = 16.
1198 5052 : Out.kernarg_segment_alignment = std::max((size_t)4,
1199 5052 : countTrailingZeros(MaxKernArgAlign));
1200 :
1201 2526 : if (STM.debuggerEmitPrologue()) {
1202 4 : Out.debug_wavefront_private_segment_offset_sgpr =
1203 4 : CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
1204 4 : Out.debug_private_segment_buffer_sgpr =
1205 4 : CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
1206 : }
1207 2526 : }
1208 :
1209 680 : bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
1210 : unsigned AsmVariant,
1211 : const char *ExtraCode, raw_ostream &O) {
1212 : // First try the generic code, which knows about modifiers like 'c' and 'n'.
1213 680 : if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
1214 : return false;
1215 :
1216 676 : if (ExtraCode && ExtraCode[0]) {
1217 0 : if (ExtraCode[1] != 0)
1218 : return true; // Unknown modifier.
1219 :
1220 0 : switch (ExtraCode[0]) {
1221 : case 'r':
1222 : break;
1223 : default:
1224 : return true;
1225 : }
1226 : }
1227 :
1228 : // TODO: Should be able to support other operand types like globals.
1229 676 : const MachineOperand &MO = MI->getOperand(OpNo);
1230 676 : if (MO.isReg()) {
1231 676 : AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
1232 676 : *MF->getSubtarget().getRegisterInfo());
1233 676 : return false;
1234 : }
1235 :
1236 : return true;
1237 : }
|