LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUAsmPrinter.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-10-20 13:21:21

                  Hit    Total    Coverage
Lines:            559      577      96.9 %
Functions:         27       29      93.1 %

Legend: Lines: hit | not hit

          Line data    Source code
       1             : //===-- AMDGPUAsmPrinter.cpp - AMDGPU assembly printer  -------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : ///
       12             : /// The AMDGPUAsmPrinter is used to print both an assembly string and binary
       13             : /// code.  When passed an MCAsmStreamer it prints assembly, and when passed
       14             : /// an MCObjectStreamer it outputs binary code.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : //
      18             : 
      19             : #include "AMDGPUAsmPrinter.h"
      20             : #include "AMDGPU.h"
      21             : #include "AMDGPUSubtarget.h"
      22             : #include "AMDGPUTargetMachine.h"
      23             : #include "InstPrinter/AMDGPUInstPrinter.h"
      24             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      25             : #include "MCTargetDesc/AMDGPUTargetStreamer.h"
      26             : #include "R600AsmPrinter.h"
      27             : #include "R600Defines.h"
      28             : #include "R600MachineFunctionInfo.h"
      29             : #include "R600RegisterInfo.h"
      30             : #include "SIDefines.h"
      31             : #include "SIInstrInfo.h"
      32             : #include "SIMachineFunctionInfo.h"
      33             : #include "SIRegisterInfo.h"
      34             : #include "Utils/AMDGPUBaseInfo.h"
      35             : #include "llvm/BinaryFormat/ELF.h"
      36             : #include "llvm/CodeGen/MachineFrameInfo.h"
      37             : #include "llvm/IR/DiagnosticInfo.h"
      38             : #include "llvm/MC/MCContext.h"
      39             : #include "llvm/MC/MCSectionELF.h"
      40             : #include "llvm/MC/MCStreamer.h"
      41             : #include "llvm/Support/AMDGPUMetadata.h"
      42             : #include "llvm/Support/MathExtras.h"
      43             : #include "llvm/Support/TargetParser.h"
      44             : #include "llvm/Support/TargetRegistry.h"
      45             : #include "llvm/Target/TargetLoweringObjectFile.h"
      46             : 
      47             : using namespace llvm;
      48             : using namespace llvm::AMDGPU;
      49             : 
      50             : // TODO: This should get the default rounding mode from the kernel. We just set
      51             : // the default here, but this could change if the OpenCL rounding mode pragmas
      52             : // are used.
      53             : //
      54             : // The denormal mode here should match what is reported by the OpenCL runtime
      55             : // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
       56             : // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
      57             : //
      58             : // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
      59             : // precision, and leaves single precision to flush all and does not report
      60             : // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
      61             : // CL_FP_DENORM for both.
      62             : //
      63             : // FIXME: It seems some instructions do not support single precision denormals
      64             : // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
      65             : // and sin_f32, cos_f32 on most parts).
      66             : 
      67             : // We want to use these instructions, and using fp32 denormals also causes
       68             : // instructions to run at the double precision rate for the device, so it's
      69             : // probably best to just report no single precision denormals.
      70             : static uint32_t getFPMode(const MachineFunction &F) {
      71       17960 :   const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
      72             :   // TODO: Is there any real use for the flush in only / flush out only modes?
      73             : 
      74             :   uint32_t FP32Denormals =
      75       17960 :     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
      76             : 
      77             :   uint32_t FP64Denormals =
      78       17960 :     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
      79             : 
      80             :   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
      81             :          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
      82       17960 :          FP_DENORM_MODE_SP(FP32Denormals) |
      83       17960 :          FP_DENORM_MODE_DP(FP64Denormals);
      84             : }
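                      : // Illustration: on a subtarget with fp64 denormals enabled and fp32
                      : // denormals disabled, this returns round-to-nearest for both precisions,
                      : // FP_DENORM_FLUSH_IN_FLUSH_OUT for single precision and FP_DENORM_FLUSH_NONE
                      : // for double precision; the resulting word becomes ProgInfo.FloatMode and is
                      : // packed into ComputePGMRSrc1 via S_00B848_FLOAT_MODE (see getSIProgramInfo
                      : // below).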
      85             : 
      86             : static AsmPrinter *
      87        1950 : createAMDGPUAsmPrinterPass(TargetMachine &tm,
      88             :                            std::unique_ptr<MCStreamer> &&Streamer) {
      89        1950 :   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
      90             : }
      91             : 
      92       65841 : extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
      93       65841 :   TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
      94             :                                      llvm::createR600AsmPrinterPass);
      95       65841 :   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
      96             :                                      createAMDGPUAsmPrinterPass);
      97       65841 : }
      98             : 
      99        1950 : AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
     100        1950 :                                    std::unique_ptr<MCStreamer> Streamer)
     101        7800 :   : AsmPrinter(TM, std::move(Streamer)) {
     102        1950 : }
     103             : 
     104           0 : StringRef AMDGPUAsmPrinter::getPassName() const {
     105           0 :   return "AMDGPU Assembly Printer";
     106             : }
     107             : 
     108      100767 : const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
     109      201534 :   return TM.getMCSubtargetInfo();
     110             : }
     111             : 
     112       10035 : AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
     113       10035 :   if (!OutStreamer)
     114             :     return nullptr;
     115       10035 :   return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
     116             : }
     117             : 
     118        1949 : void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
     119        1949 :   if (IsaInfo::hasCodeObjectV3(getSTI())) {
     120             :     std::string ExpectedTarget;
     121           0 :     raw_string_ostream ExpectedTargetOS(ExpectedTarget);
     122          36 :     IsaInfo::streamIsaVersion(getSTI(), ExpectedTargetOS);
     123             : 
     124          36 :     getTargetStreamer()->EmitDirectiveAMDGCNTarget(ExpectedTarget);
     125             : 
     126          36 :     if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     127          36 :       return;
     128             :   }
     129             : 
     130        1913 :   if (TM.getTargetTriple().getOS() != Triple::AMDHSA &&
     131             :       TM.getTargetTriple().getOS() != Triple::AMDPAL)
     132             :     return;
     133             : 
     134         369 :   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     135         324 :     HSAMetadataStream.begin(M);
     136             : 
     137         369 :   if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
     138          45 :     readPALMetadata(M);
     139             : 
     140             :   // HSA emits NT_AMDGPU_HSA_CODE_OBJECT_VERSION for code objects v2.
     141         369 :   if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
     142         324 :     getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
     143             : 
     144             :   // HSA and PAL emit NT_AMDGPU_HSA_ISA for code objects v2.
     145         369 :   IsaVersion Version = getIsaVersion(getSTI()->getCPU());
     146         369 :   getTargetStreamer()->EmitDirectiveHSACodeObjectISA(
     147         369 :       Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
     148             : }
     149             : 
     150        1940 : void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
     151             :   // TODO: Add metadata to code object v3.
     152        1940 :   if (IsaInfo::hasCodeObjectV3(getSTI()) &&
     153          36 :       TM.getTargetTriple().getOS() == Triple::AMDHSA)
     154          37 :     return;
     155             : 
     156             :   // Following code requires TargetStreamer to be present.
     157        1904 :   if (!getTargetStreamer())
     158             :     return;
     159             : 
     160             :   // Emit ISA Version (NT_AMD_AMDGPU_ISA).
     161             :   std::string ISAVersionString;
     162        1903 :   raw_string_ostream ISAVersionStream(ISAVersionString);
     163        1903 :   IsaInfo::streamIsaVersion(getSTI(), ISAVersionStream);
     164        1903 :   getTargetStreamer()->EmitISAVersion(ISAVersionStream.str());
     165             : 
     166             :   // Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
     167        1903 :   if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
     168         323 :     HSAMetadataStream.end();
     169         323 :     getTargetStreamer()->EmitHSAMetadata(HSAMetadataStream.getHSAMetadata());
     170             :   }
     171             : 
     172             :   // Emit PAL Metadata (NT_AMD_AMDGPU_PAL_METADATA).
     173        1903 :   if (TM.getTargetTriple().getOS() == Triple::AMDPAL) {
     174             :     // Copy the PAL metadata from the map where we collected it into a vector,
     175             :     // then write it as a .note.
     176             :     PALMD::Metadata PALMetadataVector;
     177         279 :     for (auto i : PALMetadataMap) {
     178         234 :       PALMetadataVector.push_back(i.first);
     179         234 :       PALMetadataVector.push_back(i.second);
     180             :     }
     181          45 :     getTargetStreamer()->EmitPALMetadata(PALMetadataVector);
     182             :   }
     183             : }
     184             : 
     185        7349 : bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
     186             :   const MachineBasicBlock *MBB) const {
     187        7349 :   if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
     188             :     return false;
     189             : 
     190        1950 :   if (MBB->empty())
     191             :     return true;
     192             : 
     193             :   // If this is a block implementing a long branch, an expression relative to
      194             :   // the start of the block is needed.
     195             :   // XXX - Is there a smarter way to check this?
     196        3858 :   return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
     197             : }
     198             : 
     199       19722 : void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
     200       19722 :   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
     201       19722 :   if (!MFI.isEntryFunction())
     202             :     return;
     203             : 
     204       17960 :   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
     205       17960 :   const Function &F = MF->getFunction();
     206       17960 :   if (!STM.hasCodeObjectV3() && STM.isAmdHsaOrMesa(F) &&
     207           3 :       (F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
     208             :        F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
     209             :     amd_kernel_code_t KernelCode;
     210        2526 :     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
     211        2526 :     getTargetStreamer()->EmitAMDKernelCodeT(KernelCode);
     212             :   }
     213             : 
     214       17960 :   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     215             :     return;
     216             : 
     217        2475 :   if (!STM.hasCodeObjectV3() && STM.isAmdHsaOS())
     218        2437 :     HSAMetadataStream.emitKernel(*MF, CurrentProgramInfo);
     219             : }
     220             : 
     221       19722 : void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
     222       19722 :   const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
     223       19722 :   if (!MFI.isEntryFunction())
     224       19684 :     return;
     225       17960 :   if (!IsaInfo::hasCodeObjectV3(getSTI()) ||
     226          38 :       TM.getTargetTriple().getOS() != Triple::AMDHSA)
     227             :     return;
     228             : 
     229          38 :   auto &Streamer = getTargetStreamer()->getStreamer();
     230          38 :   auto &Context = Streamer.getContext();
     231          38 :   auto &ObjectFileInfo = *Context.getObjectFileInfo();
     232          38 :   auto &ReadOnlySection = *ObjectFileInfo.getReadOnlySection();
     233             : 
     234          38 :   Streamer.PushSection();
     235          38 :   Streamer.SwitchSection(&ReadOnlySection);
     236             : 
      237             :   // CP microcode requires the kernel descriptor to be allocated with 64-byte
     238             :   // alignment.
     239          38 :   Streamer.EmitValueToAlignment(64, 0, 1, 0);
     240          38 :   if (ReadOnlySection.getAlignment() < 64)
     241             :     ReadOnlySection.setAlignment(64);
     242             : 
     243             :   SmallString<128> KernelName;
     244          38 :   getNameWithPrefix(KernelName, &MF->getFunction());
     245          76 :   getTargetStreamer()->EmitAmdhsaKernelDescriptor(
     246          38 :       *getSTI(), KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
     247          38 :       CurrentProgramInfo.NumVGPRsForWavesPerEU,
     248          38 :       CurrentProgramInfo.NumSGPRsForWavesPerEU -
     249          38 :           IsaInfo::getNumExtraSGPRs(getSTI(),
     250             :                                     CurrentProgramInfo.VCCUsed,
     251             :                                     CurrentProgramInfo.FlatUsed),
     252          38 :       CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
     253          38 :       hasXNACK(*getSTI()));
     254             : 
     255          38 :   Streamer.PopSection();
     256             : }
     257             : 
     258       19722 : void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
     259       19722 :   if (IsaInfo::hasCodeObjectV3(getSTI()) &&
     260          38 :       TM.getTargetTriple().getOS() == Triple::AMDHSA) {
     261          38 :     AsmPrinter::EmitFunctionEntryLabel();
     262          38 :     return;
     263             :   }
     264             : 
     265       19684 :   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     266       19684 :   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
     267       19684 :   if (MFI->isEntryFunction() && STM.isAmdHsaOrMesa(MF->getFunction())) {
     268             :     SmallString<128> SymbolName;
     269        2529 :     getNameWithPrefix(SymbolName, &MF->getFunction()),
     270        2529 :     getTargetStreamer()->EmitAMDGPUSymbolType(
     271        2529 :         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
     272             :   }
     273       19684 :   const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
     274       19684 :   if (STI.dumpCode()) {
      275             :     // Write the function name label into the disassembly text.
     276           6 :     DisasmLines.push_back(MF->getName().str() + ":");
     277           2 :     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
     278           4 :     HexLines.push_back("");
     279             :   }
     280             : 
     281       19684 :   AsmPrinter::EmitFunctionEntryLabel();
     282             : }
     283             : 
     284       22171 : void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
     285       22171 :   const GCNSubtarget &STI = MBB.getParent()->getSubtarget<GCNSubtarget>();
     286       22171 :   if (STI.dumpCode() && !isBlockOnlyReachableByFallthrough(&MBB)) {
     287             :     // Write a line for the basic block label if it is not only fallthrough.
     288           2 :     DisasmLines.push_back(
     289           2 :         (Twine("BB") + Twine(getFunctionNumber())
     290           6 :          + "_" + Twine(MBB.getNumber()) + ":").str());
     291           2 :     DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
     292           4 :     HexLines.push_back("");
     293             :   }
     294       22171 :   AsmPrinter::EmitBasicBlockStart(MBB);
     295       22171 : }
     296             : 
     297         319 : void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     298             : 
     299             :   // Group segment variables aren't emitted in HSA.
     300         319 :   if (AMDGPU::isGroupSegment(GV))
     301             :     return;
     302             : 
     303          96 :   AsmPrinter::EmitGlobalVariable(GV);
     304             : }
     305             : 
     306        1940 : bool AMDGPUAsmPrinter::doFinalization(Module &M) {
     307        1940 :   CallGraphResourceInfo.clear();
     308        1940 :   return AsmPrinter::doFinalization(M);
     309             : }
     310             : 
      311             : // For the amdpal OS type, read the amdgpu.pal.metadata supplied by the
      312             : // frontend into our PALMetadataMap, ready for per-function modification.  It
      313             : // is a NamedMD containing an MDTuple whose operands are integer-valued
      314             : // MDNodes; each consecutive pair of integers forms a key=value pair that we
      315             : // store as PALMetadataMap[key]=value in the map.
     316          45 : void AMDGPUAsmPrinter::readPALMetadata(Module &M) {
     317          45 :   auto NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
     318          45 :   if (!NamedMD || !NamedMD->getNumOperands())
     319          42 :     return;
     320           3 :   auto Tuple = dyn_cast<MDTuple>(NamedMD->getOperand(0));
     321             :   if (!Tuple)
     322             :     return;
     323           9 :   for (unsigned I = 0, E = Tuple->getNumOperands() & -2; I != E; I += 2) {
     324           6 :     auto Key = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I));
     325           6 :     auto Val = mdconst::dyn_extract<ConstantInt>(Tuple->getOperand(I + 1));
     326           6 :     if (!Key || !Val)
     327             :       continue;
     328           6 :     PALMetadataMap[Key->getZExtValue()] = Val->getZExtValue();
     329             :   }
     330             : }
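                      : // Illustration (the register key/value numbers below are hypothetical): the
                      : // frontend would supply the metadata roughly as
                      : //   !amdgpu.pal.metadata = !{!0}
                      : //   !0 = !{i32 11274, i32 16973824, i32 11275, i32 0}
                      : // which the loop above records as PALMetadataMap[11274] = 16973824 and
                      : // PALMetadataMap[11275] = 0.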
     331             : 
     332             : // Print comments that apply to both callable functions and entry points.
     333       19361 : void AMDGPUAsmPrinter::emitCommonFunctionComments(
     334             :   uint32_t NumVGPR,
     335             :   uint32_t NumSGPR,
     336             :   uint64_t ScratchSize,
     337             :   uint64_t CodeSize,
     338             :   const AMDGPUMachineFunction *MFI) {
     339       19361 :   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
     340       19361 :   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
     341       19361 :   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
     342       19361 :   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
     343       19361 :   OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
     344       19361 :                               false);
     345       19361 : }
     346             : 
     347          38 : uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
     348             :     const MachineFunction &MF) const {
     349             :   const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
     350             :   uint16_t KernelCodeProperties = 0;
     351             : 
     352          38 :   if (MFI.hasPrivateSegmentBuffer()) {
     353             :     KernelCodeProperties |=
     354             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
     355             :   }
     356          38 :   if (MFI.hasDispatchPtr()) {
     357           0 :     KernelCodeProperties |=
     358             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
     359             :   }
     360          38 :   if (MFI.hasQueuePtr()) {
     361           0 :     KernelCodeProperties |=
     362             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
     363             :   }
     364          38 :   if (MFI.hasKernargSegmentPtr()) {
     365           4 :     KernelCodeProperties |=
     366             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
     367             :   }
     368          38 :   if (MFI.hasDispatchID()) {
     369           0 :     KernelCodeProperties |=
     370             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
     371             :   }
     372          38 :   if (MFI.hasFlatScratchInit()) {
     373           0 :     KernelCodeProperties |=
     374             :         amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
     375             :   }
     376             : 
     377          38 :   return KernelCodeProperties;
     378             : }
     379             : 
     380          38 : amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
     381             :     const MachineFunction &MF,
     382             :     const SIProgramInfo &PI) const {
     383             :   amdhsa::kernel_descriptor_t KernelDescriptor;
     384          38 :   memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
     385             : 
     386             :   assert(isUInt<32>(PI.ScratchSize));
     387             :   assert(isUInt<32>(PI.ComputePGMRSrc1));
     388             :   assert(isUInt<32>(PI.ComputePGMRSrc2));
     389             : 
     390          38 :   KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
     391          38 :   KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
     392          38 :   KernelDescriptor.compute_pgm_rsrc1 = PI.ComputePGMRSrc1;
     393          38 :   KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
     394          38 :   KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
     395             : 
     396          38 :   return KernelDescriptor;
     397             : }
     398             : 
     399       19722 : bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     400       19722 :   CurrentProgramInfo = SIProgramInfo();
     401             : 
     402       19722 :   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
     403             : 
      404             :   // The starting address of all shader programs must be aligned to 256 bytes.
     405             :   // Regular functions just need the basic required instruction alignment.
     406       19722 :   MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
     407             : 
     408       19722 :   SetupMachineFunction(MF);
     409             : 
     410       19722 :   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
     411       19722 :   MCContext &Context = getObjFileLowering().getContext();
     412             :   // FIXME: This should be an explicit check for Mesa.
     413       19722 :   if (!STM.isAmdHsaOS() && !STM.isAmdPalOS()) {
     414             :     MCSectionELF *ConfigSection =
     415       16802 :         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
     416       16802 :     OutStreamer->SwitchSection(ConfigSection);
     417             :   }
     418             : 
     419       19722 :   if (MFI->isEntryFunction()) {
     420       17960 :     getSIProgramInfo(CurrentProgramInfo, MF);
     421             :   } else {
     422        1762 :     auto I = CallGraphResourceInfo.insert(
     423        1762 :       std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
     424        1762 :     SIFunctionResourceInfo &Info = I.first->second;
     425             :     assert(I.second && "should only be called once per function");
     426        1762 :     Info = analyzeResourceUsage(MF);
     427             :   }
     428             : 
     429       19722 :   if (STM.isAmdPalOS())
     430          56 :     EmitPALMetadata(MF, CurrentProgramInfo);
     431       19666 :   else if (!STM.isAmdHsaOS()) {
     432       16802 :     EmitProgramInfoSI(MF, CurrentProgramInfo);
     433             :   }
     434             : 
     435             :   DisasmLines.clear();
     436             :   HexLines.clear();
     437       19722 :   DisasmLineMaxLen = 0;
     438             : 
     439       19722 :   EmitFunctionBody();
     440             : 
     441       19722 :   if (isVerbose()) {
     442             :     MCSectionELF *CommentSection =
     443       19361 :         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
     444       19361 :     OutStreamer->SwitchSection(CommentSection);
     445             : 
     446       19361 :     if (!MFI->isEntryFunction()) {
     447        3512 :       OutStreamer->emitRawComment(" Function info:", false);
     448        1756 :       SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
     449        1756 :       emitCommonFunctionComments(
     450        1756 :         Info.NumVGPR,
     451        1756 :         Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
     452             :         Info.PrivateSegmentSize,
     453             :         getFunctionCodeSize(MF), MFI);
     454        1756 :       return false;
     455             :     }
     456             : 
     457       35210 :     OutStreamer->emitRawComment(" Kernel info:", false);
     458       17605 :     emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
     459             :                                CurrentProgramInfo.NumSGPR,
     460             :                                CurrentProgramInfo.ScratchSize,
     461             :                                getFunctionCodeSize(MF), MFI);
     462             : 
     463       17605 :     OutStreamer->emitRawComment(
     464       35210 :       " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
     465       17605 :     OutStreamer->emitRawComment(
     466       35210 :       " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
     467       17605 :     OutStreamer->emitRawComment(
     468       17605 :       " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
     469       35210 :       " bytes/workgroup (compile time only)", false);
     470             : 
     471       17605 :     OutStreamer->emitRawComment(
     472       35210 :       " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
     473       17605 :     OutStreamer->emitRawComment(
     474       35210 :       " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
     475             : 
     476       17605 :     OutStreamer->emitRawComment(
     477       17605 :       " NumSGPRsForWavesPerEU: " +
     478       35210 :       Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
     479       17605 :     OutStreamer->emitRawComment(
     480       17605 :       " NumVGPRsForWavesPerEU: " +
     481       35210 :       Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
     482             : 
     483       17605 :     OutStreamer->emitRawComment(
     484       17605 :       " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
     485             : 
     486       17605 :     if (MF.getSubtarget<GCNSubtarget>().debuggerEmitPrologue()) {
     487           1 :       OutStreamer->emitRawComment(
     488           1 :         " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
     489           2 :         Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
     490           1 :       OutStreamer->emitRawComment(
     491           1 :         " DebuggerPrivateSegmentBufferSGPR: s" +
     492           2 :         Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
     493             :     }
     494             : 
     495       17605 :     OutStreamer->emitRawComment(
     496       17605 :       " COMPUTE_PGM_RSRC2:USER_SGPR: " +
     497       35210 :       Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
     498       17605 :     OutStreamer->emitRawComment(
     499       17605 :       " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
     500       35210 :       Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
     501       17605 :     OutStreamer->emitRawComment(
     502       17605 :       " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
     503       35210 :       Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     504       17605 :     OutStreamer->emitRawComment(
     505       17605 :       " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
     506       35210 :       Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     507       17605 :     OutStreamer->emitRawComment(
     508       17605 :       " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
     509       35210 :       Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     510       17605 :     OutStreamer->emitRawComment(
     511       17605 :       " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
     512       17605 :       Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
     513       17605 :       false);
     514             :   }
     515             : 
     516       17966 :   if (STM.dumpCode()) {
     517             : 
     518           2 :     OutStreamer->SwitchSection(
     519           4 :         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
     520             : 
     521          22 :     for (size_t i = 0; i < DisasmLines.size(); ++i) {
     522          18 :       std::string Comment = "\n";
     523          36 :       if (!HexLines[i].empty()) {
     524          28 :         Comment = std::string(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
     525          42 :         Comment += " ; " + HexLines[i] + "\n";
     526             :       }
     527             : 
     528          36 :       OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
     529          36 :       OutStreamer->EmitBytes(StringRef(Comment));
     530             :     }
     531             :   }
     532             : 
     533             :   return false;
     534             : }
     535             : 
     536       19361 : uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
     537       19361 :   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
     538       19361 :   const SIInstrInfo *TII = STM.getInstrInfo();
     539             : 
     540             :   uint64_t CodeSize = 0;
     541             : 
     542       41080 :   for (const MachineBasicBlock &MBB : MF) {
     543      377725 :     for (const MachineInstr &MI : MBB) {
     544             :       // TODO: CodeSize should account for multiple functions.
     545             : 
     546             :       // TODO: Should we count size of debug info?
     547             :       if (MI.isDebugInstr())
     548             :         continue;
     549             : 
     550      355985 :       CodeSize += TII->getInstSizeInBytes(MI);
     551             :     }
     552             :   }
     553             : 
     554       19361 :   return CodeSize;
     555             : }
     556             : 
     557           0 : static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
     558             :                                   const SIInstrInfo &TII,
     559             :                                   unsigned Reg) {
     560       23082 :   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
     561       10179 :     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
     562           0 :       return true;
     563             :   }
     564             : 
     565             :   return false;
     566             : }
     567             : 
     568        1756 : int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
     569             :   const GCNSubtarget &ST) const {
     570        7024 :   return NumExplicitSGPR + IsaInfo::getNumExtraSGPRs(&ST,
     571        1756 :                                                      UsesVCC, UsesFlatScratch);
     572             : }
     573             : 
     574       19722 : AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
     575             :   const MachineFunction &MF) const {
     576       19722 :   SIFunctionResourceInfo Info;
     577             : 
     578             :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     579       19722 :   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
     580       19722 :   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
     581       19722 :   const MachineRegisterInfo &MRI = MF.getRegInfo();
     582       19722 :   const SIInstrInfo *TII = ST.getInstrInfo();
     583             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
     584             : 
     585       34729 :   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
     586       15007 :                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
     587             : 
     588             :   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
     589             :   // instructions aren't used to access the scratch buffer. Inline assembly may
     590             :   // need it though.
     591             :   //
     592             :   // If we only have implicit uses of flat_scr on flat instructions, it is not
     593             :   // really needed.
     594       24067 :   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
     595        4312 :       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
     596        4301 :        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
     597             :        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
     598        4290 :     Info.UsesFlatScratch = false;
     599             :   }
     600             : 
     601       19722 :   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
     602       19722 :   Info.PrivateSegmentSize = FrameInfo.getStackSize();
     603       19722 :   if (MFI->isStackRealigned())
     604           3 :     Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
     605             : 
     606             : 
     607       34463 :   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
     608       14741 :                  MRI.isPhysRegUsed(AMDGPU::VCC_HI);
     609             : 
     610             :   // If there are no calls, MachineRegisterInfo can tell us the used register
     611             :   // count easily.
     612             :   // A tail call isn't considered a call for MachineFrameInfo's purposes.
     613       19722 :   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
     614             :     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
     615     4839000 :     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
     616     4838184 :       if (MRI.isPhysRegUsed(Reg)) {
     617             :         HighestVGPRReg = Reg;
     618             :         break;
     619             :       }
     620             :     }
     621             : 
     622             :     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
     623     1824975 :     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
     624     1823990 :       if (MRI.isPhysRegUsed(Reg)) {
     625             :         HighestSGPRReg = Reg;
     626             :         break;
     627             :       }
     628             :     }
     629             : 
     630             :     // We found the maximum register index. They start at 0, so add one to get the
     631             :     // number of registers.
     632       19172 :     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
     633       18356 :       TRI.getHWRegIndex(HighestVGPRReg) + 1;
     634       19172 :     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
     635       18187 :       TRI.getHWRegIndex(HighestSGPRReg) + 1;
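                      :     // For example, if the highest used registers are v7 and s13, this reports
                      :     // NumVGPR = 8 and NumExplicitSGPR = 14.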
     636             : 
     637       19172 :     return Info;
     638             :   }
     639             : 
     640         550 :   int32_t MaxVGPR = -1;
     641         550 :   int32_t MaxSGPR = -1;
     642         550 :   uint64_t CalleeFrameSize = 0;
     643             : 
     644        1123 :   for (const MachineBasicBlock &MBB : MF) {
     645        9938 :     for (const MachineInstr &MI : MBB) {
     646             :       // TODO: Check regmasks? Do they occur anywhere except calls?
     647       45411 :       for (const MachineOperand &MO : MI.operands()) {
     648             :         unsigned Width = 0;
     649             :         bool IsSGPR = false;
     650             : 
     651       36046 :         if (!MO.isReg())
     652             :           continue;
     653             : 
     654       25982 :         unsigned Reg = MO.getReg();
     655       25982 :         switch (Reg) {
     656             :         case AMDGPU::EXEC:
     657             :         case AMDGPU::EXEC_LO:
     658             :         case AMDGPU::EXEC_HI:
     659             :         case AMDGPU::SCC:
     660             :         case AMDGPU::M0:
     661             :         case AMDGPU::SRC_SHARED_BASE:
     662             :         case AMDGPU::SRC_SHARED_LIMIT:
     663             :         case AMDGPU::SRC_PRIVATE_BASE:
     664             :         case AMDGPU::SRC_PRIVATE_LIMIT:
     665             :           continue;
     666             : 
     667             :         case AMDGPU::NoRegister:
     668             :           assert(MI.isDebugInstr());
     669             :           continue;
     670             : 
     671          68 :         case AMDGPU::VCC:
     672             :         case AMDGPU::VCC_LO:
     673             :         case AMDGPU::VCC_HI:
     674          68 :           Info.UsesVCC = true;
     675          68 :           continue;
     676             : 
     677             :         case AMDGPU::FLAT_SCR:
     678             :         case AMDGPU::FLAT_SCR_LO:
     679             :         case AMDGPU::FLAT_SCR_HI:
     680             :           continue;
     681             : 
     682             :         case AMDGPU::XNACK_MASK:
     683             :         case AMDGPU::XNACK_MASK_LO:
     684             :         case AMDGPU::XNACK_MASK_HI:
     685             :           llvm_unreachable("xnack_mask registers should not be used");
     686             : 
     687             :         case AMDGPU::TBA:
     688             :         case AMDGPU::TBA_LO:
     689             :         case AMDGPU::TBA_HI:
     690             :         case AMDGPU::TMA:
     691             :         case AMDGPU::TMA_LO:
     692             :         case AMDGPU::TMA_HI:
     693             :           llvm_unreachable("trap handler registers should not be used");
     694             : 
     695             :         default:
     696             :           break;
     697             :         }
     698             : 
     699       21204 :         if (AMDGPU::SReg_32RegClass.contains(Reg)) {
     700             :           assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
     701             :                  "trap handler registers should not be used");
     702             :           IsSGPR = true;
     703             :           Width = 1;
     704       12055 :         } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
     705             :           IsSGPR = false;
     706             :           Width = 1;
     707        6234 :         } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
     708             :           assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
     709             :                  "trap handler registers should not be used");
     710             :           IsSGPR = true;
     711             :           Width = 2;
     712        3133 :         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
     713             :           IsSGPR = false;
     714             :           Width = 2;
     715        2904 :         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
     716             :           IsSGPR = false;
     717             :           Width = 3;
     718        2904 :         } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
     719             :           assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
     720             :             "trap handler registers should not be used");
     721             :           IsSGPR = true;
     722             :           Width = 4;
     723         201 :         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
     724             :           IsSGPR = false;
     725             :           Width = 4;
     726           8 :         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
     727             :           assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
     728             :             "trap handler registers should not be used");
     729             :           IsSGPR = true;
     730             :           Width = 8;
     731           8 :         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
     732             :           IsSGPR = false;
     733             :           Width = 8;
     734           8 :         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
     735             :           assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
     736             :             "trap handler registers should not be used");
     737             :           IsSGPR = true;
     738             :           Width = 16;
     739           0 :         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
     740             :           IsSGPR = false;
     741             :           Width = 16;
     742             :         } else {
     743           0 :           llvm_unreachable("Unknown register class");
     744             :         }
     745             :         unsigned HWReg = TRI.getHWRegIndex(Reg);
     746       21204 :         int MaxUsed = HWReg + Width - 1;
     747       21204 :         if (IsSGPR) {
     748       28630 :           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
     749             :         } else {
     750       11196 :           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
     751             :         }
     752             :       }
     753             : 
     754        9365 :       if (MI.isCall()) {
     755             :         // Pseudo used just to encode the underlying global. Is there a better
     756             :         // way to track this?
     757             : 
     758             :         const MachineOperand *CalleeOp
     759             :           = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
     760         575 :         const Function *Callee = cast<Function>(CalleeOp->getGlobal());
     761         575 :         if (Callee->isDeclaration()) {
     762             :           // If this is a call to an external function, we can't do much. Make
     763             :           // conservative guesses.
     764             : 
      765             :           // 48 SGPRs, minus vcc, flat_scr and xnack.
     766             :           int MaxSGPRGuess =
     767         368 :               47 - IsaInfo::getNumExtraSGPRs(getSTI(), true,
     768         736 :                                              ST.hasFlatAddressSpace());
     769         368 :           MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
     770         368 :           MaxVGPR = std::max(MaxVGPR, 23);
     771             : 
     772         368 :           CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
     773         368 :           Info.UsesVCC = true;
     774         368 :           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
     775         368 :           Info.HasDynamicallySizedStack = true;
     776             :         } else {
     777             :           // We force CodeGen to run in SCC order, so the callee's register
     778             :           // usage etc. should be the cumulative usage of all callees.
     779         207 :           auto I = CallGraphResourceInfo.find(Callee);
     780             :           assert(I != CallGraphResourceInfo.end() &&
     781             :                  "callee should have been handled before caller");
     782             : 
     783         207 :           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
     784         207 :           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
     785             :           CalleeFrameSize
     786         207 :             = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
     787         207 :           Info.UsesVCC |= I->second.UsesVCC;
     788         207 :           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
     789         207 :           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
     790         207 :           Info.HasRecursion |= I->second.HasRecursion;
     791             :         }
     792             : 
     793         575 :         if (!Callee->doesNotRecurse())
     794         523 :           Info.HasRecursion = true;
     795             :       }
     796             :     }
     797             :   }
     798             : 
     799         550 :   Info.NumExplicitSGPR = MaxSGPR + 1;
     800         550 :   Info.NumVGPR = MaxVGPR + 1;
     801         550 :   Info.PrivateSegmentSize += CalleeFrameSize;
     802             : 
     803         550 :   return Info;
     804             : }
     805             : 
     806       17960 : void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     807             :                                         const MachineFunction &MF) {
     808       17960 :   SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
     809             : 
     810       17960 :   ProgInfo.NumVGPR = Info.NumVGPR;
     811       17960 :   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
     812       17960 :   ProgInfo.ScratchSize = Info.PrivateSegmentSize;
     813       17960 :   ProgInfo.VCCUsed = Info.UsesVCC;
     814       17960 :   ProgInfo.FlatUsed = Info.UsesFlatScratch;
     815       17960 :   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
     816             : 
     817       17960 :   if (!isUInt<32>(ProgInfo.ScratchSize)) {
     818             :     DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
     819           2 :                                           ProgInfo.ScratchSize, DS_Error);
     820           2 :     MF.getFunction().getContext().diagnose(DiagStackSize);
     821             :   }
     822             : 
     823       17960 :   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
     824             :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     825       17960 :   const SIInstrInfo *TII = STM.getInstrInfo();
     826             :   const SIRegisterInfo *RI = &TII->getRegisterInfo();
     827             : 
     828             :   // TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
     829             :   // duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
     830             :   // unified.
     831       17960 :   unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
     832       17960 :       getSTI(), ProgInfo.VCCUsed, ProgInfo.FlatUsed);
     833             : 
     834             :   // Check the addressable register limit before we add ExtraSGPRs.
     835       17960 :   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
     836        9929 :       !STM.hasSGPRInitBug()) {
     837             :     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
     838        5317 :     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
     839             :       // This can happen due to a compiler bug or when using inline asm.
     840           1 :       LLVMContext &Ctx = MF.getFunction().getContext();
     841             :       DiagnosticInfoResourceLimit Diag(MF.getFunction(),
     842             :                                        "addressable scalar registers",
     843           1 :                                        ProgInfo.NumSGPR, DS_Error,
     844             :                                        DK_ResourceLimit,
     845           1 :                                        MaxAddressableNumSGPRs);
     846           1 :       Ctx.diagnose(Diag);
     847           1 :       ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
     848             :     }
     849             :   }
     850             : 
      851             :   // Account for the extra SGPRs reserved for VCC, FLAT_SCRATCH and XNACK.
     852       17960 :   ProgInfo.NumSGPR += ExtraSGPRs;
     853             : 
     854             :   // Ensure there are enough SGPRs and VGPRs for wave dispatch, where wave
     855             :   // dispatch registers are function args.
     856       17960 :   unsigned WaveDispatchNumSGPR = 0, WaveDispatchNumVGPR = 0;
     857       60738 :   for (auto &Arg : MF.getFunction().args()) {
     858       42778 :     unsigned NumRegs = (Arg.getType()->getPrimitiveSizeInBits() + 31) / 32;
     859       42778 :     if (Arg.hasAttribute(Attribute::InReg))
     860        2420 :       WaveDispatchNumSGPR += NumRegs;
     861             :     else
     862       40358 :       WaveDispatchNumVGPR += NumRegs;
     863             :   }
     864       17960 :   ProgInfo.NumSGPR = std::max(ProgInfo.NumSGPR, WaveDispatchNumSGPR);
     865       17960 :   ProgInfo.NumVGPR = std::max(ProgInfo.NumVGPR, WaveDispatchNumVGPR);
     866             : 
      867             :   // Adjust the number of registers used to meet the default/requested minimum
      868             :   // and maximum number of waves per execution unit.
     869       17960 :   ProgInfo.NumSGPRsForWavesPerEU = std::max(
     870       17960 :     std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
     871       17960 :   ProgInfo.NumVGPRsForWavesPerEU = std::max(
     872       17960 :     std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
     873             : 
     874       17960 :   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
     875        9929 :       STM.hasSGPRInitBug()) {
     876             :     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
     877       12643 :     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
     878             :       // This can happen due to a compiler bug or when using inline asm to use
     879             :       // the registers which are usually reserved for vcc etc.
     880           4 :       LLVMContext &Ctx = MF.getFunction().getContext();
     881             :       DiagnosticInfoResourceLimit Diag(MF.getFunction(),
     882             :                                        "scalar registers",
     883           4 :                                        ProgInfo.NumSGPR, DS_Error,
     884             :                                        DK_ResourceLimit,
     885           4 :                                        MaxAddressableNumSGPRs);
     886           4 :       Ctx.diagnose(Diag);
     887           4 :       ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
     888           4 :       ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
     889             :     }
     890             :   }
     891             : 
     892       17960 :   if (STM.hasSGPRInitBug()) {
     893        4612 :     ProgInfo.NumSGPR =
     894             :         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
     895        4612 :     ProgInfo.NumSGPRsForWavesPerEU =
     896             :         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
     897             :   }
     898             : 
     899       17960 :   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
     900           0 :     LLVMContext &Ctx = MF.getFunction().getContext();
     901             :     DiagnosticInfoResourceLimit Diag(MF.getFunction(), "user SGPRs",
     902           0 :                                      MFI->getNumUserSGPRs(), DS_Error);
     903           0 :     Ctx.diagnose(Diag);
     904             :   }
     905             : 
     906       17960 :   if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
     907           4 :     LLVMContext &Ctx = MF.getFunction().getContext();
     908             :     DiagnosticInfoResourceLimit Diag(MF.getFunction(), "local memory",
     909           4 :                                      MFI->getLDSSize(), DS_Error);
     910           4 :     Ctx.diagnose(Diag);
     911             :   }
     912             : 
     913       17960 :   ProgInfo.SGPRBlocks = IsaInfo::getNumSGPRBlocks(
     914             :       getSTI(), ProgInfo.NumSGPRsForWavesPerEU);
     915       17960 :   ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
     916             :       getSTI(), ProgInfo.NumVGPRsForWavesPerEU);
     917             : 
     918             :   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
     919             :   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
     920             :   // attribute was requested.
     921       17960 :   if (STM.debuggerEmitPrologue()) {
     922           4 :     ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
     923           4 :       RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
     924           4 :     ProgInfo.DebuggerPrivateSegmentBufferSGPR =
     925           4 :       RI->getHWRegIndex(MFI->getScratchRSrcReg());
     926             :   }
     927             : 
     928             :   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
     929             :   // register.
     930       17960 :   ProgInfo.FloatMode = getFPMode(MF);
     931             : 
     932       17960 :   ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
     933             : 
      934             :   // Make the clamp modifier return 0 on NaN input.
     935       17960 :   ProgInfo.DX10Clamp = STM.enableDX10Clamp();
     936             : 
     937             :   unsigned LDSAlignShift;
     938       17960 :   if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
     939             :     // LDS is allocated in 64 dword blocks.
     940             :     LDSAlignShift = 8;
     941             :   } else {
     942             :     // LDS is allocated in 128 dword blocks.
     943             :     LDSAlignShift = 9;
     944             :   }
     945             : 
     946             :   unsigned LDSSpillSize =
     947       17960 :     MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
     948             : 
     949       17960 :   ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
     950       17960 :   ProgInfo.LDSBlocks =
     951       17960 :       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
     952             : 
     953             :   // Scratch is allocated in 256 dword blocks.
     954             :   unsigned ScratchAlignShift = 10;
     955             :   // We need to program the hardware with the amount of scratch memory that
     956             :   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
     957             :   // scratch memory used per thread.
     958       17960 :   ProgInfo.ScratchBlocks =
     959       17960 :       alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
     960       17960 :               1ULL << ScratchAlignShift) >>
     961             :       ScratchAlignShift;
     962             : 
     963       17960 :   ProgInfo.ComputePGMRSrc1 =
     964       35920 :       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
     965       35920 :       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
     966       35920 :       S_00B848_PRIORITY(ProgInfo.Priority) |
     967       35920 :       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
     968       35920 :       S_00B848_PRIV(ProgInfo.Priv) |
     969       35920 :       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
     970       35920 :       S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
     971       17960 :       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
     972             : 
     973             :   // 0 = X, 1 = XY, 2 = XYZ
     974             :   unsigned TIDIGCompCnt = 0;
     975       17960 :   if (MFI->hasWorkItemIDZ())
     976             :     TIDIGCompCnt = 2;
     977       17884 :   else if (MFI->hasWorkItemIDY())
     978             :     TIDIGCompCnt = 1;
     979             : 
     980       17960 :   ProgInfo.ComputePGMRSrc2 =
     981       35920 :       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
     982       33445 :       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
     983             :       // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
     984       17960 :       S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
     985       17960 :       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
     986       17960 :       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
     987       17960 :       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
     988       35920 :       S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
     989       17960 :       S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
     990       17960 :       S_00B84C_EXCP_EN_MSB(0) |
     991             :       // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
     992       17960 :       S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
     993             :       S_00B84C_EXCP_EN(0);
     994       17960 : }
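
The LDS and scratch figures computed above are programmed in hardware block granules rather than raw bytes. Below is a minimal standalone sketch of the same rounding, assuming a local power-of-two align helper in place of llvm::alignTo and hypothetical per-kernel sizes; it is an illustration, not part of this file.

// Standalone sketch (hypothetical inputs): reproduces the LDS/scratch block
// rounding performed in getSIProgramInfo above.
#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align must be a power of two).
static uint64_t alignToPow2(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  // Hypothetical per-kernel numbers.
  uint64_t LDSSize = 4500;    // bytes of LDS used by the work-group
  uint64_t ScratchSize = 132; // bytes of scratch used per thread
  unsigned WavefrontSize = 64;

  // Post-SEA_ISLANDS: LDS is allocated in 128-dword (512-byte) blocks.
  unsigned LDSAlignShift = 9;
  uint64_t LDSBlocks =
      alignToPow2(LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

  // Scratch is programmed per wave, in 256-dword (1024-byte) blocks.
  unsigned ScratchAlignShift = 10;
  uint64_t ScratchBlocks =
      alignToPow2(ScratchSize * WavefrontSize, 1ULL << ScratchAlignShift) >>
      ScratchAlignShift;

  std::printf("LDSBlocks = %llu, ScratchBlocks = %llu\n",
              (unsigned long long)LDSBlocks,
              (unsigned long long)ScratchBlocks);
  return 0;
}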
     995             : 
     996             : static unsigned getRsrcReg(CallingConv::ID CallConv) {
     997             :   switch (CallConv) {
     998             :   default: LLVM_FALLTHROUGH;
     999             :   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
    1000             :   case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS;
    1001             :   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
    1002             :   case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES;
    1003             :   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
    1004             :   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
    1005             :   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
    1006             :   }
    1007             : }
    1008             : 
    1009       16802 : void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
    1010             :                                          const SIProgramInfo &CurrentProgramInfo) {
    1011       16802 :   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
    1012             :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1013       16802 :   unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
    1014             : 
    1015       16802 :   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    1016       15142 :     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
    1017             : 
    1018       15142 :     OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
    1019             : 
    1020       15142 :     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
    1021       15142 :     OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
    1022             : 
    1023       15142 :     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
    1024       15142 :     OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
    1025             : 
    1026             :     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    1027             :     // 0" comment but I don't see a corresponding field in the register spec.
    1028             :   } else {
    1029        1660 :     OutStreamer->EmitIntValue(RsrcReg, 4);
    1030        3320 :     OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
    1031        1660 :                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    1032        1660 :     if (STM.isVGPRSpillingEnabled(MF.getFunction())) {
    1033          34 :       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
    1034          34 :       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
    1035             :     }
    1036             :   }
    1037             : 
    1038       33604 :   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    1039        1416 :     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
    1040        1416 :     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
    1041        1416 :     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
    1042        1416 :     OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
    1043        1416 :     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
    1044        1416 :     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
    1045             :   }
    1046             : 
    1047       16802 :   OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
    1048       16802 :   OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
    1049       16802 :   OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
    1050       16802 :   OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
    1051       16802 : }
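
EmitProgramInfoSI emits the configuration as alternating 32-bit (register, value) words. The sketch below shows how such a stream could be walked by a consumer; the word values are made up for illustration and do not come from this file.

// Sketch only: walks a hypothetical dump of alternating (register, value)
// words of the kind emitted by EmitProgramInfoSI.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical emitted words: reg, value, reg, value, ...
  std::vector<uint32_t> Words = {
      0x0000B848, 0x002C0041, // compute rsrc1 register, packed value
      0x0000B84C, 0x00000090, // compute rsrc2 register, packed value
      0x0000B860, 0x00000000  // tmpring size register, wavesize value
  };

  for (size_t I = 0; I + 1 < Words.size(); I += 2)
    std::printf("reg 0x%08x = 0x%08x\n", (unsigned)Words[I],
                (unsigned)Words[I + 1]);
  return 0;
}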
    1052             : 
     1053             : // This is the equivalent of EmitProgramInfoSI above, but for the AMDPAL OS
     1054             : // type. It stores each compute/SPI register setting and other PAL metadata
     1055             : // items into PALMetadataMap, combining them with any metadata provided by the
     1056             : // frontend as LLVM metadata. Once all functions have been processed,
     1057             : // PALMetadataMap is written out as a single block in the .note section.
    1058          56 : void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
    1059             :        const SIProgramInfo &CurrentProgramInfo) {
    1060             :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1061             :   // Given the calling convention, calculate the register number for rsrc1. In
    1062             :   // principle the register number could change in future hardware, but we know
    1063             :   // it is the same for gfx6-9 (except that LS and ES don't exist on gfx9), so
    1064             :   // we can use the same fixed value that .AMDGPU.config has for Mesa. Note
    1065             :   // that we use a register number rather than a byte offset, so we need to
    1066             :   // divide by 4.
    1067          56 :   unsigned Rsrc1Reg = getRsrcReg(MF.getFunction().getCallingConv()) / 4;
    1068          56 :   unsigned Rsrc2Reg = Rsrc1Reg + 1;
    1069             :   // Also calculate the PAL metadata key for *S_SCRATCH_SIZE. It can be used
    1070             :   // with a constant offset to access any non-register shader-specific PAL
    1071             :   // metadata key.
    1072          56 :   unsigned ScratchSizeKey = PALMD::Key::CS_SCRATCH_SIZE;
    1073          56 :   switch (MF.getFunction().getCallingConv()) {
    1074          17 :     case CallingConv::AMDGPU_PS:
    1075          17 :       ScratchSizeKey = PALMD::Key::PS_SCRATCH_SIZE;
    1076          17 :       break;
    1077           6 :     case CallingConv::AMDGPU_VS:
    1078           6 :       ScratchSizeKey = PALMD::Key::VS_SCRATCH_SIZE;
    1079           6 :       break;
    1080           3 :     case CallingConv::AMDGPU_GS:
    1081           3 :       ScratchSizeKey = PALMD::Key::GS_SCRATCH_SIZE;
    1082           3 :       break;
    1083           2 :     case CallingConv::AMDGPU_ES:
    1084           2 :       ScratchSizeKey = PALMD::Key::ES_SCRATCH_SIZE;
    1085           2 :       break;
    1086           4 :     case CallingConv::AMDGPU_HS:
    1087           4 :       ScratchSizeKey = PALMD::Key::HS_SCRATCH_SIZE;
    1088           4 :       break;
    1089           2 :     case CallingConv::AMDGPU_LS:
    1090           2 :       ScratchSizeKey = PALMD::Key::LS_SCRATCH_SIZE;
    1091           2 :       break;
    1092             :   }
    1093          56 :   unsigned NumUsedVgprsKey = ScratchSizeKey +
    1094          56 :       PALMD::Key::VS_NUM_USED_VGPRS - PALMD::Key::VS_SCRATCH_SIZE;
    1095          56 :   unsigned NumUsedSgprsKey = ScratchSizeKey +
    1096          56 :       PALMD::Key::VS_NUM_USED_SGPRS - PALMD::Key::VS_SCRATCH_SIZE;
    1097          56 :   PALMetadataMap[NumUsedVgprsKey] = CurrentProgramInfo.NumVGPRsForWavesPerEU;
    1098          56 :   PALMetadataMap[NumUsedSgprsKey] = CurrentProgramInfo.NumSGPRsForWavesPerEU;
    1099         112 :   if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
    1100          22 :     PALMetadataMap[Rsrc1Reg] |= CurrentProgramInfo.ComputePGMRSrc1;
    1101          22 :     PALMetadataMap[Rsrc2Reg] |= CurrentProgramInfo.ComputePGMRSrc2;
     1102             :     // ScratchSize is in bytes; round it up to a multiple of 16.
    1103          22 :     PALMetadataMap[ScratchSizeKey] |=
    1104          22 :         alignTo(CurrentProgramInfo.ScratchSize, 16);
    1105             :   } else {
    1106          34 :     PALMetadataMap[Rsrc1Reg] |= S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
    1107          34 :         S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks);
    1108          34 :     if (CurrentProgramInfo.ScratchBlocks > 0)
    1109           1 :       PALMetadataMap[Rsrc2Reg] |= S_00B84C_SCRATCH_EN(1);
     1110             :     // ScratchSize is in bytes; round it up to a multiple of 16.
    1111          34 :     PALMetadataMap[ScratchSizeKey] |=
    1112          34 :         alignTo(CurrentProgramInfo.ScratchSize, 16);
    1113             :   }
    1114         112 :   if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
    1115          17 :     PALMetadataMap[Rsrc2Reg] |=
    1116          17 :         S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks);
    1117          17 :     PALMetadataMap[R_0286CC_SPI_PS_INPUT_ENA / 4] |= MFI->getPSInputEnable();
    1118          17 :     PALMetadataMap[R_0286D0_SPI_PS_INPUT_ADDR / 4] |= MFI->getPSInputAddr();
    1119             :   }
    1120          56 : }
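
The per-stage PAL metadata keys are assumed to be laid out at a constant relative offset from each other, which is what makes the ScratchSizeKey + (VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE) arithmetic above valid. The sketch below demonstrates that offset trick with made-up key values rather than the real PALMD::Key constants.

// Sketch only: made-up key values illustrating the constant-offset key
// derivation used in EmitPALMetadata.
#include <cstdio>

namespace PALMDSketch {
enum Key : unsigned {
  VS_SCRATCH_SIZE   = 0x10000000,
  VS_NUM_USED_VGPRS = 0x10000001,
  VS_NUM_USED_SGPRS = 0x10000002,
  PS_SCRATCH_SIZE   = 0x10000010,
  PS_NUM_USED_VGPRS = 0x10000011,
  PS_NUM_USED_SGPRS = 0x10000012,
};
} // namespace PALMDSketch

int main() {
  using namespace PALMDSketch;
  // Pretend the current function is a pixel shader.
  unsigned ScratchSizeKey = PS_SCRATCH_SIZE;
  unsigned NumUsedVgprsKey =
      ScratchSizeKey + (VS_NUM_USED_VGPRS - VS_SCRATCH_SIZE);
  std::printf("NumUsedVgprsKey == PS_NUM_USED_VGPRS: %s\n",
              NumUsedVgprsKey == PS_NUM_USED_VGPRS ? "yes" : "no");
  return 0;
}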
    1121             : 
     1122             : // Returns the private element size encoding, which is expected to equal
     1123             : // log2(Size).
    1123             : static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
    1124        2526 :   switch (Size) {
    1125             :   case 4:
    1126             :     return AMD_ELEMENT_4_BYTES;
    1127           5 :   case 8:
    1128             :     return AMD_ELEMENT_8_BYTES;
    1129          67 :   case 16:
    1130             :     return AMD_ELEMENT_16_BYTES;
    1131           0 :   default:
    1132           0 :     llvm_unreachable("invalid private_element_size");
    1133             :   }
    1134             : }
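
If the amd_element_byte_size_t encodings do equal log2(Size), as the comment above suggests, the mapping handled by the switch can be checked with a small sketch. This is an illustration under that assumption, not a proposed change to the file.

// Sketch only: checks that 4 -> 2, 8 -> 3, 16 -> 4, matching the cases the
// switch above handles, under the assumption that the encoding is log2(Size).
#include <cassert>
#include <cstdio>

static unsigned log2PowerOfTwo(unsigned Size) {
  assert(Size != 0 && (Size & (Size - 1)) == 0 && "expected a power of two");
  unsigned Log = 0;
  while ((Size >>= 1) != 0)
    ++Log;
  return Log;
}

int main() {
  for (unsigned Size : {4u, 8u, 16u})
    std::printf("log2(%u) = %u\n", Size, log2PowerOfTwo(Size));
  return 0;
}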
    1135             : 
    1136        2526 : void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
    1137             :                                         const SIProgramInfo &CurrentProgramInfo,
    1138             :                                         const MachineFunction &MF) const {
    1139        2526 :   const Function &F = MF.getFunction();
    1140             :   assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
    1141             :          F.getCallingConv() == CallingConv::SPIR_KERNEL);
    1142             : 
    1143             :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    1144        2526 :   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
    1145             : 
    1146        2526 :   AMDGPU::initDefaultAMDKernelCodeT(Out, getSTI());
    1147             : 
    1148        2526 :   Out.compute_pgm_resource_registers =
    1149        5052 :       CurrentProgramInfo.ComputePGMRSrc1 |
    1150        2526 :       (CurrentProgramInfo.ComputePGMRSrc2 << 32);
    1151        2526 :   Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
    1152             : 
    1153        2526 :   if (CurrentProgramInfo.DynamicCallStack)
    1154         241 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
    1155             : 
    1156        2598 :   AMD_HSA_BITS_SET(Out.code_properties,
    1157             :                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
    1158             :                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
    1159             : 
    1160        2526 :   if (MFI->hasPrivateSegmentBuffer()) {
    1161        2526 :     Out.code_properties |=
    1162             :       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
    1163             :   }
    1164             : 
    1165        2526 :   if (MFI->hasDispatchPtr())
    1166          42 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
    1167             : 
    1168        2526 :   if (MFI->hasQueuePtr())
    1169          57 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
    1170             : 
    1171        2526 :   if (MFI->hasKernargSegmentPtr())
    1172        2161 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
    1173             : 
    1174        2526 :   if (MFI->hasDispatchID())
    1175           5 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
    1176             : 
    1177        2526 :   if (MFI->hasFlatScratchInit())
    1178         381 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
    1182             : 
    1183             :   if (STM.debuggerSupported())
    1184           3 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
    1185             : 
    1186        2526 :   if (STM.isXNACKEnabled())
    1187          42 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
    1188             : 
    1189             :   unsigned MaxKernArgAlign;
    1190        2526 :   Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
    1191        2526 :   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
    1192        2526 :   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
    1193        2526 :   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
    1194        2526 :   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
    1195             : 
    1196             :   // These alignment values are specified in powers of two, so alignment =
    1197             :   // 2^n.  The minimum alignment is 2^4 = 16.
    1198        5052 :   Out.kernarg_segment_alignment = std::max((size_t)4,
    1199        5052 :       countTrailingZeros(MaxKernArgAlign));
    1200             : 
    1201        2526 :   if (STM.debuggerEmitPrologue()) {
    1202           4 :     Out.debug_wavefront_private_segment_offset_sgpr =
    1203           4 :       CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    1204           4 :     Out.debug_private_segment_buffer_sgpr =
    1205           4 :       CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
    1206             :   }
    1207        2526 : }
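
The kernarg_segment_alignment field above stores log2 of the kernel argument alignment, clamped to a minimum of 2^4 = 16 bytes. Below is a worked sketch of that encoding, using a hand-rolled trailing-zero count in place of llvm::countTrailingZeros and hypothetical alignment values.

// Sketch only: encodes hypothetical kernarg alignments the same way
// getAmdKernelCode does, with a floor of 2^4.
#include <algorithm>
#include <cstdio>

static unsigned countTrailingZerosSketch(unsigned V) {
  unsigned N = 0;
  while (V != 0 && (V & 1u) == 0) {
    V >>= 1;
    ++N;
  }
  return N;
}

int main() {
  for (unsigned MaxKernArgAlign : {1u, 8u, 16u, 64u}) {
    unsigned Encoded = std::max(4u, countTrailingZerosSketch(MaxKernArgAlign));
    std::printf("align %2u bytes -> encoded %u (i.e. 2^%u)\n", MaxKernArgAlign,
                Encoded, Encoded);
  }
  return 0;
}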
    1208             : 
    1209         680 : bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
    1210             :                                        unsigned AsmVariant,
    1211             :                                        const char *ExtraCode, raw_ostream &O) {
    1212             :   // First try the generic code, which knows about modifiers like 'c' and 'n'.
    1213         680 :   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
    1214             :     return false;
    1215             : 
    1216         676 :   if (ExtraCode && ExtraCode[0]) {
    1217           0 :     if (ExtraCode[1] != 0)
    1218             :       return true; // Unknown modifier.
    1219             : 
    1220           0 :     switch (ExtraCode[0]) {
    1221             :     case 'r':
    1222             :       break;
    1223             :     default:
    1224             :       return true;
    1225             :     }
    1226             :   }
    1227             : 
    1228             :   // TODO: Should be able to support other operand types like globals.
    1229         676 :   const MachineOperand &MO = MI->getOperand(OpNo);
    1230         676 :   if (MO.isReg()) {
    1231         676 :     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
    1232         676 :                                        *MF->getSubtarget().getRegisterInfo());
    1233         676 :     return false;
    1234             :   }
    1235             : 
    1236             :   return true;
    1237             : }
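
PrintAsmOperand follows the AsmPrinter convention of returning false on success and true on failure, and it delegates to the generic printer before handling the target-specific 'r' modifier. The sketch below shows that delegation pattern with placeholder classes, not the real LLVM interfaces.

// Sketch only: placeholder classes illustrating the "generic first, then
// target-specific" operand printing pattern and its inverted return value.
#include <cstdio>
#include <string>

struct GenericPrinter {
  // Returns false if it handled the operand (success), true otherwise.
  virtual bool printOperand(const std::string &Op, char Modifier) {
    if (Modifier == 'c' || Modifier == 'n') {
      std::printf("generic: %s (modifier '%c')\n", Op.c_str(), Modifier);
      return false;
    }
    return true; // not handled here
  }
  virtual ~GenericPrinter() = default;
};

struct TargetPrinter : GenericPrinter {
  bool printOperand(const std::string &Op, char Modifier) override {
    if (!GenericPrinter::printOperand(Op, Modifier))
      return false; // generic code already handled it
    if (Modifier != 0 && Modifier != 'r')
      return true;  // unknown modifier -> report failure
    std::printf("target: register operand %s\n", Op.c_str());
    return false;
  }
};

int main() {
  TargetPrinter P;
  P.printOperand("v0", 'n'); // handled by the generic printer
  P.printOperand("v0", 'r'); // handled by the target printer
  return 0;
}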

Generated by: LCOV version 1.13