LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - AMDGPUAsmPrinter.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 500 519 96.3 %
Date: 2017-09-14 15:23:50 Functions: 22 23 95.7 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer  -------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : ///
      12             : /// The AMDGPUAsmPrinter is used to print both assembly string and also binary
      13             : /// code.  When passed an MCAsmStreamer it prints assembly and when passed
      14             : /// an MCObjectStreamer it outputs binary code.
      15             : //
      16             : //===----------------------------------------------------------------------===//
      17             : //
      18             : 
      19             : #include "AMDGPUAsmPrinter.h"
      20             : #include "AMDGPU.h"
      21             : #include "AMDGPUSubtarget.h"
      22             : #include "AMDGPUTargetMachine.h"
      23             : #include "InstPrinter/AMDGPUInstPrinter.h"
      24             : #include "MCTargetDesc/AMDGPUTargetStreamer.h"
      25             : #include "R600Defines.h"
      26             : #include "R600MachineFunctionInfo.h"
      27             : #include "R600RegisterInfo.h"
      28             : #include "SIDefines.h"
      29             : #include "SIInstrInfo.h"
      30             : #include "SIMachineFunctionInfo.h"
      31             : #include "SIRegisterInfo.h"
      32             : #include "Utils/AMDGPUBaseInfo.h"
      33             : #include "llvm/BinaryFormat/ELF.h"
      34             : #include "llvm/CodeGen/MachineFrameInfo.h"
      35             : #include "llvm/IR/DiagnosticInfo.h"
      36             : #include "llvm/MC/MCContext.h"
      37             : #include "llvm/MC/MCSectionELF.h"
      38             : #include "llvm/MC/MCStreamer.h"
      39             : #include "llvm/Support/MathExtras.h"
      40             : #include "llvm/Support/TargetRegistry.h"
      41             : #include "llvm/Target/TargetLoweringObjectFile.h"
      42             : 
      43             : using namespace llvm;
      44             : 
      45             : // TODO: This should get the default rounding mode from the kernel. We just set
      46             : // the default here, but this could change if the OpenCL rounding mode pragmas
      47             : // are used.
      48             : //
      49             : // The denormal mode here should match what is reported by the OpenCL runtime
      50             : // for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
      51             : // can also be overridden to flush with the -cl-denorms-are-zero compiler flag.
      52             : //
      53             : // AMD OpenCL only sets flush none and reports CL_FP_DENORM for double
      54             : // precision, and leaves single precision to flush all and does not report
      55             : // CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports
      56             : // CL_FP_DENORM for both.
      57             : //
      58             : // FIXME: It seems some instructions do not support single precision denormals
      59             : // regardless of the mode (exp_*_f32, rcp_*_f32, rsq_*_f32, rsq_*f32, sqrt_f32,
      60             : // and sin_f32, cos_f32 on most parts).
      61             : 
      62             : // We want to use these instructions, and using fp32 denormals also causes
      63             : // instructions to run at the double precision rate for the device so it's
      64             : // probably best to just report no single precision denormals.
      65             : static uint32_t getFPMode(const MachineFunction &F) {
      66       14168 :   const SISubtarget& ST = F.getSubtarget<SISubtarget>();
      67             :   // TODO: Is there any real use for the flush in only / flush out only modes?
      68             : 
      69             :   uint32_t FP32Denormals =
      70       14168 :     ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
      71             : 
      72             :   uint32_t FP64Denormals =
      73       14168 :     ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
      74             : 
      75             :   return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
      76             :          FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
      77       14168 :          FP_DENORM_MODE_SP(FP32Denormals) |
      78       14168 :          FP_DENORM_MODE_DP(FP64Denormals);
      79             : }
      80             : 
      81             : static AsmPrinter *
      82        1703 : createAMDGPUAsmPrinterPass(TargetMachine &tm,
      83             :                            std::unique_ptr<MCStreamer> &&Streamer) {
      84        5109 :   return new AMDGPUAsmPrinter(tm, std::move(Streamer));
      85             : }
      86             : 
      87       47006 : extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
      88       94012 :   TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
      89             :                                      createAMDGPUAsmPrinterPass);
      90       94012 :   TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
      91             :                                      createAMDGPUAsmPrinterPass);
      92       47006 : }
      93             : 
      94        1703 : AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
      95        1703 :                                    std::unique_ptr<MCStreamer> Streamer)
      96       11921 :   : AsmPrinter(TM, std::move(Streamer)) {
      97        1703 :     AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
      98        1703 :   }
      99             : 
     100           0 : StringRef AMDGPUAsmPrinter::getPassName() const {
     101           0 :   return "AMDGPU Assembly Printer";
     102             : }
     103             : 
     104         240 : const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
     105         240 :   return TM.getMCSubtargetInfo();
     106             : }
     107             : 
     108        6114 : AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const {
     109       18342 :   return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer());
     110             : }
     111             : 
     112        1703 : void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
     113        1703 :   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     114        1463 :     return;
     115             : 
     116             :   AMDGPU::IsaInfo::IsaVersion ISA =
     117         480 :       AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
     118             : 
     119         240 :   getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
     120         720 :   getTargetStreamer().EmitDirectiveHSACodeObjectISA(
     121         240 :       ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
     122         240 :   getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
     123             : }
     124             : 
     125        1694 : void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
     126        1694 :   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     127             :     return;
     128             : 
     129         240 :   getTargetStreamer().EmitEndOfCodeObjectMetadata();
     130             : }
     131             : 
     132        1923 : bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
     133             :   const MachineBasicBlock *MBB) const {
     134        1923 :   if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
     135             :     return false;
     136             : 
     137         526 :   if (MBB->empty())
     138             :     return true;
     139             : 
     140             :   // If this is a block implementing a long branch, an expression relative to
     141             :   // the start of the block is needed.
     142             :   // XXX - Is there a smarter way to check this?
     143        1038 :   return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
     144             : }
     145             : 
     146       16883 : void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
     147       33766 :   const AMDGPUMachineFunction *MFI = MF->getInfo<AMDGPUMachineFunction>();
     148       16883 :   if (!MFI->isEntryFunction())
     149       15219 :     return;
     150             : 
     151       16220 :   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
     152             :   amd_kernel_code_t KernelCode;
     153       16220 :   if (STM.isAmdCodeObjectV2(*MF)) {
     154        1745 :     getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
     155             : 
     156        3490 :     OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
     157        1745 :     getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
     158             :   }
     159             : 
     160       16220 :   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     161             :     return;
     162        3328 :   getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
     163        1664 :                                                    KernelCode);
     164             : }
     165             : 
     166       16883 : void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
     167       33766 :   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
     168       16883 :   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
     169       16883 :   if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
     170        3490 :     SmallString<128> SymbolName;
     171        1745 :     getNameWithPrefix(SymbolName, MF->getFunction()),
     172        3490 :     getTargetStreamer().EmitAMDGPUSymbolType(
     173        1745 :         SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
     174             :   }
     175             : 
     176       16883 :   AsmPrinter::EmitFunctionEntryLabel();
     177       16883 : }
     178             : 
     179         276 : void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     180             : 
     181             :   // Group segment variables aren't emitted in HSA.
     182         276 :   if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
     183             :     return;
     184             : 
     185          94 :   AsmPrinter::EmitGlobalVariable(GV);
     186             : }
     187             : 
     188        1694 : bool AMDGPUAsmPrinter::doFinalization(Module &M) {
     189        1694 :   CallGraphResourceInfo.clear();
     190        1694 :   return AsmPrinter::doFinalization(M);
     191             : }
     192             : 
     193             : // Print comments that apply to both callable functions and entry points.
     194       14583 : void AMDGPUAsmPrinter::emitCommonFunctionComments(
     195             :   uint32_t NumVGPR,
     196             :   uint32_t NumSGPR,
     197             :   uint32_t ScratchSize,
     198             :   uint64_t CodeSize) {
     199       72915 :   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
     200       72915 :   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
     201       72915 :   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
     202       72915 :   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
     203       14583 : }
     204             : 
     205       16883 : bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     206       16883 :   CurrentProgramInfo = SIProgramInfo();
     207             : 
     208       16883 :   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
     209             : 
     210             :   // The starting address of all shader programs must be 256 bytes aligned.
     211             :   // Regular functions just need the basic required instruction alignment.
     212       33766 :   MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
     213             : 
     214       16883 :   SetupMachineFunction(MF);
     215             : 
     216       16883 :   const AMDGPUSubtarget &STM = MF.getSubtarget<AMDGPUSubtarget>();
     217       16883 :   MCContext &Context = getObjFileLowering().getContext();
     218       16883 :   if (!STM.isAmdHsaOS()) {
     219             :     MCSectionELF *ConfigSection =
     220       29986 :         Context.getELFSection(".AMDGPU.config", ELF::SHT_PROGBITS, 0);
     221       29986 :     OutStreamer->SwitchSection(ConfigSection);
     222             :   }
     223             : 
     224       16883 :   if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     225       14826 :     if (MFI->isEntryFunction()) {
     226       14168 :       getSIProgramInfo(CurrentProgramInfo, MF);
     227             :     } else {
     228             :       auto I = CallGraphResourceInfo.insert(
     229        1974 :         std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
     230         658 :       SIFunctionResourceInfo &Info = I.first->second;
     231             :       assert(I.second && "should only be called once per function");
     232         658 :       Info = analyzeResourceUsage(MF);
     233             :     }
     234             : 
     235       14826 :     if (!STM.isAmdHsaOS()) {
     236       12936 :       EmitProgramInfoSI(MF, CurrentProgramInfo);
     237             :     }
     238             :   } else {
     239        2057 :     EmitProgramInfoR600(MF);
     240             :   }
     241             : 
     242       16883 :   DisasmLines.clear();
     243       16883 :   HexLines.clear();
     244       16883 :   DisasmLineMaxLen = 0;
     245             : 
     246       16883 :   EmitFunctionBody();
     247             : 
     248       16883 :   if (isVerbose()) {
     249             :     MCSectionELF *CommentSection =
     250       33272 :         Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
     251       33272 :     OutStreamer->SwitchSection(CommentSection);
     252             : 
     253       16636 :     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
     254       14583 :       if (!MFI->isEntryFunction()) {
     255        1959 :         OutStreamer->emitRawComment(" Function info:", false);
     256        1306 :         SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
     257        1959 :         emitCommonFunctionComments(
     258         653 :           Info.NumVGPR,
     259         653 :           Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
     260             :           Info.PrivateSegmentSize,
     261             :           getFunctionCodeSize(MF));
     262         653 :         return false;
     263             :       }
     264             : 
     265       41790 :       OutStreamer->emitRawComment(" Kernel info:", false);
     266       13930 :       emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
     267             :                                  CurrentProgramInfo.NumSGPR,
     268             :                                  CurrentProgramInfo.ScratchSize,
     269             :                                  getFunctionCodeSize(MF));
     270             : 
     271       41790 :       OutStreamer->emitRawComment(
     272       69650 :         " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
     273       41790 :       OutStreamer->emitRawComment(
     274       69650 :         " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
     275       41790 :       OutStreamer->emitRawComment(
     276       83580 :         " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
     277       41790 :         " bytes/workgroup (compile time only)", false);
     278             : 
     279       41790 :       OutStreamer->emitRawComment(
     280       69650 :         " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
     281       41790 :       OutStreamer->emitRawComment(
     282       69650 :         " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
     283             : 
     284       41790 :       OutStreamer->emitRawComment(
     285       27860 :         " NumSGPRsForWavesPerEU: " +
     286       55720 :         Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
     287       41790 :       OutStreamer->emitRawComment(
     288       27860 :         " NumVGPRsForWavesPerEU: " +
     289       55720 :         Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
     290             : 
     291       41790 :       OutStreamer->emitRawComment(
     292       69650 :         " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
     293       13930 :         false);
     294       41790 :       OutStreamer->emitRawComment(
     295       69650 :         " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
     296       13930 :         false);
     297             : 
     298       13930 :       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
     299           3 :         OutStreamer->emitRawComment(
     300           2 :           " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
     301           4 :           Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
     302           3 :         OutStreamer->emitRawComment(
     303           2 :           " DebuggerPrivateSegmentBufferSGPR: s" +
     304           4 :           Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
     305             :       }
     306             : 
     307       41790 :       OutStreamer->emitRawComment(
     308       27860 :         " COMPUTE_PGM_RSRC2:USER_SGPR: " +
     309       55720 :         Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
     310       41790 :       OutStreamer->emitRawComment(
     311       27860 :         " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
     312       55720 :         Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
     313       41790 :       OutStreamer->emitRawComment(
     314       27860 :         " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
     315       55720 :         Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     316       41790 :       OutStreamer->emitRawComment(
     317       27860 :         " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
     318       55720 :         Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     319       41790 :       OutStreamer->emitRawComment(
     320       27860 :         " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
     321       55720 :         Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
     322       41790 :       OutStreamer->emitRawComment(
     323       27860 :         " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
     324       55720 :         Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
     325       13930 :         false);
     326             :     } else {
     327        2053 :       R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
     328        6159 :       OutStreamer->emitRawComment(
     329       10265 :         Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->CFStackSize)));
     330             :     }
     331             :   }
     332             : 
     333       16230 :   if (STM.dumpCode()) {
     334             : 
     335           6 :     OutStreamer->SwitchSection(
     336           8 :         Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
     337             : 
     338          32 :     for (size_t i = 0; i < DisasmLines.size(); ++i) {
     339          70 :       std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
     340          70 :       Comment += " ; " + HexLines[i] + "\n";
     341             : 
     342          56 :       OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
     343          42 :       OutStreamer->EmitBytes(StringRef(Comment));
     344             :     }
     345             :   }
     346             : 
     347             :   return false;
     348             : }
     349             : 
     350        2057 : void AMDGPUAsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
     351        2057 :   unsigned MaxGPR = 0;
     352        2057 :   bool killPixel = false;
     353        2057 :   const R600Subtarget &STM = MF.getSubtarget<R600Subtarget>();
     354        2057 :   const R600RegisterInfo *RI = STM.getRegisterInfo();
     355        2057 :   const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
     356             : 
     357        8228 :   for (const MachineBasicBlock &MBB : MF) {
     358      165358 :     for (const MachineInstr &MI : MBB) {
     359       78565 :       if (MI.getOpcode() == AMDGPU::KILLGT)
     360           0 :         killPixel = true;
     361       78565 :       unsigned numOperands = MI.getNumOperands();
     362     1054473 :       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
     363     1951816 :         const MachineOperand &MO = MI.getOperand(op_idx);
     364      975908 :         if (!MO.isReg())
     365     1620412 :           continue;
     366      197305 :         unsigned HWReg = RI->getHWRegIndex(MO.getReg());
     367             : 
     368             :         // Registers with a value > 127 aren't GPRs
     369      197305 :         if (HWReg > 127)
     370       63206 :           continue;
     371      134099 :         MaxGPR = std::max(MaxGPR, HWReg);
     372             :       }
     373             :     }
     374             :   }
     375             : 
     376             :   unsigned RsrcReg;
     377        2057 :   if (STM.getGeneration() >= R600Subtarget::EVERGREEN) {
     378             :     // Evergreen / Northern Islands
     379        4080 :     switch (MF.getFunction()->getCallingConv()) {
     380             :     default: LLVM_FALLTHROUGH;
     381             :     case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break;
     382             :     case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break;
     383             :     case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break;
     384             :     case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break;
     385             :     }
     386             :   } else {
     387             :     // R600 / R700
     388          34 :     switch (MF.getFunction()->getCallingConv()) {
     389             :     default: LLVM_FALLTHROUGH;
     390             :     case CallingConv::AMDGPU_GS: LLVM_FALLTHROUGH;
     391             :     case CallingConv::AMDGPU_CS: LLVM_FALLTHROUGH;
     392             :     case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break;
     393           7 :     case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break;
     394             :     }
     395             :   }
     396             : 
     397        4114 :   OutStreamer->EmitIntValue(RsrcReg, 4);
     398        8228 :   OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
     399        4114 :                            S_STACK_SIZE(MFI->CFStackSize), 4);
     400        4114 :   OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
     401        4114 :   OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
     402             : 
     403        4114 :   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
     404        4020 :     OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
     405        6030 :     OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
     406             :   }
     407        2057 : }
     408             : 
     409       14583 : uint64_t AMDGPUAsmPrinter::getFunctionCodeSize(const MachineFunction &MF) const {
     410       14583 :   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
     411       14583 :   const SIInstrInfo *TII = STM.getInstrInfo();
     412             : 
     413       14583 :   uint64_t CodeSize = 0;
     414             : 
     415       60196 :   for (const MachineBasicBlock &MBB : MF) {
     416      672712 :     for (const MachineInstr &MI : MBB) {
     417             :       // TODO: CodeSize should account for multiple functions.
     418             : 
     419             :       // TODO: Should we count size of debug info?
     420      303462 :       if (MI.isDebugValue())
     421           9 :         continue;
     422             : 
     423      303453 :       CodeSize += TII->getInstSizeInBytes(MI);
     424             :     }
     425             :   }
     426             : 
     427       14583 :   return CodeSize;
     428             : }
     429             : 
     430             : static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
     431             :                                   const SIInstrInfo &TII,
     432             :                                   unsigned Reg) {
     433       17275 :   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
     434       15471 :     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
     435             :       return true;
     436             :   }
     437             : 
     438             :   return false;
     439             : }
     440             : 
     441             : static unsigned getNumExtraSGPRs(const SISubtarget &ST,
     442             :                                  bool VCCUsed,
     443             :                                  bool FlatScrUsed) {
     444       15081 :   unsigned ExtraSGPRs = 0;
     445       14821 :   if (VCCUsed)
     446        4025 :     ExtraSGPRs = 2;
     447             : 
     448       15081 :   if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
     449        7443 :     if (FlatScrUsed)
     450         268 :       ExtraSGPRs = 4;
     451             :   } else {
     452        7638 :     if (ST.isXNACKEnabled())
     453         667 :       ExtraSGPRs = 4;
     454             : 
     455        7638 :     if (FlatScrUsed)
     456         482 :       ExtraSGPRs = 6;
     457             :   }
     458             : 
     459             :   return ExtraSGPRs;
     460             : }
     461             : 
     462         653 : int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
     463             :   const SISubtarget &ST) const {
     464        1306 :   return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
     465             : }
     466             : 
     467       14826 : AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
     468             :   const MachineFunction &MF) const {
     469       14826 :   SIFunctionResourceInfo Info;
     470             : 
     471       14826 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     472       14826 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     473       14826 :   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
     474       14826 :   const MachineRegisterInfo &MRI = MF.getRegInfo();
     475       14826 :   const SIInstrInfo *TII = ST.getInstrInfo();
     476       14826 :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
     477             : 
     478       26108 :   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
     479       11282 :                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
     480             : 
     481             :   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
     482             :   // instructions aren't used to access the scratch buffer. Inline assembly may
     483             :   // need it though.
     484             :   //
     485             :   // If we only have implicit uses of flat_scr on flat instructions, it is not
     486             :   // really needed.
     487       21604 :   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
     488        9636 :       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
     489        9559 :        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
     490        6358 :        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
     491             :     Info.UsesFlatScratch = false;
     492             :   }
     493             : 
     494       14826 :   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
     495       14826 :   Info.PrivateSegmentSize = FrameInfo.getStackSize();
     496             : 
     497             : 
     498       25887 :   Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
     499       11061 :                  MRI.isPhysRegUsed(AMDGPU::VCC_HI);
     500             : 
     501             :   // If there are no calls, MachineRegisterInfo can tell us the used register
     502             :   // count easily.
     503             :   // A tail call isn't considered a call for MachineFrameInfo's purposes.
     504       14826 :   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
     505       14394 :     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
     506     7325454 :     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
     507     3633566 :       if (MRI.isPhysRegUsed(Reg)) {
     508             :         HighestVGPRReg = Reg;
     509             :         break;
     510             :       }
     511             :     }
     512             : 
     513       14394 :     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
     514     2820686 :     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
     515     1381154 :       if (MRI.isPhysRegUsed(Reg)) {
     516             :         HighestSGPRReg = Reg;
     517             :         break;
     518             :       }
     519             :     }
     520             : 
     521             :     // We found the maximum register index. They start at 0, so add one to get the
     522             :     // number of registers.
     523       28042 :     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
     524       27296 :       TRI.getHWRegIndex(HighestVGPRReg) + 1;
     525       27986 :     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
     526       27184 :       TRI.getHWRegIndex(HighestSGPRReg) + 1;
     527             : 
     528       14394 :     return Info;
     529             :   }
     530             : 
     531         432 :   int32_t MaxVGPR = -1;
     532         432 :   int32_t MaxSGPR = -1;
     533         432 :   uint32_t CalleeFrameSize = 0;
     534             : 
     535        2608 :   for (const MachineBasicBlock &MBB : MF) {
     536       16666 :     for (const MachineInstr &MI : MBB) {
     537             :       // TODO: Check regmasks? Do they occur anywhere except calls?
     538       34476 :       for (const MachineOperand &MO : MI.operands()) {
     539       27023 :         unsigned Width = 0;
     540       27023 :         bool IsSGPR = false;
     541             : 
     542       27023 :         if (!MO.isReg())
     543        8206 :           continue;
     544             : 
     545       18817 :         unsigned Reg = MO.getReg();
     546       22314 :         switch (Reg) {
     547        2948 :         case AMDGPU::EXEC:
     548             :         case AMDGPU::EXEC_LO:
     549             :         case AMDGPU::EXEC_HI:
     550             :         case AMDGPU::SCC:
     551             :         case AMDGPU::M0:
     552             :         case AMDGPU::SRC_SHARED_BASE:
     553             :         case AMDGPU::SRC_SHARED_LIMIT:
     554             :         case AMDGPU::SRC_PRIVATE_BASE:
     555             :         case AMDGPU::SRC_PRIVATE_LIMIT:
     556        2948 :           continue;
     557             : 
     558           0 :         case AMDGPU::NoRegister:
     559             :           assert(MI.isDebugValue());
     560           0 :           continue;
     561             : 
     562          30 :         case AMDGPU::VCC:
     563             :         case AMDGPU::VCC_LO:
     564             :         case AMDGPU::VCC_HI:
     565          30 :           Info.UsesVCC = true;
     566          30 :           continue;
     567             : 
     568         519 :         case AMDGPU::FLAT_SCR:
     569             :         case AMDGPU::FLAT_SCR_LO:
     570             :         case AMDGPU::FLAT_SCR_HI:
     571         519 :           continue;
     572             : 
     573           0 :         case AMDGPU::TBA:
     574             :         case AMDGPU::TBA_LO:
     575             :         case AMDGPU::TBA_HI:
     576             :         case AMDGPU::TMA:
     577             :         case AMDGPU::TMA_LO:
     578             :         case AMDGPU::TMA_HI:
     579           0 :           llvm_unreachable("trap handler registers should not be used");
     580             : 
     581             :         default:
     582             :           break;
     583             :         }
     584             : 
     585       23964 :         if (AMDGPU::SReg_32RegClass.contains(Reg)) {
     586             :           assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
     587             :                  "trap handler registers should not be used");
     588             :           IsSGPR = true;
     589             :           Width = 1;
     590       11155 :         } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
     591             :           IsSGPR = false;
     592             :           Width = 1;
     593        9109 :         } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
     594             :           assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
     595             :                  "trap handler registers should not be used");
     596             :           IsSGPR = true;
     597             :           Width = 2;
     598        4698 :         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
     599             :           IsSGPR = false;
     600             :           Width = 2;
     601        4418 :         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
     602             :           IsSGPR = false;
     603             :           Width = 3;
     604        4193 :         } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
     605             :           IsSGPR = true;
     606             :           Width = 4;
     607         421 :         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
     608             :           IsSGPR = false;
     609             :           Width = 4;
     610          39 :         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
     611             :           IsSGPR = true;
     612             :           Width = 8;
     613          67 :         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
     614             :           IsSGPR = false;
     615             :           Width = 8;
     616          29 :         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
     617             :           IsSGPR = true;
     618             :           Width = 16;
     619          58 :         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
     620             :           IsSGPR = false;
     621             :           Width = 16;
     622             :         } else {
     623           0 :           llvm_unreachable("Unknown register class");
     624             :         }
     625       15320 :         unsigned HWReg = TRI.getHWRegIndex(Reg);
     626       15320 :         int MaxUsed = HWReg + Width - 1;
     627       15320 :         if (IsSGPR) {
     628       11731 :           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
     629             :         } else {
     630        3589 :           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
     631             :         }
     632             :       }
     633             : 
     634        7453 :       if (MI.isCall()) {
     635             :         // Pseudo used just to encode the underlying global. Is there a better
     636             :         // way to track this?
     637             : 
     638         449 :         const MachineOperand *CalleeOp
     639             :           = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
     640         898 :         const Function *Callee = cast<Function>(CalleeOp->getGlobal());
     641         449 :         if (Callee->isDeclaration()) {
     642             :           // If this is a call to an external function, we can't do much. Make
     643             :           // conservative guesses.
     644             : 
     645             :           // 48 SGPRs - vcc, - flat_scr, -xnack
     646         520 :           int MaxSGPRGuess = 47 - getNumExtraSGPRs(ST, true,
     647         520 :                                                    ST.hasFlatAddressSpace());
     648         260 :           MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
     649         520 :           MaxVGPR = std::max(MaxVGPR, 23);
     650             : 
     651         520 :           CalleeFrameSize = std::max(CalleeFrameSize, 16384u);
     652         260 :           Info.UsesVCC = true;
     653         260 :           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
     654         260 :           Info.HasDynamicallySizedStack = true;
     655             :         } else {
     656             :           // We force CodeGen to run in SCC order, so the callee's register
     657             :           // usage etc. should be the cumulative usage of all callees.
     658         189 :           auto I = CallGraphResourceInfo.find(Callee);
     659             :           assert(I != CallGraphResourceInfo.end() &&
     660             :                  "callee should have been handled before caller");
     661             : 
     662         378 :           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
     663         378 :           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
     664             :           CalleeFrameSize
     665         378 :             = std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
     666         189 :           Info.UsesVCC |= I->second.UsesVCC;
     667         189 :           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
     668         189 :           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
     669         189 :           Info.HasRecursion |= I->second.HasRecursion;
     670             :         }
     671             : 
     672         449 :         if (!Callee->doesNotRecurse())
     673         401 :           Info.HasRecursion = true;
     674             :       }
     675             :     }
     676             :   }
     677             : 
     678         432 :   Info.NumExplicitSGPR = MaxSGPR + 1;
     679         432 :   Info.NumVGPR = MaxVGPR + 1;
     680         432 :   Info.PrivateSegmentSize += CalleeFrameSize;
     681             : 
     682         432 :   return Info;
     683             : }
     684             : 
     685       14168 : void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
     686             :                                         const MachineFunction &MF) {
                         // Populates ProgInfo for MF: copies the counts produced by
                         // analyzeResourceUsage(), checks them against the subtarget's limits
                         // (emitting DS_Error diagnostics through the function's LLVMContext when
                         // a limit is exceeded), and packs the ComputePGMRSrc1/ComputePGMRSrc2
                         // values from the resulting fields.
     687       14168 :   SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
     688             : 
     689       14168 :   ProgInfo.NumVGPR = Info.NumVGPR;
     690       14168 :   ProgInfo.NumSGPR = Info.NumExplicitSGPR;
     691       14168 :   ProgInfo.ScratchSize = Info.PrivateSegmentSize;
     692       14168 :   ProgInfo.VCCUsed = Info.UsesVCC;
     693       14168 :   ProgInfo.FlatUsed = Info.UsesFlatScratch;
     694       14168 :   ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
     695             : 
     696       14168 :   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
     697       14168 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     698       14168 :   const SIInstrInfo *TII = STM.getInstrInfo();
     699       14168 :   const SIRegisterInfo *RI = &TII->getRegisterInfo();
     700             : 
     701       14168 :   unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
     702       14168 :                                          ProgInfo.VCCUsed,
     703       28336 :                                          ProgInfo.FlatUsed);
     704       28336 :   unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
     705             : 
     706             :   // Check the addressable register limit before we add ExtraSGPRs.
     707       14168 :   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
     708             :       !STM.hasSGPRInitBug()) {
     709        2878 :     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
     710        2878 :     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
     711             :       // This can happen due to a compiler bug or when using inline asm.
     712           1 :       LLVMContext &Ctx = MF.getFunction()->getContext();
     713           1 :       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
     714             :                                        "addressable scalar registers",
     715           1 :                                        ProgInfo.NumSGPR, DS_Error,
     716             :                                        DK_ResourceLimit,
     717           4 :                                        MaxAddressableNumSGPRs);
     718           1 :       Ctx.diagnose(Diag);
                             // NOTE(review): this clamps to one below the limit, while the second
                             // addressable-SGPR check further down clamps to the limit itself -
                             // confirm the difference is intentional.
     719           1 :       ProgInfo.NumSGPR = MaxAddressableNumSGPRs - 1;
     720             :     }
     721             :   }
     722             : 
     723             :   // Account for extra SGPRs and VGPRs reserved for debugger use.
     724       14168 :   ProgInfo.NumSGPR += ExtraSGPRs;
     725       14168 :   ProgInfo.NumVGPR += ExtraVGPRs;
     726             : 
     727             :   // Adjust number of registers used to meet default/requested minimum/maximum
     728             :   // number of waves per execution unit request.
     729       14168 :   ProgInfo.NumSGPRsForWavesPerEU = std::max(
     730       70840 :     std::max(ProgInfo.NumSGPR, 1u), STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
     731       14168 :   ProgInfo.NumVGPRsForWavesPerEU = std::max(
     732       70840 :     std::max(ProgInfo.NumVGPR, 1u), STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
     733             : 
                         // For these subtargets the limit is checked after ExtraSGPRs has been
                         // added, i.e. against the total rather than the explicit count.
     734       14168 :   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
     735             :       STM.hasSGPRInitBug()) {
     736       11290 :     unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
     737       11290 :     if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
     738             :       // This can happen due to a compiler bug or when using inline asm to use
     739             :       // the registers which are usually reserved for vcc etc.
     740           4 :       LLVMContext &Ctx = MF.getFunction()->getContext();
     741           4 :       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
     742             :                                        "scalar registers",
     743           4 :                                        ProgInfo.NumSGPR, DS_Error,
     744             :                                        DK_ResourceLimit,
     745          16 :                                        MaxAddressableNumSGPRs);
     746           4 :       Ctx.diagnose(Diag);
     747           4 :       ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
     748           4 :       ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
     749             :     }
     750             :   }
     751             : 
                         // Subtargets with the SGPR init bug always report the fixed count,
                         // overriding both values computed above.
     752       14168 :   if (STM.hasSGPRInitBug()) {
     753        4198 :     ProgInfo.NumSGPR =
     754             :         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
     755        4198 :     ProgInfo.NumSGPRsForWavesPerEU =
     756             :         AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
     757             :   }
     758             : 
                         // Diagnose a function that uses more user SGPRs than the subtarget's
                         // maximum; no value is clamped here.
     759       14168 :   if (MFI->getNumUserSGPRs() > STM.getMaxNumUserSGPRs()) {
     760           0 :     LLVMContext &Ctx = MF.getFunction()->getContext();
     761           0 :     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "user SGPRs",
     762           0 :                                      MFI->getNumUserSGPRs(), DS_Error);
     763           0 :     Ctx.diagnose(Diag);
     764             :   }
     765             : 
                         // Diagnose an LDS size larger than the subtarget's local memory size;
                         // again, nothing is clamped.
     766       14168 :   if (MFI->getLDSSize() > static_cast<unsigned>(STM.getLocalMemorySize())) {
     767           4 :     LLVMContext &Ctx = MF.getFunction()->getContext();
     768           4 :     DiagnosticInfoResourceLimit Diag(*MF.getFunction(), "local memory",
     769          12 :                                      MFI->getLDSSize(), DS_Error);
     770           4 :     Ctx.diagnose(Diag);
     771             :   }
     772             : 
     773             :   // SGPRBlocks is actual number of SGPR blocks minus 1.
     774       28336 :   ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
     775       14168 :                                 STM.getSGPREncodingGranule());
     776       28336 :   ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
     777             : 
     778             :   // VGPRBlocks is actual number of VGPR blocks minus 1.
     779       28336 :   ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
     780       14168 :                                 STM.getVGPREncodingGranule());
     781       28336 :   ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
     782             : 
     783             :   // Record first reserved VGPR and number of reserved VGPRs.
                         // ReservedVGPRCount repeats the getReservedNumVGPRs(MF) query that
                         // produced ExtraVGPRs above.
     784       14168 :   ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? ProgInfo.NumVGPR : 0;
     785       28336 :   ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
     786             : 
     787             :   // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
     788             :   // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
     789             :   // attribute was requested.
     790       14168 :   if (STM.debuggerEmitPrologue()) {
     791           4 :     ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
     792           8 :       RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
     793           4 :     ProgInfo.DebuggerPrivateSegmentBufferSGPR =
     794           8 :       RI->getHWRegIndex(MFI->getScratchRSrcReg());
     795             :   }
     796             : 
     797             :   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
     798             :   // register.
     799       14168 :   ProgInfo.FloatMode = getFPMode(MF);
     800             : 
     801       28336 :   ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
     802             : 
     803             :   // Make the clamp modifier return 0 on NaN input.
     804       14168 :   ProgInfo.DX10Clamp = STM.enableDX10Clamp();
     805             : 
     806             :   unsigned LDSAlignShift;
     807       14168 :   if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
     808             :     // LDS is allocated in 64 dword blocks.
     809             :     LDSAlignShift = 8;
     810             :   } else {
     811             :     // LDS is allocated in 128 dword blocks.
     812        8716 :     LDSAlignShift = 9;
     813             :   }
     814             : 
                         // The LDS total is the function's own LDS size plus the per-wave spill
                         // size multiplied by the maximum flat work-group size.
     815             :   unsigned LDSSpillSize =
     816       14168 :     MFI->getLDSWaveSpillSize() * MFI->getMaxFlatWorkGroupSize();
     817             : 
     818       14168 :   ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
     819       14168 :   ProgInfo.LDSBlocks =
     820       28336 :       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
     821             : 
     822             :   // Scratch is allocated in 256 dword blocks.
     823       14168 :   unsigned ScratchAlignShift = 10;
     824             :   // We need to program the hardware with the amount of scratch memory that
     825             :   // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
     826             :   // scratch memory used per thread.
     827       14168 :   ProgInfo.ScratchBlocks =
     828       28336 :       alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
     829       14168 :               1ULL << ScratchAlignShift) >>
     830             :       ScratchAlignShift;
     831             : 
                         // Pack the fields computed above into the ComputePGMRSrc1 value.
     832       14168 :   ProgInfo.ComputePGMRSrc1 =
     833       28336 :       S_00B848_VGPRS(ProgInfo.VGPRBlocks) |
     834       28336 :       S_00B848_SGPRS(ProgInfo.SGPRBlocks) |
     835       28336 :       S_00B848_PRIORITY(ProgInfo.Priority) |
     836       28336 :       S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
     837       28336 :       S_00B848_PRIV(ProgInfo.Priv) |
     838       28336 :       S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
     839       28336 :       S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
     840       14168 :       S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
     841             : 
     842             :   // 0 = X, 1 = XY, 2 = XYZ
     843       14168 :   unsigned TIDIGCompCnt = 0;
     844       14168 :   if (MFI->hasWorkItemIDZ())
     845             :     TIDIGCompCnt = 2;
     846       14110 :   else if (MFI->hasWorkItemIDY())
     847          47 :     TIDIGCompCnt = 1;
     848             : 
                         // ComputePGMRSrc2 is built mostly from the function's enabled inputs
                         // (user SGPR count, work-group IDs, work-item ID components).
     849       14168 :   ProgInfo.ComputePGMRSrc2 =
     850       28336 :       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
     851       28336 :       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
     852       28336 :       S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
     853       28336 :       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
     854       28336 :       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
     855       28336 :       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
     856       28336 :       S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
     857       14168 :       S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
     858       14168 :       S_00B84C_EXCP_EN_MSB(0) |
     859             :       // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
     860       42504 :       S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
     861             :       S_00B84C_EXCP_EN(0);
     862       14168 : }
     863             : 
     864             : static unsigned getRsrcReg(CallingConv::ID CallConv) {
                         // Maps a calling convention to the matching *_PGM_RSRC1 register
                         // constant. Any convention without its own case hits the default label,
                         // which falls through to the AMDGPU_CS (compute) register.
     865             :   switch (CallConv) {
     866             :   default: LLVM_FALLTHROUGH;
     867             :   case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
     868             :   case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
     869             :   case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
     870             :   case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
     871             :   case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
     872             :   }
     873             : }
     874             : 
     875       12936 : void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     876             :                                          const SIProgramInfo &CurrentProgramInfo) {
                         // Writes CurrentProgramInfo to OutStreamer as a sequence of 4-byte
                         // values, alternating a register identifier with the value for that
                         // register. Which registers are written depends on the calling
                         // convention of MF's function.
     877       12936 :   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
     878       12936 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     879       25872 :   unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
     880             : 
     881       25872 :   if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
                           // Compute: emit the precomputed RSRC1/RSRC2 values and the scratch
                           // wave size.
     882       24792 :     OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
     883             : 
     884       24792 :     OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
     885             : 
     886       24792 :     OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
     887       24792 :     OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
     888             : 
     889       24792 :     OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
     890       24792 :     OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
     891             : 
     892             :     // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
     893             :     // 0" comment but I don't see a corresponding field in the register spec.
     894             :   } else {
                           // Non-compute: only the VGPR/SGPR block counts are written to the
                           // RSRC1 register selected by getRsrcReg(); the scratch wave size is
                           // emitted only when VGPR spilling is enabled for the function.
     895        1080 :     OutStreamer->EmitIntValue(RsrcReg, 4);
     896        2160 :     OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
     897        1080 :                               S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
     898         540 :     if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
     899          68 :       OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
     900          68 :       OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
     901             :     }
     902             :   }
     903             : 
                         // AMDGPU_PS functions additionally get the extra LDS size and the PS
                         // input enable/address values.
     904       25872 :   if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
     905         878 :     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     906         878 :     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
     907         878 :     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     908         878 :     OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
     909         878 :     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
     910         878 :     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
     911             :   }
     912             : 
                         // The spilled SGPR/VGPR counts are emitted for every calling convention.
     913       25872 :   OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
     914       25872 :   OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
     915       25872 :   OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
     916       25872 :   OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
     917       12936 : }
     918             : 
     919             : // This is supposed to be log2(Size)
                       // Maps a private element size in bytes (4, 8 or 16) to the corresponding
                       // amd_element_byte_size_t enumerator. Any other size is treated as
                       // unreachable.
     920             : static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
     921        1745 :   switch (Size) {
     922             :   case 4:
     923             :     return AMD_ELEMENT_4_BYTES;
     924           5 :   case 8:
     925             :     return AMD_ELEMENT_8_BYTES;
     926           5 :   case 16:
     927             :     return AMD_ELEMENT_16_BYTES;
     928           0 :   default:
     929           0 :     llvm_unreachable("invalid private_element_size");
     930             :   }
     931             : }
     932             : 
     933        1745 : void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
     934             :                                         const SIProgramInfo &CurrentProgramInfo,
     935             :                                         const MachineFunction &MF) const {
                         // Fills Out for MF: starts from the subtarget defaults, then sets the
                         // resource registers, the code_properties flags and the size/count
                         // fields from CurrentProgramInfo, the machine function info and the
                         // subtarget.
     936        1745 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     937        1745 :   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
     938             : 
     939        3490 :   AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
     940             : 
                         // RSRC2 is shifted up by 32 bits and OR-ed with RSRC1 into one field.
                         // NOTE(review): assumes ComputePGMRSrc2 is a 64-bit type so that the
                         // shift by 32 is well-defined - confirm against SIProgramInfo.
     941        1745 :   Out.compute_pgm_resource_registers =
     942        3490 :       CurrentProgramInfo.ComputePGMRSrc1 |
     943        1745 :       (CurrentProgramInfo.ComputePGMRSrc2 << 32);
                         // code_properties is assigned here (not OR-ed), so whatever the default
                         // initialization stored in it is discarded.
     944        1745 :   Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
     945             : 
     946        1745 :   if (CurrentProgramInfo.DynamicCallStack)
     947         208 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
     948             : 
     949        3490 :   AMD_HSA_BITS_SET(Out.code_properties,
     950             :                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
     951             :                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
     952             : 
                         // One ENABLE_SGPR_* flag per input the function info reports as present.
     953        1745 :   if (MFI->hasPrivateSegmentBuffer()) {
     954        1742 :     Out.code_properties |=
     955             :       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
     956             :   }
     957             : 
     958        1745 :   if (MFI->hasDispatchPtr())
     959          25 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
     960             : 
     961        1745 :   if (MFI->hasQueuePtr())
     962          57 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
     963             : 
     964        1745 :   if (MFI->hasKernargSegmentPtr())
     965        1421 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
     966             : 
     967        1745 :   if (MFI->hasDispatchID())
     968           5 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
     969             : 
     970        1745 :   if (MFI->hasFlatScratchInit())
     971         332 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
     972             : 
     973        1745 :   if (MFI->hasGridWorkgroupCountX()) {
     974           0 :     Out.code_properties |=
     975             :       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
     976             :   }
     977             : 
     978        1745 :   if (MFI->hasGridWorkgroupCountY()) {
     979           0 :     Out.code_properties |=
     980             :       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
     981             :   }
     982             : 
     983        1745 :   if (MFI->hasGridWorkgroupCountZ()) {
     984           0 :     Out.code_properties |=
     985             :       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
     986             :   }
     987             : 
                         // NOTE(review): this repeats the hasDispatchPtr() check made earlier in
                         // this function; OR-ing the same flag a second time has no further
                         // effect, so the block looks redundant.
     988        1745 :   if (MFI->hasDispatchPtr())
     989          25 :     Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
     990             : 
     991           3 :   if (STM.debuggerSupported())
     992           3 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
     993             : 
     994        1745 :   if (STM.isXNACKEnabled())
     995          99 :     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
     996             : 
     997             :   // FIXME: Should use getKernArgSize
     998        1745 :   Out.kernarg_segment_byte_size =
     999        1745 :     STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
    1000        1745 :   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
    1001        1745 :   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
    1002        1745 :   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
    1003        1745 :   Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
    1004        1745 :   Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
    1005        1745 :   Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
    1006             : 
    1007             :   // These alignment values are specified in powers of two, so alignment =
    1008             :   // 2^n.  The minimum alignment is 2^4 = 16.
    1009        1745 :   Out.kernarg_segment_alignment = std::max((size_t)4,
    1010        6980 :       countTrailingZeros(MFI->getMaxKernArgAlign()));
    1011             : 
                         // The debugger SGPR fields are only written when the debugger prologue
                         // is enabled; otherwise they keep their default-initialized values.
    1012        1745 :   if (STM.debuggerEmitPrologue()) {
    1013           4 :     Out.debug_wavefront_private_segment_offset_sgpr =
    1014           4 :       CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
    1015           4 :     Out.debug_private_segment_buffer_sgpr =
    1016           4 :       CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
    1017             :   }
    1018        1745 : }
    1019             : 
    1020         572 : bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
    1021             :                                        unsigned AsmVariant,
    1022             :                                        const char *ExtraCode, raw_ostream &O) {
                         // Prints operand OpNo of MI to O. Returns false when the operand was
                         // printed and true when it could not be handled (unknown modifier or a
                         // non-register operand).
    1023             :   // First try the generic code, which knows about modifiers like 'c' and 'n'.
    1024         572 :   if (!AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O))
    1025             :     return false;
    1026             : 
                         // Only a single-character modifier is accepted here, and 'r' is the
                         // only one recognized; it does not change how the operand is printed
                         // below.
    1027         568 :   if (ExtraCode && ExtraCode[0]) {
    1028           0 :     if (ExtraCode[1] != 0)
    1029             :       return true; // Unknown modifier.
    1030             : 
    1031           0 :     switch (ExtraCode[0]) {
    1032             :     case 'r':
    1033             :       break;
    1034             :     default:
    1035             :       return true;
    1036             :     }
    1037             :   }
    1038             : 
    1039             :   // TODO: Should be able to support other operand types like globals.
    1040        1136 :   const MachineOperand &MO = MI->getOperand(OpNo);
    1041         568 :   if (MO.isReg()) {
    1042         568 :     AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
    1043         568 :                                        *MF->getSubtarget().getRegisterInfo());
    1044         568 :     return false;
    1045             :   }
    1046             : 
    1047             :   return true;
    1048             : }

Generated by: LCOV version 1.13