LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIFrameLowering.cpp (source / functions)
Test: llvm-toolchain.info
Date: 2018-02-19 03:08:00
                 Hit   Total   Coverage
Lines:           263     282     93.3 %
Functions:        14      14    100.0 %

          Line data    Source code
       1             : //===----------------------- SIFrameLowering.cpp --------------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
        8             : //===----------------------------------------------------------------------===//
       9             : 
      10             : #include "SIFrameLowering.h"
      11             : #include "AMDGPUSubtarget.h"
      12             : #include "SIInstrInfo.h"
      13             : #include "SIMachineFunctionInfo.h"
      14             : #include "SIRegisterInfo.h"
      15             : 
      16             : #include "llvm/CodeGen/MachineFrameInfo.h"
      17             : #include "llvm/CodeGen/MachineFunction.h"
      18             : #include "llvm/CodeGen/MachineInstrBuilder.h"
      19             : #include "llvm/CodeGen/RegisterScavenging.h"
      20             : 
      21             : using namespace llvm;
      22             : 
      23             : 
      24             : static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
      25             :                                          const MachineFunction &MF) {
      26             :   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
      27         405 :                       ST.getMaxNumSGPRs(MF) / 4);
      28             : }
      29             : 
      30             : static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
      31             :                                        const MachineFunction &MF) {
      32             :   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
      33         723 :                       ST.getMaxNumSGPRs(MF));
      34             : }
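                      :
                      : // Illustrative arithmetic (hypothetical subtarget, not from the source):
                      : // each SGPR128 tuple covers four consecutive SGPR32s, hence the division
                      : // by 4 above. With ST.getMaxNumSGPRs(MF) == 104, getAllSGPR128 would
                      : // return 26 candidate 128-bit tuples and getAllSGPRs all 104 registers.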
      35             : 
      36         349 : void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
      37             :                                           MachineFunction &MF,
      38             :                                           MachineBasicBlock &MBB) const {
      39             :   const SIInstrInfo *TII = ST.getInstrInfo();
      40             :   const SIRegisterInfo* TRI = &TII->getRegisterInfo();
      41         349 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      42             : 
       43             :   // We don't need this if we only have spills, since there is no
       44             :   // user-facing scratch.
      45             : 
      46             :   // TODO: If we know we don't have flat instructions earlier, we can omit
      47             :   // this from the input registers.
      48             :   //
      49             :   // TODO: We only need to know if we access scratch space through a flat
      50             :   // pointer. Because we only detect if flat instructions are used at all,
      51             :   // this will be used more often than necessary on VI.
      52             : 
      53             :   // Debug location must be unknown since the first debug location is used to
      54             :   // determine the end of the prologue.
      55         349 :   DebugLoc DL;
      56         349 :   MachineBasicBlock::iterator I = MBB.begin();
      57             : 
      58             :   unsigned FlatScratchInitReg
      59             :     = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
      60             : 
      61         349 :   MachineRegisterInfo &MRI = MF.getRegInfo();
      62             :   MRI.addLiveIn(FlatScratchInitReg);
      63         349 :   MBB.addLiveIn(FlatScratchInitReg);
      64             : 
      65         349 :   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
      66         349 :   unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
      67             : 
      68         349 :   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
      69             : 
      70             :   // Do a 64-bit pointer add.
      71         349 :   if (ST.flatScratchIsPointer()) {
      72         186 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      73          62 :       .addReg(FlatScrInitLo)
      74          62 :       .addReg(ScratchWaveOffsetReg);
      75         186 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      76          62 :       .addReg(FlatScrInitHi)
      77             :       .addImm(0);
      78             : 
      79             :     return;
      80             :   }
      81             : 
      82             :   // Copy the size in bytes.
      83         861 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      84         287 :     .addReg(FlatScrInitHi, RegState::Kill);
      85             : 
      86             :   // Add wave offset in bytes to private base offset.
      87             :   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
      88         861 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
      89         287 :     .addReg(FlatScrInitLo)
      90         287 :     .addReg(ScratchWaveOffsetReg);
      91             : 
      92             :   // Convert offset to 256-byte units.
      93         861 :   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
      94         287 :     .addReg(FlatScrInitLo, RegState::Kill)
      95             :     .addImm(8);
      96             : }
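                      :
                      : // A minimal sketch of the non-pointer encoding emitted above, with
                      : // hypothetical values: FLAT_SCR_LO ends up holding the scratch size in
                      : // bytes and FLAT_SCR_HI the wave's base offset in 256-byte units.
                      : //
                      : //   uint32_t FlatScrLo = SizeInBytes;               // COPY
                      : //   uint32_t Sum       = BaseLo + WaveOffsetBytes;  // S_ADD_U32
                      : //   uint32_t FlatScrHi = Sum >> 8;                  // S_LSHR_B32 by 8
                      : //
                      : // e.g. Sum == 0x00012300 yields FlatScrHi == 0x123.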
      97             : 
      98       15469 : unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
      99             :   const SISubtarget &ST,
     100             :   const SIInstrInfo *TII,
     101             :   const SIRegisterInfo *TRI,
     102             :   SIMachineFunctionInfo *MFI,
     103             :   MachineFunction &MF) const {
     104       15469 :   MachineRegisterInfo &MRI = MF.getRegInfo();
     105             : 
     106             :   // We need to insert initialization of the scratch resource descriptor.
     107       15469 :   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
     108       30938 :   if (ScratchRsrcReg == AMDGPU::NoRegister ||
     109       15469 :       !MRI.isPhysRegUsed(ScratchRsrcReg))
     110             :     return AMDGPU::NoRegister;
     111             : 
     112        1559 :   if (ST.hasSGPRInitBug() ||
     113         707 :       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
     114             :     return ScratchRsrcReg;
     115             : 
      116             :   // We reserved the last registers for this. Shift the resource register down
      117             :   // to the end of the registers that were actually used.
     118             :   //
     119             :   // FIXME: It might be safer to use a pseudoregister before replacement.
     120             : 
      121             :   // FIXME: We should be able to eliminate unused input registers. The only
      122             :   // ones we cannot eliminate are the resources required for scratch access.
      123             :   // For now we skip over user SGPRs and may leave unused holes.
     124             : 
     125             :   // We find the resource first because it has an alignment requirement.
     126             : 
     127         810 :   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
     128             :   ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
     129        1215 :   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
     130             : 
     131             :   // Skip the last N reserved elements because they should have already been
     132             :   // reserved for VCC etc.
     133        2821 :   for (MCPhysReg Reg : AllSGPR128s) {
     134             :     // Pick the first unallocated one. Make sure we don't clobber the other
     135             :     // reserved input we needed.
     136        1605 :     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
     137         397 :       MRI.replaceRegWith(ScratchRsrcReg, Reg);
     138             :       MFI->setScratchRSrcReg(Reg);
     139         397 :       return Reg;
     140             :     }
     141             :   }
     142             : 
     143             :   return ScratchRsrcReg;
     144             : }
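                      :
                      : // Note on the alignment requirement mentioned above: SGPR128 tuples start
                      : // on 4-register boundaries, so (getNumPreloadedSGPRs() + 3) / 4 rounds the
                      : // preloaded count up to whole tuples before slicing off the candidates.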
     145             : 
     146             : // Shift down registers reserved for the scratch wave offset and stack pointer
     147             : // SGPRs.
     148             : std::pair<unsigned, unsigned>
     149       15469 : SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
     150             :   const SISubtarget &ST,
     151             :   const SIInstrInfo *TII,
     152             :   const SIRegisterInfo *TRI,
     153             :   SIMachineFunctionInfo *MFI,
     154             :   MachineFunction &MF) const {
     155       15469 :   MachineRegisterInfo &MRI = MF.getRegInfo();
     156       15469 :   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
     157             : 
     158             :   // No replacement necessary.
     159       30938 :   if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
     160       15469 :       !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
     161             :     assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
     162       14596 :     return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
     163             :   }
     164             : 
     165         873 :   unsigned SPReg = MFI->getStackPtrOffsetReg();
     166         873 :   if (ST.hasSGPRInitBug())
     167             :     return std::make_pair(ScratchWaveOffsetReg, SPReg);
     168             : 
     169         723 :   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
     170             : 
     171             :   ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
     172         723 :   if (NumPreloaded > AllSGPRs.size())
     173             :     return std::make_pair(ScratchWaveOffsetReg, SPReg);
     174             : 
     175             :   AllSGPRs = AllSGPRs.slice(NumPreloaded);
     176             : 
      177             :   // We need to drop registers from the end of the list that we cannot use
      178             :   // for the scratch wave offset.
      179             :   // + 2 because s102 and s103 do not exist on VI.
      180             :   // + 2 for vcc
      181             :   // + 2 for xnack_mask
      182             :   // + 2 for flat_scratch
      183             :   // + 4 for registers reserved for the scratch resource register
      184             :   // + 1 for the register reserved for the scratch wave offset.  (By excluding
      185             :   //     this register from the list to consider, when it is being used for
      186             :   //     the scratch wave offset and there are no other free SGPRs, the value
      187             :   //     will stay in this register.)
     188             :   // + 1 if stack pointer is used.
     189             :   // ----
     190             :   //  13 (+1)
     191             :   unsigned ReservedRegCount = 13;
     192             : 
     193         723 :   if (AllSGPRs.size() < ReservedRegCount)
     194             :     return std::make_pair(ScratchWaveOffsetReg, SPReg);
     195             : 
     196             :   bool HandledScratchWaveOffsetReg =
     197         723 :     ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
     198             : 
     199       63659 :   for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
     200             :     // Pick the first unallocated SGPR. Be careful not to pick an alias of the
      201             :     // scratch descriptor, since we haven't added its uses yet.
     202       31911 :     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
     203       21455 :       if (!HandledScratchWaveOffsetReg) {
     204             :         HandledScratchWaveOffsetReg = true;
     205             : 
     206         443 :         MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
     207             :         MFI->setScratchWaveOffsetReg(Reg);
     208             :         ScratchWaveOffsetReg = Reg;
     209             :         break;
     210             :       }
     211             :     }
     212             :   }
     213             : 
     214             :   return std::make_pair(ScratchWaveOffsetReg, SPReg);
     215             : }
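                      :
                      : // Worked example with hypothetical numbers: for getMaxNumSGPRs(MF) == 104
                      : // and 16 preloaded SGPRs, the scan above considers s16..s90, i.e.
                      : // (104 - 16) - 13 == 75 candidates after dropping the 13 reserved at the
                      : // high end of the SGPR file.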
     216             : 
     217       15469 : void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
     218             :                                                 MachineBasicBlock &MBB) const {
     219             :   // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
     220             :   // specified.
     221       15469 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     222       15469 :   if (ST.debuggerEmitPrologue())
     223           4 :     emitDebuggerPrologue(MF, MBB);
     224             : 
     225             :   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
     226             : 
     227       15469 :   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     228             : 
     229             :   // If we only have SGPR spills, we won't actually be using scratch memory
     230             :   // since these spill to VGPRs.
     231             :   //
     232             :   // FIXME: We should be cleaning up these unused SGPR spill frame indices
     233             :   // somewhere.
     234             : 
     235             :   const SIInstrInfo *TII = ST.getInstrInfo();
     236             :   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
     237       15469 :   MachineRegisterInfo &MRI = MF.getRegInfo();
     238             : 
     239             :   // We need to do the replacement of the private segment buffer and wave offset
     240             :   // register even if there are no stack objects. There could be stores to undef
     241             :   // or a constant without an associated object.
     242             : 
     243             :   // FIXME: We still have implicit uses on SGPR spill instructions in case they
     244             :   // need to spill to vector memory. It's likely that will not happen, but at
     245             :   // this point it appears we need the setup. This part of the prolog should be
     246             :   // emitted after frame indices are eliminated.
     247             : 
     248       15469 :   if (MFI->hasFlatScratchInit())
     249         349 :     emitFlatScratchInit(ST, MF, MBB);
     250             : 
     251       15469 :   unsigned SPReg = MFI->getStackPtrOffsetReg();
     252       15469 :   if (SPReg != AMDGPU::SP_REG) {
     253             :     assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");
     254             : 
     255         333 :     DebugLoc DL;
     256         333 :     const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
     257         333 :     int64_t StackSize = FrameInfo.getStackSize();
     258             : 
     259         333 :     if (StackSize == 0) {
     260         942 :       BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
     261         314 :         .addReg(MFI->getScratchWaveOffsetReg());
     262             :     } else {
     263          57 :       BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
     264          19 :         .addReg(MFI->getScratchWaveOffsetReg())
     265          19 :         .addImm(StackSize * ST.getWavefrontSize());
     266             :     }
     267             :   }
     268             : 
     269             :   unsigned ScratchRsrcReg
     270       15469 :     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
     271             : 
     272             :   unsigned ScratchWaveOffsetReg;
     273             :   std::tie(ScratchWaveOffsetReg, SPReg)
     274       30938 :     = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
     275             : 
     276             :   // It's possible to have uses of only ScratchWaveOffsetReg without
     277             :   // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
      278             :   // but not the other way around.
     279       15469 :   if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
     280             :     assert(ScratchRsrcReg == AMDGPU::NoRegister);
     281       14596 :     return;
     282             :   }
     283             : 
     284             :   // We need to insert initialization of the scratch resource descriptor.
     285             :   unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
     286             :     AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
     287             : 
     288             :   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
     289         873 :   if (ST.isAmdCodeObjectV2(MF)) {
     290             :     PreloadedPrivateBufferReg = MFI->getPreloadedReg(
     291             :       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
     292             :   }
     293             : 
     294         873 :   bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
     295        1725 :   bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
     296         852 :                          MRI.isPhysRegUsed(ScratchRsrcReg);
     297             : 
     298             :   // We added live-ins during argument lowering, but since they were not used
     299             :   // they were deleted. We're adding the uses now, so add them back.
     300         873 :   if (OffsetRegUsed) {
     301             :     assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
     302             :            "scratch wave offset input is required");
     303             :     MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
     304         873 :     MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
     305             :   }
     306             : 
     307         873 :   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
     308             :     assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
     309             :     MRI.addLiveIn(PreloadedPrivateBufferReg);
     310         381 :     MBB.addLiveIn(PreloadedPrivateBufferReg);
     311             :   }
     312             : 
      313             :   // Make the selected registers live throughout the function.
     314        1937 :   for (MachineBasicBlock &OtherBB : MF) {
     315        1064 :     if (&OtherBB == &MBB)
     316         873 :       continue;
     317             : 
     318         191 :     if (OffsetRegUsed)
     319         191 :       OtherBB.addLiveIn(ScratchWaveOffsetReg);
     320             : 
     321         191 :     if (ResourceRegUsed)
     322         191 :       OtherBB.addLiveIn(ScratchRsrcReg);
     323             :   }
     324             : 
     325         873 :   DebugLoc DL;
     326         873 :   MachineBasicBlock::iterator I = MBB.begin();
     327             : 
     328             :   // If we reserved the original input registers, we don't need to copy to the
     329             :   // reserved registers.
     330             : 
     331             :   bool CopyBuffer = ResourceRegUsed &&
     332         381 :     PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
     333        1254 :     ST.isAmdCodeObjectV2(MF) &&
     334             :     ScratchRsrcReg != PreloadedPrivateBufferReg;
     335             : 
      336             :   // We need to be careful about the copy order here to avoid overwriting one
      337             :   // of the input registers before it has been copied to its final
      338             :   // destination. Usually the offset should be copied first.
     339         873 :   bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
     340             :                                               ScratchWaveOffsetReg);
     341         873 :   if (CopyBuffer && CopyBufferFirst) {
     342           0 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
     343           0 :       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
     344             :   }
     345             : 
     346         873 :   if (OffsetRegUsed &&
     347             :       PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
     348        1479 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
     349         493 :       .addReg(PreloadedScratchWaveOffsetReg,
     350         493 :               MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
     351             :   }
     352             : 
     353         873 :   if (CopyBuffer && !CopyBufferFirst) {
     354          93 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
     355          31 :       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
     356             :   }
     357             : 
     358         873 :   if (ResourceRegUsed)
     359         852 :     emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
     360             :         PreloadedPrivateBufferReg, ScratchRsrcReg);
     361             : }
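                      :
                      : // Schematic of a typical entry-function prologue assembled above; the
                      : // register numbers are illustrative, not fixed by this code:
                      : //
                      : //   s_add_u32  flat_scratch_lo, s6, s9   ; emitFlatScratchInit
                      : //   s_addc_u32 flat_scratch_hi, s7, 0
                      : //   s_mov_b32  s32, s9                   ; SPReg = scratch wave offset
                      : //   ...                                  ; emitEntryFunctionScratchSetup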
     362             : 
     363             : // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
     364         852 : void SIFrameLowering::emitEntryFunctionScratchSetup(const SISubtarget &ST,
     365             :       MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
     366             :       MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
     367             :       unsigned ScratchRsrcReg) const {
     368             : 
     369             :   const SIInstrInfo *TII = ST.getInstrInfo();
     370             :   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
     371         852 :   DebugLoc DL;
     372             : 
     373         852 :   if (ST.isAmdPalOS()) {
     374             :     // The pointer to the GIT is formed from the offset passed in and either
      375             :     // the amdgpu-git-ptr-high function attribute or the top part of the PC.
     376           2 :     unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
     377           2 :     unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
     378           2 :     unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
     379             : 
     380           2 :     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
     381             : 
     382           2 :     if (MFI->getGITPtrHigh() != 0xffffffff) {
     383           2 :       BuildMI(MBB, I, DL, SMovB32, RsrcHi)
     384           1 :         .addImm(MFI->getGITPtrHigh())
     385           1 :         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     386             :     } else {
     387             :       const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
     388           1 :       BuildMI(MBB, I, DL, GetPC64, Rsrc01);
     389             :     }
     390           4 :     BuildMI(MBB, I, DL, SMovB32, RsrcLo)
     391           2 :       .addReg(AMDGPU::SGPR0) // Low address passed in
     392           2 :       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     393             : 
     394             :     // We now have the GIT ptr - now get the scratch descriptor from the entry
     395             :     // at offset 0.
     396             :     PointerType *PtrTy =
     397           2 :       PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
     398           2 :                        AMDGPUAS::CONSTANT_ADDRESS);
     399           2 :     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
     400           2 :     const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
     401           2 :     auto MMO = MF.getMachineMemOperand(PtrInfo,
     402             :                                        MachineMemOperand::MOLoad |
     403             :                                        MachineMemOperand::MOInvariant |
     404             :                                        MachineMemOperand::MODereferenceable,
     405           2 :                                        0, 0);
     406           4 :     BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
     407           2 :       .addReg(Rsrc01)
     408             :       .addImm(0) // offset
     409             :       .addImm(0) // glc
     410           2 :       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
     411             :       .addMemOperand(MMO);
     412             :     return;
     413             :   }
     414             :   if (ST.isMesaGfxShader(MF)
     415         848 :       || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
     416             :     assert(!ST.isAmdCodeObjectV2(MF));
     417         469 :     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
     418             : 
     419         469 :     unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
     420         469 :     unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
     421             : 
     422             :     // Use relocations to get the pointer, and setup the other bits manually.
     423         469 :     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
     424             : 
     425         469 :     if (MFI->hasImplicitBufferPtr()) {
     426           2 :       unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
     427             : 
     428           4 :       if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
     429           1 :         const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
     430             : 
     431           2 :         BuildMI(MBB, I, DL, Mov64, Rsrc01)
     432           1 :           .addReg(MFI->getImplicitBufferPtrUserSGPR())
     433           1 :           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     434             :       } else {
     435           1 :         const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
     436             : 
     437             :         PointerType *PtrTy =
     438           1 :           PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
     439           1 :                            AMDGPUAS::CONSTANT_ADDRESS);
     440           1 :         MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
     441           1 :         auto MMO = MF.getMachineMemOperand(PtrInfo,
     442             :                                            MachineMemOperand::MOLoad |
     443             :                                            MachineMemOperand::MOInvariant |
     444             :                                            MachineMemOperand::MODereferenceable,
     445           1 :                                            0, 0);
     446           2 :         BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
     447           1 :           .addReg(MFI->getImplicitBufferPtrUserSGPR())
     448             :           .addImm(0) // offset
     449             :           .addImm(0) // glc
     450             :           .addMemOperand(MMO)
     451           1 :           .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     452             :       }
     453             :     } else {
     454         467 :       unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
     455         467 :       unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
     456             : 
     457         934 :       BuildMI(MBB, I, DL, SMovB32, Rsrc0)
     458             :         .addExternalSymbol("SCRATCH_RSRC_DWORD0")
     459         467 :         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     460             : 
     461         934 :       BuildMI(MBB, I, DL, SMovB32, Rsrc1)
     462             :         .addExternalSymbol("SCRATCH_RSRC_DWORD1")
     463         467 :         .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     464             : 
     465             :     }
     466             : 
     467         938 :     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
     468         469 :       .addImm(Rsrc23 & 0xffffffff)
     469         469 :       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     470             : 
     471         938 :     BuildMI(MBB, I, DL, SMovB32, Rsrc3)
     472         469 :       .addImm(Rsrc23 >> 32)
     473         469 :       .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
     474             :   }
     475             : }
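                      :
                      : // Schematic of the AMDPAL path above (illustrative): the GIT pointer is
                      : // assembled in rsrc[0:1], then the descriptor is loaded from offset 0:
                      : //
                      : //   s_getpc_b64    rsrc[0:1]             ; or s_mov_b32 of git-ptr-high
                      : //   s_mov_b32      rsrc[0], s0           ; low GIT address passed in
                      : //   s_load_dwordx4 rsrc[0:3], rsrc[0:1], 0x0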
     476             : 
     477       16642 : void SIFrameLowering::emitPrologue(MachineFunction &MF,
     478             :                                    MachineBasicBlock &MBB) const {
     479       16642 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
     480       16642 :   if (FuncInfo->isEntryFunction()) {
     481       15469 :     emitEntryFunctionPrologue(MF, MBB);
     482       15469 :     return;
     483             :   }
     484             : 
     485        1173 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
     486        1173 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     487             :   const SIInstrInfo *TII = ST.getInstrInfo();
     488             : 
     489        1173 :   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
     490        1173 :   unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
     491             : 
     492        1173 :   MachineBasicBlock::iterator MBBI = MBB.begin();
     493        1173 :   DebugLoc DL;
     494             : 
     495        1173 :   bool NeedFP = hasFP(MF);
     496        1173 :   if (NeedFP) {
     497             :     // If we need a base pointer, set it up here. It's whatever the value of
     498             :     // the stack pointer is at this point. Any variable size objects will be
     499             :     // allocated after this, so we can still use the base pointer to reference
     500             :     // locals.
     501         711 :     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
     502         237 :       .addReg(StackPtrReg)
     503             :       .setMIFlag(MachineInstr::FrameSetup);
     504             :   }
     505             : 
     506        1173 :   uint32_t NumBytes = MFI.getStackSize();
     507        1173 :   if (NumBytes != 0 && hasSP(MF)) {
     508         231 :     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
     509          77 :       .addReg(StackPtrReg)
     510          77 :       .addImm(NumBytes * ST.getWavefrontSize())
     511             :       .setMIFlag(MachineInstr::FrameSetup);
     512             :   }
     513             : 
     514          83 :   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
     515        1256 :          : FuncInfo->getSGPRSpillVGPRs()) {
     516          83 :     if (!Reg.FI.hasValue())
     517           6 :       continue;
     518          77 :     TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
     519             :                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
     520             :                              &TII->getRegisterInfo());
     521             :   }
     522             : }
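                      :
                      : // Worked example with a hypothetical frame: scratch offsets are per-wave,
                      : // so a per-lane frame of MFI.getStackSize() == 16 bytes advances SP by
                      : // 16 * ST.getWavefrontSize() == 16 * 64 == 1024 bytes on a wave64 target.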
     523             : 
     524       16626 : void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     525             :                                    MachineBasicBlock &MBB) const {
     526       16626 :   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
     527       16626 :   if (FuncInfo->isEntryFunction())
     528       15461 :     return;
     529             : 
     530        1165 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     531             :   const SIInstrInfo *TII = ST.getInstrInfo();
     532        1165 :   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
     533             : 
     534          83 :   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
     535        1248 :          : FuncInfo->getSGPRSpillVGPRs()) {
     536          83 :     if (!Reg.FI.hasValue())
     537           6 :       continue;
     538          77 :     TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
     539             :                               Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
     540             :                               &TII->getRegisterInfo());
     541             :   }
     542             : 
     543        1165 :   unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
     544        1165 :   if (StackPtrReg == AMDGPU::NoRegister)
     545             :     return;
     546             : 
     547        1165 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
     548        1165 :   uint32_t NumBytes = MFI.getStackSize();
     549             : 
     550        1165 :   DebugLoc DL;
     551             : 
      552             :   // FIXME: Clarify the distinction between having no SP set and having an SP.
      553             :   // For callee functions, it's really whether SP needs to be accurate or not.
     554             : 
     555        1165 :   if (NumBytes != 0 && hasSP(MF)) {
     556         231 :     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
     557          77 :       .addReg(StackPtrReg)
     558          77 :       .addImm(NumBytes * ST.getWavefrontSize())
     559             :       .setMIFlag(MachineInstr::FrameDestroy);
     560             :   }
     561             : }
     562             : 
     563             : static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
     564        1954 :   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
     565        1954 :        I != E; ++I) {
     566        1864 :     if (!MFI.isDeadObjectIndex(I))
     567             :       return false;
     568             :   }
     569             : 
     570             :   return true;
     571             : }
     572             : 
     573          22 : int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
     574             :                                             unsigned &FrameReg) const {
     575          22 :   const SIRegisterInfo *RI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
     576             : 
     577          22 :   FrameReg = RI->getFrameRegister(MF);
     578          44 :   return MF.getFrameInfo().getObjectOffset(FI);
     579             : }
     580             : 
     581       16642 : void SIFrameLowering::processFunctionBeforeFrameFinalized(
     582             :   MachineFunction &MF,
     583             :   RegScavenger *RS) const {
     584       16642 :   MachineFrameInfo &MFI = MF.getFrameInfo();
     585             : 
     586       16642 :   if (!MFI.hasStackObjects())
     587             :     return;
     588             : 
     589         693 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     590             :   const SIInstrInfo *TII = ST.getInstrInfo();
     591             :   const SIRegisterInfo &TRI = TII->getRegisterInfo();
     592         693 :   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
     593             :   bool AllSGPRSpilledToVGPRs = false;
     594             : 
     595         693 :   if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
     596             :     AllSGPRSpilledToVGPRs = true;
     597             : 
     598             :     // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
     599             :     // are spilled to VGPRs, in which case we can eliminate the stack usage.
     600             :     //
     601             :     // XXX - This operates under the assumption that only other SGPR spills are
     602             :     // users of the frame index. I'm not 100% sure this is correct. The
      603             :     // StackColoring pass has a comment saying a future improvement would be
      604             :     // merging allocas with spill slots, but for now, according to
      605             :     // MachineFrameInfo, an isSpillSlot object can't alias any other object.
     606         402 :     for (MachineBasicBlock &MBB : MF) {
     607             :       MachineBasicBlock::iterator Next;
     608        5638 :       for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
     609             :         MachineInstr &MI = *I;
     610             :         Next = std::next(I);
     611             : 
     612        5363 :         if (TII->isSGPRSpill(MI)) {
     613        1118 :           int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
     614        1118 :           if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
     615        1114 :             bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
     616             :             (void)Spilled;
     617             :             assert(Spilled && "failed to spill SGPR to VGPR when allocated");
     618             :           } else
     619             :             AllSGPRSpilledToVGPRs = false;
     620             :         }
     621             :       }
     622             :     }
     623             : 
     624         127 :     FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
     625             :   }
     626             : 
     627             :   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
      628             :   // but currently hasNonSpillStackObjects is set only from source
      629             :   // allocas. Stack temps produced during legalization are not counted.
     630        1016 :   if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
     631         800 :       !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
     632             :     assert(RS && "RegScavenger required if spilling");
     633             : 
     634             :     // We force this to be at offset 0 so no user object ever has 0 as an
     635             :     // address, so we may use 0 as an invalid pointer value. This is because
     636             :     // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
     637             :     // is required to be address space 0, we are forced to accept this for
     638             :     // now. Ideally we could have the stack in another address space with 0 as a
     639             :     // valid pointer, and -1 as the null value.
     640             :     //
      641             :     // This also wastes space when user stack objects require more than 4-byte
      642             :     // alignment.
     643             :     //
     644             :     // The main cost here is losing the offset for addressing modes. However
     645             :     // this also ensures we shouldn't need a register for the offset when
     646             :     // emergency scavenging.
     647         651 :     int ScavengeFI = MFI.CreateFixedObject(
     648         651 :       TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
     649             :     RS->addScavengingFrameIndex(ScavengeFI);
     650             :   }
     651             : }
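                      :
                      : // Layout consequence of the fixed object above (illustrative): the 4-byte
                      : // emergency spill slot occupies bytes [0, 4), so every user stack object
                      : // is placed at a nonzero offset and 0 remains usable as the invalid
                      : // pointer value described in the comment.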
     652             : 
     653       16642 : void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
     654             :                                            RegScavenger *RS) const {
     655       16642 :   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
     656       16642 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     657             : 
     658             :   // The SP is specifically managed and we don't want extra spills of it.
     659       16642 :   SavedRegs.reset(MFI->getStackPtrOffsetReg());
     660       16642 : }
     661             : 
     662         874 : MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
     663             :   MachineFunction &MF,
     664             :   MachineBasicBlock &MBB,
     665             :   MachineBasicBlock::iterator I) const {
     666         874 :   int64_t Amount = I->getOperand(0).getImm();
     667         874 :   if (Amount == 0)
     668         874 :     return MBB.erase(I);
     669             : 
     670           0 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     671             :   const SIInstrInfo *TII = ST.getInstrInfo();
     672             :   const DebugLoc &DL = I->getDebugLoc();
     673           0 :   unsigned Opc = I->getOpcode();
     674           0 :   bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
     675           0 :   uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
     676             : 
     677           0 :   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
     678           0 :   if (!TFI->hasReservedCallFrame(MF)) {
     679           0 :     unsigned Align = getStackAlignment();
     680             : 
     681           0 :     Amount = alignTo(Amount, Align);
     682             :     assert(isUInt<32>(Amount) && "exceeded stack address space size");
     683           0 :     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     684           0 :     unsigned SPReg = MFI->getStackPtrOffsetReg();
     685             : 
     686           0 :     unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
     687           0 :     BuildMI(MBB, I, DL, TII->get(Op), SPReg)
     688           0 :       .addReg(SPReg)
     689           0 :       .addImm(Amount * ST.getWavefrontSize());
     690           0 :   } else if (CalleePopAmount != 0) {
     691           0 :     llvm_unreachable("is this used?");
     692             :   }
     693             : 
     694           0 :   return MBB.erase(I);
     695             : }
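                      :
                      : // Worked example with hypothetical values: a 24-byte outgoing-argument
                      : // area, with getStackAlignment() == 4, keeps Amount == 24, and the emitted
                      : // s_add_u32/s_sub_u32 moves SP by 24 * 64 == 1536 bytes on a wave64
                      : // target.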
     696             : 
     697           4 : void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
     698             :                                            MachineBasicBlock &MBB) const {
     699           4 :   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
     700             :   const SIInstrInfo *TII = ST.getInstrInfo();
     701             :   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
     702           4 :   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
     703             : 
     704           4 :   MachineBasicBlock::iterator I = MBB.begin();
     705           4 :   DebugLoc DL;
     706             : 
     707             :   // For each dimension:
     708          28 :   for (unsigned i = 0; i < 3; ++i) {
     709             :     // Get work group ID SGPR, and make it live-in again.
     710             :     unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i);
     711          12 :     MF.getRegInfo().addLiveIn(WorkGroupIDSGPR);
     712          12 :     MBB.addLiveIn(WorkGroupIDSGPR);
     713             : 
     714             :     // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in
     715             :     // order to spill it to scratch.
     716             :     unsigned WorkGroupIDVGPR =
     717          12 :       MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     718          36 :     BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR)
     719          12 :       .addReg(WorkGroupIDSGPR);
     720             : 
     721             :     // Spill work group ID.
     722             :     int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i);
     723          12 :     TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false,
     724             :       WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
     725             : 
     726             :     // Get work item ID VGPR, and make it live-in again.
     727             :     unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i);
     728          12 :     MF.getRegInfo().addLiveIn(WorkItemIDVGPR);
     729          12 :     MBB.addLiveIn(WorkItemIDVGPR);
     730             : 
     731             :     // Spill work item ID.
     732             :     int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i);
     733          12 :     TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false,
     734             :       WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI);
     735             :   }
     736           4 : }
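                      :
                      : // Per dimension i, the loop above emits, schematically (virtual register
                      : // name illustrative):
                      : //
                      : //   %vgpr = V_MOV_B32_e32 <work group ID SGPR i>
                      : //   store %vgpr                 -> work group ID stack object i
                      : //   store <work item ID VGPR i> -> work item ID stack object i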
     737             : 
     738       19196 : bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
     739             :   // All stack operations are relative to the frame offset SGPR.
     740             :   // TODO: Still want to eliminate sometimes.
     741       19196 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
     742             : 
     743             :   // XXX - Is this only called after frame is finalized? Should be able to check
     744             :   // frame size.
     745       20406 :   return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
     746             : }
     747             : 
     748         474 : bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
     749             :   // All stack operations are relative to the frame offset SGPR.
     750         474 :   const MachineFrameInfo &MFI = MF.getFrameInfo();
     751         474 :   return MFI.hasCalls() || MFI.hasVarSizedObjects();
     752             : }

Generated by: LCOV version 1.13