//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
                                         const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
                      ST.getMaxNumSGPRs(MF) / 4);
}

static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
                                       const MachineFunction &MF) {
  return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
                      ST.getMaxNumSGPRs(MF));
}
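
// Note on the helpers above: both ranges start at the bottom of the SGPR
// file, so the replacement scans below walk upward and shift the reserved
// high registers down next to the SGPRs that are actually in use.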

void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
                                          MachineFunction &MF,
                                          MachineBasicBlock &MBB) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  unsigned FlatScratchInitReg
    = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}
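
// For reference, on subtargets where flat_scratch is not a 64-bit pointer
// the sequence built above is roughly (a sketch, not verified output):
//   s_mov_b32  flat_scratch_lo, <init.hi>           ; scratch size in bytes
//   s_add_u32  <init.lo>, <init.lo>, <wave_offset>  ; this wave's byte offset
//   s_lshr_b32 flat_scratch_hi, <init.lo>, 8        ; offset in 256-byte units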

unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();

  // We need to insert initialization of the scratch resource descriptor.
  unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
  if (ScratchRsrcReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchRsrcReg))
    return AMDGPU::NoRegister;

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // We find the resource first because it has an alignment requirement.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
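
// Worked example for the rounding above: with 6 preloaded SGPRs,
// NumPreloaded = (6 + 3) / 4 = 2, so the scan starts at the third 128-bit
// tuple, s[8:11], the first one that cannot overlap a preloaded input.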

// Shift down registers reserved for the scratch wave offset and stack pointer
// SGPRs.
std::pair<unsigned, unsigned>
SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
  const GCNSubtarget &ST,
  const SIInstrInfo *TII,
  const SIRegisterInfo *TRI,
  SIMachineFunctionInfo *MFI,
  MachineFunction &MF) const {
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();

  // No replacement necessary.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
      !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) {
    assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG);
    return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister);
  }

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  if (ST.hasSGPRInitBug())
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();

  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
  if (NumPreloaded > AllSGPRs.size())
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  AllSGPRs = AllSGPRs.slice(NumPreloaded);

  // We need to drop registers from the end of the list that we cannot use
  // for the scratch wave offset.
  // + 2 because s102 and s103 do not exist on VI.
  // + 2 for vcc
  // + 2 for xnack_mask
  // + 2 for flat_scratch
  // + 4 for registers reserved for scratch resource register
  // + 1 for register reserved for scratch wave offset. (By excluding this
  //     register from the list to consider, it means that when this
  //     register is being used for the scratch wave offset and there
  //     are no other free SGPRs, then the value will stay in this register.)
  // + 1 if stack pointer is used.
  // ----
  //  13 (+1)
  unsigned ReservedRegCount = 13;

  if (AllSGPRs.size() < ReservedRegCount)
    return std::make_pair(ScratchWaveOffsetReg, SPReg);

  bool HandledScratchWaveOffsetReg =
    ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);

  for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
    // Pick the first unallocated SGPR. Be careful not to pick an alias of the
    // scratch descriptor, since we haven't added its uses yet.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
      if (!HandledScratchWaveOffsetReg) {
        HandledScratchWaveOffsetReg = true;

        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
        MFI->setScratchWaveOffsetReg(Reg);
        ScratchWaveOffsetReg = Reg;
        break;
      }
    }
  }

  return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
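
// The budget above sums to 2 + 2 + 2 + 2 + 4 + 1 = 13 registers, which is
// why drop_back(ReservedRegCount) keeps the scan away from VCC, XNACK_MASK,
// FLAT_SCRATCH and the reserved scratch registers at the top of the file.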

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If we only have SGPR spills, we won't actually be using scratch memory
  // since these spill to VGPRs.
  //
  // FIXME: We should be cleaning up these unused SGPR spill frame indices
  // somewhere.

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  // We need to do the replacement of the private segment buffer and wave offset
  // register even if there are no stack objects. There could be stores to undef
  // or a constant without an associated object.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  if (MFI->hasFlatScratchInit())
    emitFlatScratchInit(ST, MF, MBB);

  unsigned SPReg = MFI->getStackPtrOffsetReg();
  if (SPReg != AMDGPU::SP_REG) {
    assert(MRI.isReserved(SPReg) && "SPReg used but not reserved");

    DebugLoc DL;
    const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
    int64_t StackSize = FrameInfo.getStackSize();

    if (StackSize == 0) {
      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg());
    } else {
      BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
        .addReg(MFI->getScratchWaveOffsetReg())
        .addImm(StackSize * ST.getWavefrontSize());
    }
  }

  unsigned ScratchRsrcReg
    = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);

  unsigned ScratchWaveOffsetReg;
  std::tie(ScratchWaveOffsetReg, SPReg)
    = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);

  // It's possible to have uses of only ScratchWaveOffsetReg without
  // ScratchRsrcReg if it's only used for the initialization of flat_scratch,
  // but the inverse is not true.
  if (ScratchWaveOffsetReg == AMDGPU::NoRegister) {
    assert(ScratchRsrcReg == AMDGPU::NoRegister);
    return;
  }

  // We need to insert initialization of the scratch resource descriptor.
  unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
    AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedPrivateBufferReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
  }

  bool OffsetRegUsed = MRI.isPhysRegUsed(ScratchWaveOffsetReg);
  bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
                         MRI.isPhysRegUsed(ScratchRsrcReg);

  // We added live-ins during argument lowering, but since they were not used
  // they were deleted. We're adding the uses now, so add them back.
  if (OffsetRegUsed) {
    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
           "scratch wave offset input is required");
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
    assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
    MRI.addLiveIn(PreloadedPrivateBufferReg);
    MBB.addLiveIn(PreloadedPrivateBufferReg);
  }

  // Make the register selected live throughout the function.
  for (MachineBasicBlock &OtherBB : MF) {
    if (&OtherBB == &MBB)
      continue;

    if (OffsetRegUsed)
      OtherBB.addLiveIn(ScratchWaveOffsetReg);

    if (ResourceRegUsed)
      OtherBB.addLiveIn(ScratchRsrcReg);
  }

  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // If we reserved the original input registers, we don't need to copy to the
  // reserved registers.

  bool CopyBuffer = ResourceRegUsed &&
    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
    ST.isAmdHsaOrMesa(F) &&
    ScratchRsrcReg != PreloadedPrivateBufferReg;

  // This needs to be careful of the copying order to avoid overwriting one of
  // the input registers before it's been copied to its final
  // destination. Usually the offset should be copied first.
  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
                                              ScratchWaveOffsetReg);
  if (CopyBuffer && CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (OffsetRegUsed &&
      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
      .addReg(PreloadedScratchWaveOffsetReg,
              MRI.isPhysRegUsed(ScratchWaveOffsetReg) ? 0 : RegState::Kill);
  }

  if (CopyBuffer && !CopyBufferFirst) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
  }

  if (ResourceRegUsed)
    emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
                                  PreloadedPrivateBufferReg, ScratchRsrcReg);
}
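
// Note on the SP setup above: frame sizes are computed per lane, but SPReg
// holds a byte offset into the wave's scratch allocation, so StackSize is
// scaled by the wavefront size (e.g. a 16-byte frame becomes 16 * 64 = 1024
// bytes of per-wave offset on a wave64 target).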

// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
      MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
      MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
      unsigned ScratchRsrcReg) const {

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const Function &Fn = MF.getFunction();
  DebugLoc DL;

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
    if (ST.hasMergedShaders()) {
      switch (MF.getFunction().getCallingConv()) {
        case CallingConv::AMDGPU_HS:
        case CallingConv::AMDGPU_GS:
          // Low GIT address is passed in s8 rather than s0 for an LS+HS or
          // ES+GS merged shader on gfx9+.
          GitPtrLo = AMDGPU::SGPR8;
          break;
        default:
          break;
      }
    }
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MF.front().addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    PointerType *PtrTy =
      PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                       AMDGPUAS::CONSTANT_ADDRESS);
    MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                       MachineMemOperand::MOInvariant |
                                       MachineMemOperand::MODereferenceable,
                                       16, 4);
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
    return;
  }
  if (ST.isMesaGfxShader(Fn)
      || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_HS) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        PointerType *PtrTy =
          PointerType::get(Type::getInt64Ty(MF.getFunction().getContext()),
                           AMDGPUAS::CONSTANT_ADDRESS);
        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
        auto MMO = MF.getMachineMemOperand(PtrInfo,
                                           MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                           8, 4);
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      }
    } else {
      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  }
}
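
// The four dwords written above form a buffer resource descriptor (V#):
// words 0-1 hold the scratch base address, and words 2-3 hold the
// NUM_RECORDS and format/flag fields supplied by getScratchRsrcWords23().
// This is a rough description; see the hardware docs for the exact layout.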

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
  MachineFunction *MF = MBB.getParent();

  const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
  const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
  LivePhysRegs LiveRegs(TRI);
  LiveRegs.addLiveIns(MBB);

  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  MachineRegisterInfo &MRI = MF->getRegInfo();

  for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return AMDGPU::NoRegister;
}
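
// Callers assume this search succeeds when realignment is required:
// emitPrologue below asserts the result is not AMDGPU::NoRegister before
// using it as the temporary for the align-up computation.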

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  // XXX - Is this the right predicate?

  bool NeedFP = hasFP(MF);
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  const bool NeedsRealignment = TRI.needsStackRealignment(MF);

  if (NeedsRealignment) {
    assert(NeedFP);
    const unsigned Alignment = MFI.getMaxAlignment();

    RoundedSize += Alignment;

    unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
    assert(ScratchSPReg != AMDGPU::NoRegister);

    // s_add_u32 tmp_reg, s32, NumBytes
    // s_and_b32 s32, tmp_reg, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
      .addReg(StackPtrReg)
      .addImm((Alignment - 1) * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
      .addReg(ScratchSPReg, RegState::Kill)
      .addImm(-Alignment * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if (NeedFP) {
    // If we need a base pointer, set it up here. It's whatever the value of
    // the stack pointer is at this point. Any variable size objects will be
    // allocated after this, so we can still use the base pointer to reference
    // locals.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
      .addReg(StackPtrReg)
      .setMIFlag(MachineInstr::FrameSetup);
  }

  if (RoundedSize != 0 && hasSP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameSetup);
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;
    TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
                             Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                             &TII->getRegisterInfo());
  }
}
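
// Realignment example for the s_add_u32/s_and_b32 pair above, assuming a
// wave64 target and a 16-byte per-lane max alignment: the scratch copy of SP
// is advanced by (16 - 1) * 64 = 960 and masked with -16 * 64 = -1024,
// rounding the frame pointer up to the next 1024-byte per-wave boundary.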

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
         : FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;
    TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
                              Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
                              &TII->getRegisterInfo());
  }

  unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  if (StackPtrReg == AMDGPU::NoRegister)
    return;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();

  DebugLoc DL;

  // FIXME: Clarify distinction between no set SP and SP. For callee functions,
  // it's really whether we need SP to be accurate or not.

  if (NumBytes != 0 && hasSP(MF)) {
    uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
      NumBytes + MFI.getMaxAlignment() : NumBytes;

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize());
  }
}
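
// The epilogue mirrors the prologue: the same RoundedSize, including the
// realignment pad when the stack was realigned, is subtracted so the SP
// returns to its value on entry.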

static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            unsigned &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  if (!MFI.hasStackObjects())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  bool AllSGPRSpilledToVGPRs = false;

  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
    AllSGPRSpilledToVGPRs = true;

    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
    // are spilled to VGPRs, in which case we can eliminate the stack usage.
    //
    // XXX - This operates under the assumption that only other SGPR spills are
    // users of the frame index. I'm not 100% sure this is correct. The
    // StackColoring pass has a comment saying a future improvement would be to
    // merge allocas with spill slots, but for now according to
    // MachineFrameInfo isSpillSlot can't alias any other object.
    for (MachineBasicBlock &MBB : MF) {
      MachineBasicBlock::iterator Next;
      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
        MachineInstr &MI = *I;
        Next = std::next(I);

        if (TII->isSGPRSpill(MI)) {
          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
            (void)Spilled;
            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
          } else
            AllSGPRSpilledToVGPRs = false;
        }
      }
    }
  }

  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // We force this to be at offset 0 so no user object ever has 0 as an
    // address, so we may use 0 as an invalid pointer value. This is because
    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
    // is required to be address space 0, we are forced to accept this for
    // now. Ideally we could have the stack in another address space with 0 as a
    // valid pointer, and -1 as the null value.
    //
    // This will also waste additional space when user stack objects require > 4
    // byte alignment.
    //
    // The main cost here is losing the offset for addressing modes. However
    // this also ensures we shouldn't need a register for the offset when
    // emergency scavenging.
    int ScavengeFI = MFI.CreateFixedObject(
      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
    RS->addScavengingFrameIndex(ScavengeFI);
  }
}

void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedRegs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
  if (!TFI->hasReservedCallFrame(MF)) {
    unsigned Align = getStackAlignment();

    Amount = alignTo(Amount, Align);
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    unsigned SPReg = MFI->getStackPtrOffsetReg();

    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}
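
// As in the prologue, the immediate is scaled by the wavefront size: the
// call frame Amount is a per-lane byte count, while SPReg tracks a per-wave
// byte offset into scratch.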

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  // All stack operations are relative to the frame offset SGPR.
  // TODO: Still want to eliminate sometimes.
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // XXX - Is this only called after frame is finalized? Should be able to
  // check frame size.
  return MFI.hasStackObjects() && !allStackObjectsAreDead(MFI);
}

bool SIFrameLowering::hasSP(const MachineFunction &MF) const {
  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  // All stack operations are relative to the frame offset SGPR.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasCalls() || MFI.hasVarSizedObjects() ||
         TRI->needsStackRealignment(MF);
}