//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(const MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LiveRegUnits &LiveUnits,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}

/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      FrameInfo.RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
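
// In short, the save location is chosen in this order: a free scratch SGPR
// (cheapest), then a free VGPR lane, and only then a real scratch-memory slot.
// Illustrative mapping of the resulting save kinds (a sketch, not emitted
// verbatim):
//   COPY_TO_SCRATCH_SGPR -> s_mov_b32 s_scratch, s_saved
//   SPILL_TO_VGPR_LANE   -> v_writelane_b32 vN, s_saved, lane
//   SPILL_TO_MEM         -> store of a temporary VGPR to the stack slot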

// We need to specially emit stack operations here because a different frame
// register may be in use here than the one getFrameRegister would return for
// the rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}
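
// Both helpers above emit exactly one 32-bit scratch access per call. Callers
// that spill a multi-dword register invoke them once per 32-bit sub-register
// and advance DwordOff by 4 each time (see PrologEpilogSGPRSpillBuilder below).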

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}
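
// Note for the epilog case above: liveness is seeded from the block live-outs
// and stepped backward across the instruction at MBBI, so the resulting set
// reflects what is live just before the restore sequence is inserted.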

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};
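
// Typical usage, mirroring emitCSRSpillStores/emitCSRSpillRestores below
// (sketch only):
//   PrologEpilogSGPRSpillBuilder SB(Reg, Info, MBB, MBBI, DL, TII, TRI,
//                                   LiveUnits, FrameReg);
//   SB.save();    // prologue; a matching builder calls SB.restore() in the
//                 // epilogue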

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
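
// Summary of the cases above: GFX10+ programs flat scratch via s_setreg_b32 of
// the FLAT_SCR_LO/HI hardware registers, GFX9 adds the wave offset directly
// into the architectural FLAT_SCR register pair, and pre-GFX9 targets instead
// copy the size and convert the byte offset into 256-byte units.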

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      MRI.reserveReg(Reg, TRI);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}
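
// Example: on a wave64 target using buffer scratch this returns 64, so a
// per-lane frame size of 16 bytes becomes an SP increment of 1024 -- SP and FP
// hold per-wave swizzled byte offsets, not per-lane values.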

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found overlaps the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }

    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}
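
// Ordering note for the prologue above: the scratch RSRC SGPRs are fixed
// first, the wave offset is copied out of their way if it overlaps them, FP/SP
// are materialized, and only then are flat scratch and the RSRC descriptor
// initialized.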

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
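  // The emitted pair is, roughly (illustrative only):
  //   s_add_u32  rsrc_sub0, rsrc_sub0, scratch_wave_offset ; may produce carry
  //   s_addc_u32 rsrc_sub1, rsrc_sub1, 0                   ; carry into high
  //                                                        ; half, flags kept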
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
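  // With the all-ones immediate used below, s_or_saveexec enables every lane,
  // while s_xor_saveexec enables exactly the lanes that were inactive; both
  // first save the original exec mask into ScratchExecCopy.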
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip it if FP is saved to a scratch SGPR; that save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip it if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value into a temporary register. The frame pointer should
    // be overwritten only at the end, when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
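    // For example, with MaxAlign = 16 bytes on a wave64 buffer-scratch target
    // (scale factor 64), this adds (16 - 1) * 64 = 960 and then masks with
    // -1024, rounding FP up to the next 16-byte per-lane boundary
    // (illustrative values only).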
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers
    // are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
            .addReg(StackPtrReg)
            .addImm(-static_cast<int64_t>(RoundedSize *
                                          getScratchScaleFactor(ST)))
            .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  // For chain functions, we only need to do this if we have calls to
  // llvm.amdgcn.cs.chain.
  bool IsChainWithoutCalls =
      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() &&
                               FuncInfo->hasSpilledVGPRs() &&
                               EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // enough to lower the DIExpression.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for the
    // AGPR copy. Now that we are done with RA, check if there is an unused
    // VGPR that is lower than the one reserved before RA. If one exists, use
    // it for the AGPR copy instead of the one reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused one if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with, so there is nothing to
  // shift down to. Then if UnusedLowSGPR is null, there isn't an available
  // lower register to use, so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills like the one needed for FP, BP or any reserved
// registers delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If found any unused scratch SGPR, reserve the register itself for Exec
      // copy and there is no need for any spills in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. There are no whole-wave copies and spills
    // encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}
1564
1565// Only report VGPRs to generic code.
1567 BitVector &SavedVGPRs,
1568 RegScavenger *RS) const {
1570
1571 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1572 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1573 // we don't need to save and restore anything.
1574 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1575 return;
1576
1578
1580 if (MFI->isEntryFunction())
1581 return;
1582
1583 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1584 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1585 const SIInstrInfo *TII = ST.getInstrInfo();
1586 bool NeedExecCopyReservedReg = false;
1587
1588 MachineInstr *ReturnMI = nullptr;
1589 for (MachineBasicBlock &MBB : MF) {
1590 for (MachineInstr &MI : MBB) {
1591 // WRITELANE instructions used for SGPR spills can overwrite the inactive
1592 // lanes of VGPRs and callee must spill and restore them even if they are
1593 // marked Caller-saved.
1594
1595 // TODO: Handle this elsewhere at an early point. Walking through all MBBs
1596 // here would be a bad heuristic. A better way should be by calling
1597 // allocateWWMSpill during the regalloc pipeline whenever a physical
1598 // register is allocated for the intended virtual registers.
1599 if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1600 MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1601 else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1602 MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1603 else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1604 NeedExecCopyReservedReg = true;
1605 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1606 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1607 (MFI->isChainFunction() &&
1608 TII->isChainCallOpcode(MI.getOpcode()))) {
1609 // We expect all return to be the same size.
1610 assert(!ReturnMI ||
1611 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1612 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1613 ReturnMI = &MI;
1614 }
1615 }
1616 }
1617
1618 // Remove any VGPRs used in the return value because these do not need to be saved.
1619 // This prevents CSR restore from clobbering return VGPRs.
1620 if (ReturnMI) {
1621 for (auto &Op : ReturnMI->operands()) {
1622 if (Op.isReg())
1623 SavedVGPRs.reset(Op.getReg());
1624 }
1625 }
1626
1627 // Ignore the SGPRs the default implementation found.
1628 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1629
1630 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1631 // In gfx908 there was do AGPR loads and stores and thus spilling also
1632 // require a temporary VGPR.
1633 if (!ST.hasGFX90AInsts())
1634 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1635
1636 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1637
1638 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1639 // allow the default insertion to handle them.
1640 for (auto &Reg : MFI->getWWMSpills())
1641 SavedVGPRs.reset(Reg.first);
1642
1643 // Mark all lane VGPRs as BB LiveIns.
1644 for (MachineBasicBlock &MBB : MF) {
1645 for (auto &Reg : MFI->getWWMSpills())
1646 MBB.addLiveIn(Reg.first);
1647
1648 MBB.sortUniqueLiveIns();
1649 }
1650}
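// In summary, determineCalleeSaves above (1) allocates WWM spill slots for the
// VGPRs used as SGPR spill lanes, (2) drops return-value VGPRs and everything
// that is not a vector register from the save set, (3) reserves storage for
// the prolog/epilog SGPR saves, and (4) marks the WWM VGPRs as live into each
// block, presumably so their prolog saves are seen to dominate all later uses.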
1651
1652void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1653 BitVector &SavedRegs,
1654 RegScavenger *RS) const {
1655 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1656 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1657 if (MFI->isEntryFunction())
1658 return;
1659
1660 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1661 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1662
1663 // The SP is specially managed, and we don't want extra spills of it.
1664 SavedRegs.reset(MFI->getStackPtrOffsetReg());
1665
1666 const BitVector AllSavedRegs = SavedRegs;
1667 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1668
1669 // We have to anticipate introducing CSR VGPR spills, or spills of the caller-
1670 // saved VGPR reserved for SGPR spills, because we now always create a stack
1671 // entry for them even if there are no other stack objects: an FP is required
1672 // whenever there is both a call and a stack. A VGPR will be allocated for
1673 // SGPR spills whenever there are any, whether they are CSR spills or not.
1674 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1675 const bool WillHaveFP =
1676 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1677
1678 // FP will be specially managed like SP.
1679 if (WillHaveFP || hasFP(MF))
1680 SavedRegs.reset(MFI->getFrameOffsetReg());
1681
1682 // The return address use by the return instruction is hidden behind the
1683 // SI_RETURN pseudo. Given that, and since IPRA computes actual register usage
1684 // rather than consulting the CSR list, clobbering of the return address by
1685 // function calls (D117243) or otherwise (D120922) is not seen by IPRA's
1686 // register usage collection. Setting the bits here ensures the return address
1687 // is saved and restored in those scenarios.
1688 const MachineRegisterInfo &MRI = MF.getRegInfo();
1689 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1690 if (!MFI->isEntryFunction() &&
1691 (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1692 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1693 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1694 }
1695}
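// Note: the return address lives in a 64-bit SGPR pair (s[30:31] under the
// usual calling convention), which is why both 32-bit halves are set
// individually via sub0/sub1 above.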
1696
1697bool SIFrameLowering::assignCalleeSavedSpillSlots(
1698 MachineFunction &MF, const TargetRegisterInfo *TRI,
1699 std::vector<CalleeSavedInfo> &CSI) const {
1700 if (CSI.empty())
1701 return true; // Early exit if no callee saved registers are modified!
1702
1703 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1704 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1705 const SIRegisterInfo *RI = ST.getRegisterInfo();
1706 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1707 Register BasePtrReg = RI->getBaseRegister();
1708 Register SGPRForFPSaveRestoreCopy =
1709 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1710 Register SGPRForBPSaveRestoreCopy =
1711 FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1712 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1713 return false;
1714
1715 unsigned NumModifiedRegs = 0;
1716
1717 if (SGPRForFPSaveRestoreCopy)
1718 NumModifiedRegs++;
1719 if (SGPRForBPSaveRestoreCopy)
1720 NumModifiedRegs++;
1721
1722 for (auto &CS : CSI) {
1723 if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1724 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1725 if (--NumModifiedRegs == 0) // Stop once all copies are assigned.
1726 break;
1727 } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1728 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1729 if (--NumModifiedRegs == 0)
1730 break;
1731 }
1732 }
1733
1734 return false;
1735}
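// Returning false here tells generic prolog/epilog insertion to proceed with
// default spill-slot assignment for the remaining CSRs; only an empty CSI
// short-circuits with true, meaning nothing needs to be assigned at all.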
1736
1737bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1738 const MachineFunction &MF) const {
1739
1740 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1741 const MachineFrameInfo &MFI = MF.getFrameInfo();
1742 const SIInstrInfo *TII = ST.getInstrInfo();
1743 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1744 uint64_t MaxOffset = EstStackSize - 1;
1745
1746 // We need the emergency stack slots to be allocated in range of the
1747 // MUBUF/flat scratch immediate offset from the base register, so assign these
1748 // first at the incoming SP position.
1749 //
1750 // TODO: We could try sorting the objects to find a hole in the first bytes
1751 // rather than allocating as close as possible. This could save a lot of space
1752 // on frames with alignment requirements.
1753 if (ST.enableFlatScratch()) {
1754 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1755 SIInstrFlags::FlatScratch))
1756 return false;
1757 } else {
1758 if (TII->isLegalMUBUFImmOffset(MaxOffset))
1759 return false;
1760 }
1761
1762 return true;
1763}
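// That is, if even the largest possible frame offset (EstStackSize - 1) is
// encodable as a MUBUF or flat-scratch immediate, the emergency slots may be
// placed anywhere (return false); otherwise they must be allocated near the
// incoming SP so they remain reachable (return true).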
1764
1765MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1766 MachineFunction &MF,
1767 MachineBasicBlock &MBB,
1768 MachineBasicBlock::iterator I) const {
1769 int64_t Amount = I->getOperand(0).getImm();
1770 if (Amount == 0)
1771 return MBB.erase(I);
1772
1773 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1774 const SIInstrInfo *TII = ST.getInstrInfo();
1775 const DebugLoc &DL = I->getDebugLoc();
1776 unsigned Opc = I->getOpcode();
1777 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1778 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1779
1780 if (!hasReservedCallFrame(MF)) {
1781 Amount = alignTo(Amount, getStackAlign());
1782 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1783 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1784 Register SPReg = MFI->getStackPtrOffsetReg();
1785
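// getScratchScaleFactor (a static helper earlier in this file) returns 1 when
// flat scratch is enabled and the wavefront size otherwise: the swizzled MUBUF
// stack pointer counts bytes across all lanes of a wave, so a per-lane byte
// amount must be scaled up before being added to SP.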
1786 Amount *= getScratchScaleFactor(ST);
1787 if (IsDestroy)
1788 Amount = -Amount;
1789 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1790 .addReg(SPReg)
1791 .addImm(Amount);
1792 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1793 } else if (CalleePopAmount != 0) {
1794 llvm_unreachable("is this used?");
1795 }
1796
1797 return MBB.erase(I);
1798}
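// Worked example (assuming wave64 and no flat scratch): a call site reserving
// 16 bytes of outgoing stack becomes S_ADD_I32 SP, SP, 16 * 64 = 1024 at the
// setup pseudo, and the destroy pseudo emits the matching negative add.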
1799
1800/// Returns true if the frame will require a reference to the stack pointer.
1801///
1802/// This is the set of conditions common to setting up the stack pointer in a
1803/// kernel, and for using a frame pointer in a callable function.
1804///
1805/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1806/// references SP.
1807static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1808 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1809}
1810
1811// The FP for kernels is always known to be 0, so we never really need to set
1812// up an explicit register for it. However, DisableFramePointerElim will force
1813// us to use a register for it.
1814bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1815 const MachineFrameInfo &MFI = MF.getFrameInfo();
1816
1817 // For entry & chain functions we can use an immediate offset in most cases,
1818 // so the presence of calls doesn't imply we need a distinct frame pointer.
1819 if (MFI.hasCalls() &&
1820 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1821 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1822 // All offsets are unsigned, so they need to be addressed in the same direction
1823 // as stack growth.
1824
1825 // FIXME: This function is pretty broken, since it can be called before the
1826 // frame layout is determined or CSR spills are inserted.
1827 return MFI.getStackSize() != 0;
1828 }
1829
1830 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1831 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1832 MF) ||
1833 MF.getTarget().Options.DisableFramePointerElim(MF);
1834}
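// Note: DisableFramePointerElim honors the "frame-pointer" function attribute
// (e.g. from -fno-omit-frame-pointer), so an FP can be forced even when the
// frame conditions above would not require one.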
1835
1836// This is essentially a reduced version of hasFP for entry functions. Since the
1837// stack pointer is known to be 0 on entry to kernels, we never really need an FP
1838// register. We may need to initialize the stack pointer depending on the frame
1839// properties, which logically overlaps many of the cases where an ordinary
1840// function would require an FP.
1841// Also used for chain functions. While not technically entry functions, chain
1842// functions may need to set up a stack pointer in some situations.
1843bool SIFrameLowering::requiresStackPointerReference(
1844 const MachineFunction &MF) const {
1845 // Callable functions always require a stack pointer reference.
1846 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1847 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1848 "only expected to call this for entry points and chain functions");
1849
1850 const MachineFrameInfo &MFI = MF.getFrameInfo();
1851
1852 // Entry points ordinarily don't need to initialize SP. We have to set it up
1853 // for callees if there are any. Also note tail calls are impossible/don't
1854 // make any sense for kernels.
1855 if (MFI.hasCalls())
1856 return true;
1857
1858 // We still need to initialize the SP if we're doing anything weird that
1859 // references the SP, like variable sized stack objects.
1860 return frameTriviallyRequiresSP(MFI);
1861}