//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveRegs which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LivePhysRegs &LiveRegs,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveRegs, RC);

  for (MCRegister Reg : RC) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
  // should have all the callee saved registers marked as used.
  Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
      // 2: There's no free scratch SGPR to copy to, so spill the register to
      // an unused lane of a VGPR reserved for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(
          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <FI> index
      FrameInfo.RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveRegs.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
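
// Illustrative example (not part of the original source): for a callable
// function whose frame pointer s33 must be preserved, the three strategies
// above would emit one of the following in the prologue, in order of
// preference (the register numbers here are hypothetical):
//   s_mov_b32 s42, s33                        ; 1: copy to a scratch SGPR
//   v_writelane_b32 v40, s33, 2               ; 2: spill to a VGPR lane
//   v_mov_b32 v0, s33                         ; 3: spill to scratch memory
//   buffer_store_dword v0, off, s[0:3], s32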

// We need to emit stack operations specially here because the frame register
// used differs from the one getFrameRegister would return for the rest of the
// function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveRegs.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
  if (IsKill)
    LiveRegs.removeReg(SpillReg);
}
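
// Illustrative example (not part of the original source): callers spill
// multi-dword values one dword at a time by stepping DwordOff, e.g.
//   buildPrologSpill(..., TmpVGPR, FI, FrameReg, /*DwordOff=*/0);
//   buildPrologSpill(..., TmpVGPR, FI, FrameReg, /*DwordOff=*/4);
// as done in PrologEpilogSGPRSpillBuilder::saveToMemory below.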

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LivePhysRegs &LiveRegs;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }
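
  // Illustrative example (not part of the original source): saving a 64-bit
  // SGPR pair such as s[30:31] to lanes 0 and 1 of a reserved VGPR (here v40,
  // a hypothetical assignment) expands to:
  //   v_writelane_b32 v40, s30, 0
  //   v_writelane_b32 v40, s31, 1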

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                         FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }
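
  // Illustrative example (not part of the original source): the matching
  // restore for the save sketched above reads the lanes back in the epilog:
  //   v_readlane_b32 s30, v40, 0
  //   v_readlane_b32 s31, v40, 1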

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LivePhysRegs &LiveRegs, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0) // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
          .addReg(FlatScrInitHi)
          .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitLo).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
          addReg(FlatScrInitHi).
          addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                         (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
        .addReg(FlatScrInitHi)
        .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitLo, RegState::Kill)
      .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
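
// Illustrative example (not part of the original source): on those pre-GFX9
// targets FLAT_SCR_HI holds the wave's scratch base in 256-byte units, so a
// byte offset of 0x2400 becomes 0x2400 >> 8 = 0x24 after the s_lshr_b32 above.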

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}
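
// Illustrative example (not part of the original source): without flat
// scratch, private stack memory is swizzled per lane, so SGPR stack offsets
// are scaled by the wavefront size. A frame of 16 bytes per lane on a wave64
// target bumps SP by 16 * 64 = 1024, whereas with flat scratch the factor is
// 1 and SP is a plain per-lane byte offset.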

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found overlaps the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0) // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
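      // Illustrative example (not part of the original source): clearing bit
      // 21 turns the const_index_stride field (bits 22:21) from 0b11 (stride
      // 64 for wave64) into 0b10 (stride 32 for wave32); in the third dword
      // that is 0x00600000 -> 0x00400000.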
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}
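
// Illustrative example (not part of the original source): the
// s_add_u32/s_addc_u32 pair above adds the wave offset into the 48-bit base
// address held in descriptor dwords 0-1. With base 0x00007ffe00000000 and
// offset 0x10000, dword 0 becomes 0x00010000 and bits 47:32 stay 0x7ffe; the
// flag bits in 63:48 are untouched because the add cannot carry past bit 47.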

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveRegs.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}
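
// Illustrative example (not part of the original source): in a wave64
// function with exec = 0x0000ffff, the s_xor_saveexec_b64 built above saves
// 0x0000ffff into the scratch SGPR pair and sets
// exec = -1 ^ 0x0000ffff = 0xffffffffffff0000, i.e. exactly the previously
// inactive lanes; the s_or_saveexec form with -1 instead enables all lanes.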

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip if FP is saved to a scratch SGPR; the save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveRegs.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveRegs.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value into a temporary register. The frame pointer should
    // be overwritten only at the end, when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveRegs, FramePtrReg);
      SB.save();
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(FramePtrReg, RegState::Kill)
        .addImm(-Alignment * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveRegs.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}
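
// Illustrative example (not part of the original source): realigning the
// frame to a 16-byte per-lane alignment on a wave64 MUBUF target scales by
// 64, so the sequence emitted above is:
//   s_add_i32 s33, s32, (16 - 1) * 64   ; s33 = s32 + 960
//   s_and_b32 s33, s33, -(16 * 64)      ; round down to a 1024-byte boundary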

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LivePhysRegs LiveRegs;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
    // into a new scratch register and copy it to FP later, once the other
    // registers are restored from the current stack frame.
    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
        .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  if (!FuncInfo->isEntryFunction()) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            // FIXME: change to enterBasicBlockEnd()
            RS->enterBasicBlock(MBB);
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // enough to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for AGPR
    // copies. Now that RA is done, check whether an unused VGPR exists that is
    // lower than the one reserved before RA. If one exists, use it for the
    // AGPR copy instead of the register reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Call to setVGPRForAGPRCopy() should happen first before calling
      // freezeReservedRegs() so that getReservedRegs() can reserve this newly
      // identified VGPR (for AGPR copy).
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.freezeReservedRegs(MF);
    }
  }
}

// The special SGPR spills, like the ones needed for FP, BP, or any reserved
// registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveRegs.addReg(CSRegs[I]);

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore them even if they
      // are marked caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all MBBs
      // here would be a bad heuristic. A better way should be by calling
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers. That will
      // also help excluding the general use of WRITELANE/READLANE intrinsics
      // that won't really need any such special handling.
      if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no direct AGPR loads and stores, so spilling them also
  // requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or spills of the
  // caller-saved VGPR reserved for SGPR spills, as we now always create a
  // stack entry for them even if there are no other stack objects, since we
  // require an FP whenever there is a call and a stack. We will allocate a
  // VGPR for SGPR spills if there are any SGPR spills, whether they are CSR
  // spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with return instruction is hidden through the SI_RETURN
  // pseudo. Given that, and since the IPRA computes actual register usage and
  // does not use the CSR list, the clobbering of the return address by function
  // calls (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's
  // register usage collection. This will ensure save/restore of the return
  // address happens in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs)
        break;
    }
  }

  return false;
}

bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign these
  // first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of space
  // on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    const SIInstrInfo *TII = ST.getInstrInfo();
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}
1660
1662 MachineFunction &MF,
1665 int64_t Amount = I->getOperand(0).getImm();
1666 if (Amount == 0)
1667 return MBB.erase(I);
1668
1669 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1670 const SIInstrInfo *TII = ST.getInstrInfo();
1671 const DebugLoc &DL = I->getDebugLoc();
1672 unsigned Opc = I->getOpcode();
1673 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1674 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1675
1676 if (!hasReservedCallFrame(MF)) {
1677 Amount = alignTo(Amount, getStackAlign());
1678 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1679 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1680 Register SPReg = MFI->getStackPtrOffsetReg();
1681
1682 Amount *= getScratchScaleFactor(ST);
1683 if (IsDestroy)
1684 Amount = -Amount;
1685 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1686 .addReg(SPReg)
1687 .addImm(Amount);
1688 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1689 } else if (CalleePopAmount != 0) {
1690 llvm_unreachable("is this used?");
1691 }
1692
1693 return MBB.erase(I);
1694}
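// Worked example (editorial, not part of the source): without flat scratch,
// getScratchScaleFactor() equals the wavefront size because the scratch SP is
// swizzled per lane. A 16-byte call frame setup on a wave64 target would thus
// emit approximately:
//   s_add_i32 s32, s32, 1024   ; 16 bytes * 64 lanes, s32 = stack pointer
// and the matching call-frame destroy emits the same add with -1024.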
1695
1696/// Returns true if the frame will require a reference to the stack pointer.
1697///
1698/// This is the set of conditions common to setting up the stack pointer in a
1699/// kernel, and for using a frame pointer in a callable function.
1700///
1701/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1702/// references SP.
1703static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1704 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1705}
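// Editorial example (not part of the source): IR such as
//   %buf = alloca i32, i32 %n
// creates a variable-sized stack object, so hasVarSizedObjects() returns true
// and the frame trivially requires a stack pointer reference.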
1706
1707// The FP for kernels is always known to be 0, so we never really need to set
1708// up an explicit register for it. However, DisableFramePointerElim will force
1709// us to use a register for it.
1710bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1711 const MachineFrameInfo &MFI = MF.getFrameInfo();
1712
1713 // For entry functions we can use an immediate offset in most cases, so the
1714 // presence of calls doesn't imply we need a distinct frame pointer.
1715 if (MFI.hasCalls() &&
1716 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1717 // All offsets are unsigned, so they need to be addressed in the same
1718 // direction as stack growth.
1719
1720 // FIXME: This function is pretty broken, since it can be called before the
1721 // frame layout is determined or CSR spills are inserted.
1722 return MFI.getStackSize() != 0;
1723 }
1724
1725 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1726 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1727 MF) ||
1728 MF.getTarget().Options.DisableFramePointerElim(MF);
1729}
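// Editorial note (not part of the source): DisableFramePointerElim() reads
// the "frame-pointer" function attribute, so building with
// -fno-omit-frame-pointer (attribute "frame-pointer"="all") forces hasFP()
// to return true even when the frame would not otherwise need one.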
1730
1731// This is essentially a reduced version of hasFP for entry functions. Since the
1732// stack pointer is known to be 0 on entry to kernels, we never really need an FP
1733// register. We may need to initialize the stack pointer depending on the frame
1734// properties, which logically overlaps many of the cases where an ordinary
1735// function would require an FP.
1736bool SIFrameLowering::requiresStackPointerReference(
1737 const MachineFunction &MF) const {
1738 // Callable functions always require a stack pointer reference.
1740 "only expected to call this for entry points");
1741
1742 const MachineFrameInfo &MFI = MF.getFrameInfo();
1743
1744 // Entry points ordinarily don't need to initialize SP. We have to set it up
1745 // for callees if there are any. Also note tail calls are impossible/don't
1746 // make any sense for kernels.
1747 if (MFI.hasCalls())
1748 return true;
1749
1750 // We still need to initialize the SP if we're doing anything weird that
1751 // references the SP, like variable sized stack objects.
1752 return frameTriviallyRequiresSP(MFI);
1753}