//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

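// A sketch of how the flag is driven in practice (illustrative, not from this
// file): any tool built on LLVM's cl::opt machinery accepts it on the command
// line, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 kernel.ll
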
// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}
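// Illustrative call site (mirrors the uses later in this file): query a free
// 32-bit SGPR for prologue bookkeeping and fail hard if none exists.
//   Register Tmp = findScratchNonCalleeSaveRegister(
//       MF.getRegInfo(), LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
//   if (!Tmp)
//     report_fatal_error("failed to find free scratch register");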

/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: No free scratch SGPR was found, so spill the register into a
      // physical VGPR lane instead.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index.
      FrameInfo.RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
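// In short, the fallback order implemented above is: (1) copy into an unused
// scratch SGPR, (2) spill into a lane of a physical VGPR, (3) spill to scratch
// memory. Each step is strictly more expensive than the previous one.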

// We need to specially emit stack operations here because a different frame
// register is used than in the rest of the function, as getFrameRegister would
// use.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}
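// Note: in the epilog case the insertion point sits before the terminator, so
// stepping backward over *MBBI makes the set reflect liveness just before the
// restore sequence rather than at the block end.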

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT.
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find an unused register to load the flat scratch init into.
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor.
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
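// For GFX10+ with flat scratch, the code above boils down to a sequence of
// this shape (illustrative; register numbers depend on allocation):
//   s_add_u32    s0, s0, s<wave_offset>
//   s_addc_u32   s1, s1, 0
//   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
//   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1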

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      MRI.reserveReg(Reg, TRI);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}
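// Worked example: on a wave64 target using MUBUF scratch, a per-lane frame of
// 16 bytes occupies 16 * 64 = 1024 bytes of per-wave scratch, so SP/FP values
// are maintained in these scaled units; with flat scratch enabled the factor
// is 1 and offsets stay in per-lane bytes.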

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks.

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found conflicts with the
  // scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }

    // FIXME: We can spill incoming arguments and restore at the end of the
    // prolog.
    if (!ScratchWaveOffsetReg)
      report_fatal_error(
          "could not find temporary scratch offset register in prolog");
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}
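// To summarize the entry prologue built above: shift the reserved SRSRC down
// over the actually-used preloaded SGPRs, copy the scratch wave offset out of
// the way if it aliases the SRSRC, materialize FP/SP as immediates, program
// flat scratch when needed, and finally assemble the scratch resource
// descriptor.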

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11).
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto *MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Lo_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Hi_32(Rsrc23))
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}
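// The base update above emits, in effect (illustrative):
//   s_add_u32  rsrc[0], rsrc[0], scratch_wave_offset
//   s_addc_u32 rsrc[1], rsrc[1], 0
// carrying into the high half so only the 48-bit base address of the
// descriptor changes while the flag bits in words 2-3 are preserved.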

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}
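// Semantics reminder (wave64 shown): "s_or_saveexec_b64 sdst, -1" copies the
// old EXEC into sdst and enables all lanes, while the XOR form sets EXEC to
// the complement of the old mask, i.e. exactly the previously inactive lanes.
// That is what EnableInactiveLanes selects between.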

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip if FP is saved to a scratch SGPR; the save has already been emitted.
    // Otherwise, FP has been moved to a temporary register, so spill that
    // instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
    // the FP value to a temporary register. The frame pointer should be
    // overwritten only at the end when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}
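// Worked example of the realignment above (illustrative): with MaxAlign = 256
// bytes on a wave64 MUBUF target, getScratchScaleFactor(ST) == 64, so the
// emitted pair is
//   s_add_i32 s33, s32, (256 - 1) * 64
//   s_and_b32 s33, s33, -(256 * 64)
// which rounds the swizzled per-wave offset up to a 256-byte per-lane
// boundary.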

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers are
    // restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(-static_cast<int64_t>(RoundedSize *
                                                 getScratchScaleFactor(ST)))
                   .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits,
                         FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}
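// Epilogue ordering, mirroring the prologue in reverse: restore CSRs using FP
// as the base while the frame still exists, pop the stack by subtracting the
// rounded frame size from SP, and only then overwrite FP with its saved value.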

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // adequate to lower the DIExpression. It should be worked out later.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue()) {
            uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0;
            if (MI.getOperand(StackOperandIdx).isFI() &&
                !MFI.isFixedObjectIndex(
                    MI.getOperand(StackOperandIdx).getIndex()) &&
                SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) {
              MI.getOperand(StackOperandIdx)
                  .ChangeToRegister(Register(), false /*isDef*/);
            }
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot.
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for AGPR
    // copies. Now that register allocation is done, check whether an unused
    // VGPR exists that is lower than the one reserved before RA. If one
    // exists, use it for the AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR for the AGPR copy. Reserved
      // registers should already be frozen at this point, so we can avoid
      // calling MRI.freezeReservedRegs and just use MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, shift down to a lower unused pair if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null, we didn't find a long branch and never
  // reserved a register to begin with, so there is nothing to shift down.
  // If UnusedLowSGPR is null, there is no available lower register, so just
  // keep the one originally set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// Determine the special SGPR spills, like those needed for FP, BP, or any
// reserved registers, that are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If we found any unused scratch SGPR, reserve the register itself for
      // the EXEC copy; no spill is needed in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. There are no whole-wave copies and spills
    // encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // TODO: Walking through all MBBs here would be a bad heuristic. Better
      // handle them elsewhere.
      if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all return instructions to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(),
                         [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  SmallVector<Register> SortedWWMVGPRs;
  for (Register Reg : MFI->getWWMReservedRegs()) {
    // The shift-back is needed only for the VGPRs used for SGPR spills, and
    // those are 32 bits wide. The SIPreAllocateWWMRegs pass can add tuples
    // into the WWM reserved registers.
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    if (TRI->getRegSizeInBits(*RC) > 32)
      continue;
    SortedWWMVGPRs.push_back(Reg);
  }

  sort(SortedWWMVGPRs, std::greater<Register>());
  MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);

  if (MFI->isEntryFunction())
    return;

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Create the stack objects for WWM registers now.
  for (Register Reg : MFI->getWWMReservedRegs()) {
    const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
    MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                          TRI->getSpillAlign(*RC));
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no direct AGPR loads and stores, so spilling an AGPR
  // also requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);
}
1646
1648 BitVector &SavedRegs,
1649 RegScavenger *RS) const {
1652 if (MFI->isEntryFunction())
1653 return;
1654
1655 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1656 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1657
1658 // The SP is specifically managed and we don't want extra spills of it.
1659 SavedRegs.reset(MFI->getStackPtrOffsetReg());
1660
1661 const BitVector AllSavedRegs = SavedRegs;
1662 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1663
1664 // We have to anticipate introducing CSR VGPR spills, or a spill of the
1665 // caller-saved VGPR reserved for SGPR spills, as we now always create a
1666 // stack entry for it even if there are no other stack objects, since we
1667 // require an FP if there is a call and a stack. We will allocate a VGPR for
1668 // SGPR spills if there are any SGPR spills, whether CSR spills or otherwise.
1669 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1670 const bool WillHaveFP =
1671 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
1672
1673 // FP will be specially managed like SP.
1674 if (WillHaveFP || hasFP(MF))
1675 SavedRegs.reset(MFI->getFrameOffsetReg());
1676
1677 // The return address use by the return instruction is hidden through the
1678 // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1679 // usage rather than consulting the CSR list, the clobbering of the return
1680 // address by function calls (D117243) or otherwise (D120922) is not seen by
1681 // IPRA's register usage collection. Saving it explicitly here ensures the
1682 // return address is saved and restored in those scenarios.
1683 const MachineRegisterInfo &MRI = MF.getRegInfo();
1684 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1685 if (!MFI->isEntryFunction() &&
1686 (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1687 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1688 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1689 }
1690}
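// Illustrative sketch (editorial, not in the source): getReturnAddressReg()
// yields a 64-bit SGPR pair (conventionally s[30:31] in the AMDGPU calling
// convention), so the save is recorded one 32-bit half at a time:
//   SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0)); // e.g. s30
//   SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1)); // e.g. s31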
1691
1692bool SIFrameLowering::assignCalleeSavedSpillSlots(
1693 MachineFunction &MF, const TargetRegisterInfo *TRI,
1694 std::vector<CalleeSavedInfo> &CSI) const {
1695 if (CSI.empty())
1696 return true; // Early exit if no callee saved registers are modified!
1697
1698 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1699 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1700 const SIRegisterInfo *RI = ST.getRegisterInfo();
1701 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1702 Register BasePtrReg = RI->getBaseRegister();
1703 Register SGPRForFPSaveRestoreCopy =
1704 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1705 Register SGPRForBPSaveRestoreCopy =
1706 FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1707 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1708 return false;
1709
1710 unsigned NumModifiedRegs = 0;
1711
1712 if (SGPRForFPSaveRestoreCopy)
1713 NumModifiedRegs++;
1714 if (SGPRForBPSaveRestoreCopy)
1715 NumModifiedRegs++;
1716
1717 for (auto &CS : CSI) {
1718 if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1719 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1720 if (--NumModifiedRegs == 0)
1721 break;
1722 } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1723 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1724 if (--NumModifiedRegs == 0)
1725 break;
1726 }
1727 }
1728
1729 return false;
1730}
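// Illustrative sketch (editorial, not in the source; registers hypothetical):
// if FramePtrReg is s33 and its scratch copy destination is s4, the CSI entry
// for s33 gets setDstReg(s4), so PEI saves and restores the FP with a plain
// SGPR copy (s4 = s33 in the prologue, s33 = s4 in the epilogue) instead of a
// memory spill. Returning false still lets the generic code assign stack
// slots for the remaining callee-saved registers.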
1731
1732bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1733 const MachineFunction &MF) const {
1734
1735 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1736 const MachineFrameInfo &MFI = MF.getFrameInfo();
1737 const SIInstrInfo *TII = ST.getInstrInfo();
1738 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1739 uint64_t MaxOffset = EstStackSize - 1;
1740
1741 // We need the emergency stack slots to be allocated in range of the
1742 // MUBUF/flat scratch immediate offset from the base register, so assign these
1743 // first at the incoming SP position.
1744 //
1745 // TODO: We could try sorting the objects to find a hole in the first bytes
1746 // rather than allocating as close as possible. This could save a lot of space
1747 // on frames with alignment requirements.
1748 if (ST.enableFlatScratch()) {
1749 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1750 SIInstrFlags::FlatScratch))
1751 return false;
1752 } else {
1753 if (TII->isLegalMUBUFImmOffset(MaxOffset))
1754 return false;
1755 }
1756
1757 return true;
1758}
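// Worked example (editorial, assuming a MUBUF target whose legal immediate
// offset is 12 bits, i.e. at most 4095): with an estimated stack size of
// 8192 bytes, MaxOffset = 8191 is not encodable as an immediate, so the
// function returns true and the scavenging frame indexes are placed near the
// incoming SP, where they stay within range of the base register.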
1759
1760MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1761 MachineFunction &MF,
1762 MachineBasicBlock &MBB,
1763 MachineBasicBlock::iterator I) const {
1764 int64_t Amount = I->getOperand(0).getImm();
1765 if (Amount == 0)
1766 return MBB.erase(I);
1767
1768 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1769 const SIInstrInfo *TII = ST.getInstrInfo();
1770 const DebugLoc &DL = I->getDebugLoc();
1771 unsigned Opc = I->getOpcode();
1772 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1773 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1774
1775 if (!hasReservedCallFrame(MF)) {
1776 Amount = alignTo(Amount, getStackAlign());
1777 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1778 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1779 Register SPReg = MFI->getStackPtrOffsetReg();
1780
1781 Amount *= getScratchScaleFactor(ST);
1782 if (IsDestroy)
1783 Amount = -Amount;
1784 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1785 .addReg(SPReg)
1786 .addImm(Amount);
1787 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1788 } else if (CalleePopAmount != 0) {
1789 llvm_unreachable("is this used?");
1790 }
1791
1792 return MBB.erase(I);
1793}
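// Worked example (editorial, assuming getScratchScaleFactor returns the
// wavefront size when flat scratch is disabled): on a wave64 target, a
// 16-byte per-lane call frame setup becomes a 1024-byte SP adjustment:
//   Amount = alignTo(16, getStackAlign()); // stays 16 if already aligned
//   Amount *= 64;                          // per-lane -> per-wave: 1024
// and the matching call frame destroy applies the same amount negated.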
1794
1795/// Returns true if the frame will require a reference to the stack pointer.
1796///
1797/// This is the set of conditions common to setting up the stack pointer in a
1798/// kernel, and for using a frame pointer in a callable function.
1799///
1800/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1801/// references SP.
1802static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1803 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1804}
1805
1806// The FP for kernels is always known to be 0, so we never really need to set
1807// up an explicit register for it. However, DisableFramePointerElim will force
1808// us to use a register for it.
1809bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
1810 const MachineFrameInfo &MFI = MF.getFrameInfo();
1811
1812 // For entry & chain functions we can use an immediate offset in most cases,
1813 // so the presence of calls doesn't imply we need a distinct frame pointer.
1814 if (MFI.hasCalls() &&
1815 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1816 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1817 // All offsets are unsigned, so need to be addressed in the same direction
1818 // as stack growth.
1819
1820 // FIXME: This function is pretty broken, since it can be called before the
1821 // frame layout is determined or CSR spills are inserted.
1822 return MFI.getStackSize() != 0;
1823 }
1824
1825 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1826 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1827 MF) ||
1828 MF.getTarget().Options.DisableFramePointerElim(MF);
1829}
1830
1831// This is essentially a reduced version of hasFP for entry functions. Since the
1832// stack pointer is known to be 0 on entry to kernels, we never really need an FP
1833// register. We may need to initialize the stack pointer depending on the frame
1834// properties, which logically overlaps many of the cases where an ordinary
1835// function would require an FP.
1836// Also used for chain functions. While not technically entry functions, chain
1837// functions may need to set up a stack pointer in some situations.
1838bool SIFrameLowering::requiresStackPointerReference(
1839 const MachineFunction &MF) const {
1840 // Callable functions always require a stack pointer reference.
1843 "only expected to call this for entry points and chain functions");
1844
1845 const MachineFrameInfo &MFI = MF.getFrameInfo();
1846
1847 // Entry points ordinarily don't need to initialize SP. We have to set it up
1848 // for callees if there are any. Also note tail calls are impossible/don't
1849 // make any sense for kernels.
1850 if (MFI.hasCalls())
1851 return true;
1852
1853 // We still need to initialize the SP if we're doing anything weird that
1854 // references the SP, like variable sized stack objects.
1855 return frameTriviallyRequiresSP(MFI);
1856}