LLVM 23.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
39 "amdgpu-spill-cfi-saved-regs",
40 cl::desc("Enable spilling the registers required for CFI emission"),
42
// RegSplitParts[N] holds the sub-register indices that split a register into
// parts of 16 * (N + 1) bits each; within a vector the element at position P
// is the index of the part at bit offset P * size. Populated once through
// llvm::call_once.
43std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
// SubRegFromChannelTable[Row][Channel] is the sub-register index that starts at
// DWORD offset Channel, where Row is SubRegFromChannelTableWidthMap[width in
// DWORDs] - 1. Unfilled entries are AMDGPU::NoSubRegister. Populated once
// through llvm::call_once.
44std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
45
// Lookup from a register width in DWORDs to a row of SubRegFromChannelTable.
// Each entry is stored as (row + 1) so that 0 can mean "width not supported".
// For example 8 DWORDs (256-bit) maps to 8, which selects row 7 of
// SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0,                      // 0 DWORDs: unsupported
    1, 2, 3, 4, 5, 6, 7, 8, // 1-8 DWORDs: rows 0-7
    0, 0, 0, 0, 0, 0, 0,    // 9-15 DWORDs: unsupported
    9};                     // 16 DWORDs: row 8
52
53static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
54 const Twine &ErrMsg) {
56 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
57}
58
59namespace llvm {
60
61// A temporary struct to spill SGPRs.
62// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
63// just v_writelane and v_readlane.
64//
65// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
66// is saved to scratch (or the other way around for loads).
67// For this, a VGPR is required where the needed lanes can be clobbered. The
68// RegScavenger can provide a VGPR where currently active lanes can be
69// clobbered, but we still need to save inactive lanes.
70// The high-level steps are:
71// - Try to scavenge SGPR(s) to save exec
72// - Try to scavenge VGPR
73// - Save needed, all or inactive lanes of a TmpVGPR
74// - Spill/Restore SGPRs using TmpVGPR
75// - Restore TmpVGPR
76//
77// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
78// cannot scavenge temporary SGPRs to save exec, we use the following code:
79// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
80// s_not exec, exec
81// buffer_store_dword TmpVGPR ; save inactive lanes
82// s_not exec, exec
84 struct PerVGPRData {
85 unsigned PerVGPR;
86 unsigned NumVGPRs;
87 int64_t VGPRLanes;
88 };
89
90 // The SGPR to save
94 unsigned NumSubRegs;
95 bool IsKill;
96 const DebugLoc &DL;
97
98 /* When spilling to stack */
99 // The SGPRs are written into this VGPR, which is then written to scratch
100 // (or vice versa for loads).
101 Register TmpVGPR = AMDGPU::NoRegister;
102 // Temporary spill slot to save TmpVGPR to.
104 // If TmpVGPR is live before the spill or if it is scavenged.
105 bool TmpVGPRLive = false;
106 // Scavenged SGPR to save EXEC.
107 Register SavedExecReg = AMDGPU::NoRegister;
108 // Stack index to write the SGPRs to.
109 int Index;
110 unsigned EltSize = 4;
111
120 unsigned MovOpc;
121 unsigned NotOpc;
122
126 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
127 MI->getOperand(0).isKill(), Index, RS) {}
128
131 bool IsKill, int Index, RegScavenger *RS)
132 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
133 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
134 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
136 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
137 SplitParts = TRI.getRegSplitParts(RC, EltSize);
138 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
139
140 if (IsWave32) {
141 ExecReg = AMDGPU::EXEC_LO;
142 MovOpc = AMDGPU::S_MOV_B32;
143 NotOpc = AMDGPU::S_NOT_B32;
144 } else {
145 ExecReg = AMDGPU::EXEC;
146 MovOpc = AMDGPU::S_MOV_B64;
147 NotOpc = AMDGPU::S_NOT_B64;
148 }
149
150 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
151 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
152 SuperReg != AMDGPU::EXEC && "exec should never spill");
153 }
154
157 Data.PerVGPR = IsWave32 ? 32 : 64;
158 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
159 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
160 return Data;
161 }
162
163 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
164 // free.
165 // Writes these instructions if an SGPR can be scavenged:
166 // s_mov_b64 s[6:7], exec ; Save exec
167 // s_mov_b64 exec, 3 ; Wanted lanemask
168 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
169 //
170 // Writes these instructions if no SGPR can be scavenged:
171 // buffer_store_dword v0 ; Only if no free VGPR was found
172 // s_not_b64 exec, exec
173 // buffer_store_dword v0 ; Save inactive lanes
174 // ; exec stays inverted, it is flipped back in
175 // ; restore.
176 void prepare() {
177 // Scavenged temporary VGPR to use. It must be scavenged once for any number
178 // of spilled subregs.
179 // FIXME: The liveness analysis is limited and does not tell if a register
180 // is in use in lanes that are currently inactive. We can never be sure if
181 // a register is actually in use in another lane, so we need to save all
182 // used lanes of the chosen VGPR.
183 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
184 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
185 0, false);
186
187 // Reserve temporary stack slot
188 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
189 if (TmpVGPR) {
190 // Found a register that is dead in the currently active lanes, we only
191 // need to spill inactive lanes.
192 TmpVGPRLive = false;
193 } else {
194 // Pick v0 because it doesn't make a difference.
195 TmpVGPR = AMDGPU::VGPR0;
196 TmpVGPRLive = true;
197 }
198
199 if (TmpVGPRLive) {
200 // We need to inform the scavenger that this index is already in use until
201 // we're done with the custom emergency spill.
202 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
203 }
204
205 // We may end up recursively calling the scavenger, and don't want to re-use
206 // the same register.
207 RS->setRegUsed(TmpVGPR);
208
209 // Try to scavenge SGPRs to save exec
210 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
211 const TargetRegisterClass &RC =
212 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
213 RS->setRegUsed(SuperReg);
214 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
215
216 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
217
218 if (SavedExecReg) {
219 RS->setRegUsed(SavedExecReg);
220 // Set exec to needed lanes
222 auto I =
223 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
224 if (!TmpVGPRLive)
226 // Spill needed lanes
227 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
228 } else {
229 // The modify and restore of exec clobber SCC, which we would have to save
230 // and restore. FIXME: We probably would need to reserve a register for
231 // this.
232 if (RS->isRegUsed(AMDGPU::SCC))
233 emitUnsupportedError(MF.getFunction(), *MI,
234 "unhandled SGPR spill to memory");
235
236 // Spill active lanes
237 if (TmpVGPRLive)
238 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
239 /*IsKill*/ false);
240 // Spill inactive lanes
241 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
242 if (!TmpVGPRLive)
244 I->getOperand(2).setIsDead(); // Mark SCC as dead.
245 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
246 }
247 }
248
249 // Writes these instructions if an SGPR can be scavenged:
250 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_mov_b64 exec, s[6:7] ; Restore exec
253 //
254 // Writes these instructions if no SGPR can be scavenged:
255 // buffer_load_dword v0 ; Restore inactive lanes
256 // s_waitcnt vmcnt(0) ; If a free VGPR was found
257 // s_not_b64 exec, exec
258 // buffer_load_dword v0 ; Only if no free VGPR was found
259 void restore() {
260 if (SavedExecReg) {
261 // Restore used lanes
262 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 // Restore exec
265 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
267 // Add an implicit use of the load so it is not dead.
268 // FIXME This inserts an unnecessary waitcnt
269 if (!TmpVGPRLive) {
271 }
272 } else {
273 // Restore inactive lanes
274 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
275 /*IsKill*/ false);
276 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
277 if (!TmpVGPRLive)
279 I->getOperand(2).setIsDead(); // Mark SCC as dead.
280
281 // Restore active lanes
282 if (TmpVGPRLive)
283 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
284 }
285
286 // Inform the scavenger where we're releasing our custom scavenged register.
287 if (TmpVGPRLive) {
288 MachineBasicBlock::iterator RestorePt = std::prev(MI);
289 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
290 }
291 }
292
293 // Write TmpVGPR to memory or read TmpVGPR from memory.
294 // Either using a single buffer_load/store if exec is set to the needed mask
295 // or using
296 // buffer_load
297 // s_not exec, exec
298 // buffer_load
299 // s_not exec, exec
300 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
301 if (SavedExecReg) {
302 // Spill needed lanes
303 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
304 } else {
305 // The modify and restore of exec clobber SCC, which we would have to save
306 // and restore. FIXME: We probably would need to reserve a register for
307 // this.
308 if (RS->isRegUsed(AMDGPU::SCC))
309 emitUnsupportedError(MF.getFunction(), *MI,
310 "unhandled SGPR spill to memory");
311
312 // Spill active lanes
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
314 /*IsKill*/ false);
315 // Spill inactive lanes
316 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
317 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
318 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
319 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
320 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
321 }
322 }
323
325 assert(MBB->getParent() == &MF);
326 MI = NewMI;
327 MBB = NewMBB;
328 }
329};
330
331} // namespace llvm
332
334 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
335 ST.getAMDGPUDwarfFlavour(),
336 /*PC=*/0,
337 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
338 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
339
340 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
341 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
342 (getSubRegIndexLaneMask(AMDGPU::lo16) |
343 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
344 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
345 "getNumCoveredRegs() will not work with generated subreg masks!");
346
347 RegPressureIgnoredUnits.resize(getNumRegUnits());
348 RegPressureIgnoredUnits.set(
349 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
350 for (auto Reg : AMDGPU::VGPR_16RegClass) {
351 if (AMDGPU::isHi16Reg(Reg, *this))
352 RegPressureIgnoredUnits.set(
353 static_cast<unsigned>(*regunits(Reg).begin()));
354 }
355
356 // HACK: Until this is fully tablegen'd.
357 static llvm::once_flag InitializeRegSplitPartsFlag;
358
359 static auto InitializeRegSplitPartsOnce = [this]() {
360 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
361 unsigned Size = getSubRegIdxSize(Idx);
362 if (Size & 15)
363 continue;
364 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
365 unsigned Pos = getSubRegIdxOffset(Idx);
366 if (Pos % Size)
367 continue;
368 Pos /= Size;
369 if (Vec.empty()) {
370 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
371 Vec.resize(MaxNumParts);
372 }
373 Vec[Pos] = Idx;
374 }
375 };
376
377 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
378
379 static auto InitializeSubRegFromChannelTableOnce = [this]() {
380 for (auto &Row : SubRegFromChannelTable)
381 Row.fill(AMDGPU::NoSubRegister);
382 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
383 unsigned Width = getSubRegIdxSize(Idx) / 32;
384 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
386 Width = SubRegFromChannelTableWidthMap[Width];
387 if (Width == 0)
388 continue;
389 unsigned TableIdx = Width - 1;
390 assert(TableIdx < SubRegFromChannelTable.size());
391 assert(Offset < SubRegFromChannelTable[TableIdx].size());
392 SubRegFromChannelTable[TableIdx][Offset] = Idx;
393 }
394 };
395
396 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
397 llvm::call_once(InitializeSubRegFromChannelTableFlag,
398 InitializeSubRegFromChannelTableOnce);
399}
400
401void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
402 MCRegister Reg) const {
403 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
404 Reserved.set(*R);
405}
406
407// Forced to be here by one .inc
409 const MachineFunction *MF) const {
411 switch (CC) {
412 case CallingConv::C:
415 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
416 : CSR_AMDGPU_SaveList;
419 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
420 : CSR_AMDGPU_SI_Gfx_SaveList;
422 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
423 default: {
424 // Dummy to not crash RegisterClassInfo.
425 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
426 return &NoCalleeSavedReg;
427 }
428 }
429}
430
431const MCPhysReg *
433 return nullptr;
434}
435
437 CallingConv::ID CC) const {
438 switch (CC) {
439 case CallingConv::C:
442 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
443 : CSR_AMDGPU_RegMask;
446 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
447 : CSR_AMDGPU_SI_Gfx_RegMask;
450 // Calls to these functions never return, so we can pretend everything is
451 // preserved.
452 return AMDGPU_AllVGPRs_RegMask;
453 default:
454 return nullptr;
455 }
456}
457
459 return CSR_AMDGPU_NoRegs_RegMask;
460}
461
463 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
464}
465
468 const MachineFunction &MF) const {
469 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
470 // equivalent AV class. If used one, the verifier will crash after
471 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
472 // until Instruction selection.
473 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
474 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
475 return &AMDGPU::AV_32RegClass;
476 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
477 return &AMDGPU::AV_64RegClass;
478 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
479 RC == &AMDGPU::AReg_64_Align2RegClass)
480 return &AMDGPU::AV_64_Align2RegClass;
481 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
482 return &AMDGPU::AV_96RegClass;
483 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
484 RC == &AMDGPU::AReg_96_Align2RegClass)
485 return &AMDGPU::AV_96_Align2RegClass;
486 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
487 return &AMDGPU::AV_128RegClass;
488 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
489 RC == &AMDGPU::AReg_128_Align2RegClass)
490 return &AMDGPU::AV_128_Align2RegClass;
491 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
492 return &AMDGPU::AV_160RegClass;
493 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
494 RC == &AMDGPU::AReg_160_Align2RegClass)
495 return &AMDGPU::AV_160_Align2RegClass;
496 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
497 return &AMDGPU::AV_192RegClass;
498 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
499 RC == &AMDGPU::AReg_192_Align2RegClass)
500 return &AMDGPU::AV_192_Align2RegClass;
501 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
502 return &AMDGPU::AV_256RegClass;
503 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
504 RC == &AMDGPU::AReg_256_Align2RegClass)
505 return &AMDGPU::AV_256_Align2RegClass;
506 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
507 return &AMDGPU::AV_512RegClass;
508 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
509 RC == &AMDGPU::AReg_512_Align2RegClass)
510 return &AMDGPU::AV_512_Align2RegClass;
511 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
512 return &AMDGPU::AV_1024RegClass;
513 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
514 RC == &AMDGPU::AReg_1024_Align2RegClass)
515 return &AMDGPU::AV_1024_Align2RegClass;
516 }
517
519}
520
522 const SIFrameLowering *TFI = ST.getFrameLowering();
524
525 // During ISel lowering we always reserve the stack pointer in entry and chain
526 // functions, but never actually want to reference it when accessing our own
527 // frame. If we need a frame pointer we use it, but otherwise we can just use
528 // an immediate "0" which we represent by returning NoRegister.
529 if (FuncInfo->isBottomOfStack()) {
530 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
531 }
532 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
533 : FuncInfo->getStackPtrOffsetReg();
534}
535
537 // When we need stack realignment, we can't reference off of the
538 // stack pointer, so we reserve a base pointer.
539 return shouldRealignStack(MF);
540}
541
542Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
543
545 return AMDGPU_AllVGPRs_RegMask;
546}
547
549 return AMDGPU_AllAGPRs_RegMask;
550}
551
553 return AMDGPU_AllVectorRegs_RegMask;
554}
555
557 return AMDGPU_AllAllocatableSRegs_RegMask;
558}
559
560unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
561 unsigned NumRegs) {
562 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
563 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
564 assert(NumRegIndex && "Not implemented");
565 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
566 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
567}
568
572
575 const unsigned Align,
576 const TargetRegisterClass *RC) const {
577 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
578 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
579 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
580}
581
583 const MachineFunction &MF) const {
584 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
585}
586
588 BitVector Reserved(getNumRegs());
589 Reserved.set(AMDGPU::MODE);
590
592
593 // Reserve special purpose registers.
594 //
595 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
596 // this seems likely to result in bugs, so I'm marking them as reserved.
597 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
598 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
599
600 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
601 reserveRegisterTuples(Reserved, AMDGPU::M0);
602
603 // Reserve src_vccz, src_execz, src_scc.
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
606 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
607
608 // Reserve the memory aperture registers
609 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
610 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
611 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
613 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
614 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
615
616 // Reserve async counters pseudo registers
617 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
618 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
619
620 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
622
623 // Reserve xnack_mask registers - support is not implemented in Codegen.
624 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
625
626 // Reserve lds_direct register - support is not implemented in Codegen.
627 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
628
629 // Reserve Trap Handler registers - support is not implemented in Codegen.
630 reserveRegisterTuples(Reserved, AMDGPU::TBA);
631 reserveRegisterTuples(Reserved, AMDGPU::TMA);
632 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
633 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
634 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
635 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
636 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
637 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
638 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
639 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
640
641 // Reserve null register - it shall never be allocated
642 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
643
644 // Reserve SGPRs.
645 //
646 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
647 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
648 for (const TargetRegisterClass *RC : regclasses()) {
649 if (RC->isBaseClass() && isSGPRClass(RC)) {
650 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
651 for (MCPhysReg Reg : *RC) {
652 unsigned Index = getHWRegIndex(Reg);
653 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs &&
654 Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
655 Reg != AMDGPU::VCC)
656 Reserved.set(Reg);
657 }
658 }
659 }
660
661 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
662 if (ScratchRSrcReg != AMDGPU::NoRegister) {
663 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
664 // need to spill.
665 // TODO: May need to reserve a VGPR if doing LDS spilling.
666 reserveRegisterTuples(Reserved, ScratchRSrcReg);
667 }
668
669 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
670 if (LongBranchReservedReg)
671 reserveRegisterTuples(Reserved, LongBranchReservedReg);
672
673 // We have to assume the SP is needed in case there are calls in the function,
674 // which is detected after the function is lowered. If we aren't really going
675 // to need SP, don't bother reserving it.
676 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
677 if (StackPtrReg) {
678 reserveRegisterTuples(Reserved, StackPtrReg);
679 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
680 }
681
682 MCRegister FrameReg = MFI->getFrameOffsetReg();
683 if (FrameReg) {
684 reserveRegisterTuples(Reserved, FrameReg);
685 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
686 }
687
688 if (hasBasePointer(MF)) {
689 MCRegister BasePtrReg = getBaseRegister();
690 reserveRegisterTuples(Reserved, BasePtrReg);
691 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
692 }
693
694 // FIXME: Use same reserved register introduced in D149775
695 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
696 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
697 if (ExecCopyReg)
698 reserveRegisterTuples(Reserved, ExecCopyReg);
699
700 // Reserve VGPRs/AGPRs.
701 //
702 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
703
704 for (const TargetRegisterClass *RC : regclasses()) {
705 if (RC->isBaseClass() && isVGPRClass(RC)) {
706 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
707 for (MCPhysReg Reg : *RC) {
708 unsigned Index = getHWRegIndex(Reg);
709 if (Index + NumRegs > MaxNumVGPRs)
710 Reserved.set(Reg);
711 }
712 }
713 }
714
715 // Reserve all the AGPRs if there are no instructions to use it.
716 if (!ST.hasMAIInsts())
717 MaxNumAGPRs = 0;
718 for (const TargetRegisterClass *RC : regclasses()) {
719 if (RC->isBaseClass() && isAGPRClass(RC)) {
720 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
721 for (MCPhysReg Reg : *RC) {
722 unsigned Index = getHWRegIndex(Reg);
723 if (Index + NumRegs > MaxNumAGPRs)
724 Reserved.set(Reg);
725 }
726 }
727 }
728
729 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
730 // VGPR available at all times.
731 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
732 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
733 }
734
735 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
736 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
737 // wwm-regalloc and it would be empty otherwise.
738 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
739 if (!NonWWMRegMask.empty()) {
740 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
741 RegI < RegE; ++RegI) {
742 if (NonWWMRegMask.test(RegI))
743 reserveRegisterTuples(Reserved, RegI);
744 }
745 }
746
747 for (Register Reg : MFI->getWWMReservedRegs())
748 reserveRegisterTuples(Reserved, Reg);
749
750 // FIXME: Stop using reserved registers for this.
751 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
752 reserveRegisterTuples(Reserved, Reg);
753
754 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
755 reserveRegisterTuples(Reserved, Reg);
756
757 return Reserved;
758}
759
761 MCRegister PhysReg) const {
762 return !MF.getRegInfo().isReserved(PhysReg);
763}
764
767 // On entry or in chain functions, the base address is 0, so it can't possibly
768 // need any more alignment.
769
770 // FIXME: Should be able to specify the entry frame alignment per calling
771 // convention instead.
772 if (Info->isBottomOfStack())
773 return false;
774
776}
777
780 if (Info->isEntryFunction()) {
781 const MachineFrameInfo &MFI = Fn.getFrameInfo();
782 return MFI.hasStackObjects() || MFI.hasCalls();
783 }
784
785 // May need scavenger for dealing with callee saved registers.
786 return true;
787}
788
790 const MachineFunction &MF) const {
791 // Do not use frame virtual registers. They used to be used for SGPRs, but
792 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
793 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
794 // spill.
795 return false;
796}
797
799 const MachineFunction &MF) const {
800 const MachineFrameInfo &MFI = MF.getFrameInfo();
801 return MFI.hasStackObjects();
802}
803
805 const MachineFunction &) const {
806 // There are no special dedicated stack or frame pointers.
807 return true;
808}
809
812
813 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
814 AMDGPU::OpName::offset);
815 return MI->getOperand(OffIdx).getImm();
816}
817
819 int Idx) const {
820 switch (MI->getOpcode()) {
821 case AMDGPU::V_ADD_U32_e32:
822 case AMDGPU::V_ADD_U32_e64:
823 case AMDGPU::V_ADD_CO_U32_e32: {
824 int OtherIdx = Idx == 1 ? 2 : 1;
825 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
826 return OtherOp.isImm() ? OtherOp.getImm() : 0;
827 }
828 case AMDGPU::V_ADD_CO_U32_e64: {
829 int OtherIdx = Idx == 2 ? 3 : 2;
830 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
831 return OtherOp.isImm() ? OtherOp.getImm() : 0;
832 }
833 default:
834 break;
835 }
836
838 return 0;
839
840 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
841 AMDGPU::OpName::vaddr) ||
842 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
843 AMDGPU::OpName::saddr))) &&
844 "Should never see frame index on non-address operand");
845
847}
848
850 const MachineInstr &MI) {
851 assert(MI.getDesc().isAdd());
852 const MachineOperand &Src0 = MI.getOperand(1);
853 const MachineOperand &Src1 = MI.getOperand(2);
854
855 if (Src0.isFI()) {
856 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
857 Src1.getReg()));
858 }
859
860 if (Src1.isFI()) {
861 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
862 Src0.getReg()));
863 }
864
865 return false;
866}
867
869 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
870 switch (MI->getOpcode()) {
871 case AMDGPU::V_ADD_U32_e32: {
872 // TODO: We could handle this but it requires work to avoid violating
873 // operand restrictions.
874 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
875 !isFIPlusImmOrVGPR(*this, *MI))
876 return false;
877 [[fallthrough]];
878 }
879 case AMDGPU::V_ADD_U32_e64:
880 // FIXME: This optimization is barely profitable as-is when flat scratch is
881 // enabled.
882 //
883 // Much of the benefit with the MUBUF handling is we avoid duplicating the
884 // shift of the frame register, which isn't needed with scratch.
885 //
886 // materializeFrameBaseRegister doesn't know the register classes of the
887 // uses, and unconditionally uses an s_add_i32, which will end up using a
888 // copy for the vector uses.
889 return !ST.hasFlatScratchEnabled();
890 case AMDGPU::V_ADD_CO_U32_e32:
891 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
892 !isFIPlusImmOrVGPR(*this, *MI))
893 return false;
894 // We can't deal with the case where the carry out has a use (though this
895 // should never happen)
896 return MI->getOperand(3).isDead();
897 case AMDGPU::V_ADD_CO_U32_e64:
898 // TODO: Should we check use_empty instead?
899 return MI->getOperand(1).isDead();
900 default:
901 break;
902 }
903
905 return false;
906
907 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
908
909 const SIInstrInfo *TII = ST.getInstrInfo();
911 return !TII->isLegalMUBUFImmOffset(FullOffset);
912
913 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
915}
916
918 int FrameIdx,
919 int64_t Offset) const {
920 MachineBasicBlock::iterator Ins = MBB->begin();
921 DebugLoc DL; // Defaults to "unknown"
922
923 if (Ins != MBB->end())
924 DL = Ins->getDebugLoc();
925
926 MachineFunction *MF = MBB->getParent();
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 MachineRegisterInfo &MRI = MF->getRegInfo();
929 unsigned MovOpc =
930 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
931
932 Register BaseReg = MRI.createVirtualRegister(
933 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
934 : &AMDGPU::VGPR_32RegClass);
935
936 if (Offset == 0) {
937 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
938 .addFrameIndex(FrameIdx);
939 return BaseReg;
940 }
941
942 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
943
944 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
945 ? &AMDGPU::SReg_32_XM0RegClass
946 : &AMDGPU::VGPR_32RegClass);
947
948 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
949 .addImm(Offset);
950 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
951 .addFrameIndex(FrameIdx);
952
953 if (ST.hasFlatScratchEnabled()) {
954 // FIXME: Make sure scc isn't live in.
955 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
956 .addReg(OffsetReg, RegState::Kill)
957 .addReg(FIReg)
958 .setOperandDead(3); // scc
959 return BaseReg;
960 }
961
962 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
963 .addReg(OffsetReg, RegState::Kill)
964 .addReg(FIReg)
965 .addImm(0); // clamp bit
966
967 return BaseReg;
968}
969
971 int64_t Offset) const {
972 const SIInstrInfo *TII = ST.getInstrInfo();
973
974 switch (MI.getOpcode()) {
975 case AMDGPU::V_ADD_U32_e32:
976 case AMDGPU::V_ADD_CO_U32_e32: {
977 MachineOperand *FIOp = &MI.getOperand(2);
978 MachineOperand *ImmOp = &MI.getOperand(1);
979 if (!FIOp->isFI())
980 std::swap(FIOp, ImmOp);
981
982 if (!ImmOp->isImm()) {
983 assert(Offset == 0);
984 FIOp->ChangeToRegister(BaseReg, false);
985 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
986 return;
987 }
988
989 int64_t TotalOffset = ImmOp->getImm() + Offset;
990 if (TotalOffset == 0) {
991 MI.setDesc(TII->get(AMDGPU::COPY));
992 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
993 MI.removeOperand(I);
994
995 MI.getOperand(1).ChangeToRegister(BaseReg, false);
996 return;
997 }
998
999 ImmOp->setImm(TotalOffset);
1000
1001 MachineBasicBlock *MBB = MI.getParent();
1002 MachineFunction *MF = MBB->getParent();
1003 MachineRegisterInfo &MRI = MF->getRegInfo();
1004
1005 // FIXME: materializeFrameBaseRegister does not know the register class of
1006 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
1007 // Emit a copy so we have a legal operand and hope the register coalescer
1008 // can clean it up.
1009 if (isSGPRReg(MRI, BaseReg)) {
1010 Register BaseRegVGPR =
1011 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1013 .addReg(BaseReg);
1014 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1015 } else {
1016 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1017 }
1018 return;
1019 }
1020 case AMDGPU::V_ADD_U32_e64:
1021 case AMDGPU::V_ADD_CO_U32_e64: {
1022 int Src0Idx = MI.getNumExplicitDefs();
1023 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1024 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1025 if (!FIOp->isFI())
1026 std::swap(FIOp, ImmOp);
1027
1028 if (!ImmOp->isImm()) {
1029 FIOp->ChangeToRegister(BaseReg, false);
1030 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1031 return;
1032 }
1033
1034 int64_t TotalOffset = ImmOp->getImm() + Offset;
1035 if (TotalOffset == 0) {
1036 MI.setDesc(TII->get(AMDGPU::COPY));
1037
1038 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1039 MI.removeOperand(I);
1040
1041 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1042 } else {
1043 FIOp->ChangeToRegister(BaseReg, false);
1044 ImmOp->setImm(TotalOffset);
1045 }
1046
1047 return;
1048 }
1049 default:
1050 break;
1051 }
1052
1053 bool IsFlat = TII->isFLATScratch(MI);
1054
1055#ifndef NDEBUG
1056 // FIXME: Is it possible to be storing a frame index to itself?
1057 bool SeenFI = false;
1058 for (const MachineOperand &MO: MI.operands()) {
1059 if (MO.isFI()) {
1060 if (SeenFI)
1061 llvm_unreachable("should not see multiple frame indices");
1062
1063 SeenFI = true;
1064 }
1065 }
1066#endif
1067
1068 MachineOperand *FIOp =
1069 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1070 : AMDGPU::OpName::vaddr);
1071
1072 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1073 int64_t NewOffset = OffsetOp->getImm() + Offset;
1074
1075 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1076 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1077
1078 if (IsFlat) {
1079 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1081 "offset should be legal");
1082 FIOp->ChangeToRegister(BaseReg, false);
1083 OffsetOp->setImm(NewOffset);
1084 return;
1085 }
1086
1087#ifndef NDEBUG
1088 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1089 assert(SOffset->isImm() && SOffset->getImm() == 0);
1090#endif
1091
1092 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1093
1094 FIOp->ChangeToRegister(BaseReg, false);
1095 OffsetOp->setImm(NewOffset);
1096}
1097
1099 Register BaseReg,
1100 int64_t Offset) const {
1101
1102 switch (MI->getOpcode()) {
1103 case AMDGPU::V_ADD_U32_e32:
1104 case AMDGPU::V_ADD_CO_U32_e32:
1105 return true;
1106 case AMDGPU::V_ADD_U32_e64:
1107 case AMDGPU::V_ADD_CO_U32_e64:
1108 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1109 default:
1110 break;
1111 }
1112
1114 return false;
1115
1116 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1117
1118 const SIInstrInfo *TII = ST.getInstrInfo();
1120 return TII->isLegalMUBUFImmOffset(NewOffset);
1121
1122 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1124}
1125
1126const TargetRegisterClass *
1128 // This is inaccurate. It depends on the instruction and address space. The
1129 // only place where we should hit this is for dealing with frame indexes /
1130 // private accesses, so this is correct in that case.
1131 return &AMDGPU::VGPR_32RegClass;
1132}
1133
1134const TargetRegisterClass *
1136 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1137}
1138
1140 const SIInstrInfo *TII) {
1141
1142 unsigned Op = MI.getOpcode();
1143 switch (Op) {
1144 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1145 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
1146 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1147 // FIXME: This assumes the mask is statically known and not computed at
1148 // runtime. However, some ABIs may want to compute the mask dynamically and
1149 // this will need to be updated.
1150 return llvm::popcount(
1151 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1152 case AMDGPU::SI_SPILL_S1024_SAVE:
1153 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
1154 case AMDGPU::SI_SPILL_S1024_RESTORE:
1155 case AMDGPU::SI_SPILL_V1024_SAVE:
1156 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
1157 case AMDGPU::SI_SPILL_V1024_RESTORE:
1158 case AMDGPU::SI_SPILL_A1024_SAVE:
1159 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
1160 case AMDGPU::SI_SPILL_A1024_RESTORE:
1161 case AMDGPU::SI_SPILL_AV1024_SAVE:
1162 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
1163 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1164 return 32;
1165 case AMDGPU::SI_SPILL_S512_SAVE:
1166 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
1167 case AMDGPU::SI_SPILL_S512_RESTORE:
1168 case AMDGPU::SI_SPILL_V512_SAVE:
1169 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
1170 case AMDGPU::SI_SPILL_V512_RESTORE:
1171 case AMDGPU::SI_SPILL_A512_SAVE:
1172 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
1173 case AMDGPU::SI_SPILL_A512_RESTORE:
1174 case AMDGPU::SI_SPILL_AV512_SAVE:
1175 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
1176 case AMDGPU::SI_SPILL_AV512_RESTORE:
1177 return 16;
1178 case AMDGPU::SI_SPILL_S384_SAVE:
1179 case AMDGPU::SI_SPILL_S384_RESTORE:
1180 case AMDGPU::SI_SPILL_V384_SAVE:
1181 case AMDGPU::SI_SPILL_V384_RESTORE:
1182 case AMDGPU::SI_SPILL_A384_SAVE:
1183 case AMDGPU::SI_SPILL_A384_RESTORE:
1184 case AMDGPU::SI_SPILL_AV384_SAVE:
1185 case AMDGPU::SI_SPILL_AV384_RESTORE:
1186 return 12;
1187 case AMDGPU::SI_SPILL_S352_SAVE:
1188 case AMDGPU::SI_SPILL_S352_RESTORE:
1189 case AMDGPU::SI_SPILL_V352_SAVE:
1190 case AMDGPU::SI_SPILL_V352_RESTORE:
1191 case AMDGPU::SI_SPILL_A352_SAVE:
1192 case AMDGPU::SI_SPILL_A352_RESTORE:
1193 case AMDGPU::SI_SPILL_AV352_SAVE:
1194 case AMDGPU::SI_SPILL_AV352_RESTORE:
1195 return 11;
1196 case AMDGPU::SI_SPILL_S320_SAVE:
1197 case AMDGPU::SI_SPILL_S320_RESTORE:
1198 case AMDGPU::SI_SPILL_V320_SAVE:
1199 case AMDGPU::SI_SPILL_V320_RESTORE:
1200 case AMDGPU::SI_SPILL_A320_SAVE:
1201 case AMDGPU::SI_SPILL_A320_RESTORE:
1202 case AMDGPU::SI_SPILL_AV320_SAVE:
1203 case AMDGPU::SI_SPILL_AV320_RESTORE:
1204 return 10;
1205 case AMDGPU::SI_SPILL_S288_SAVE:
1206 case AMDGPU::SI_SPILL_S288_RESTORE:
1207 case AMDGPU::SI_SPILL_V288_SAVE:
1208 case AMDGPU::SI_SPILL_V288_RESTORE:
1209 case AMDGPU::SI_SPILL_A288_SAVE:
1210 case AMDGPU::SI_SPILL_A288_RESTORE:
1211 case AMDGPU::SI_SPILL_AV288_SAVE:
1212 case AMDGPU::SI_SPILL_AV288_RESTORE:
1213 return 9;
1214 case AMDGPU::SI_SPILL_S256_SAVE:
1215 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
1216 case AMDGPU::SI_SPILL_S256_RESTORE:
1217 case AMDGPU::SI_SPILL_V256_SAVE:
1218 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
1219 case AMDGPU::SI_SPILL_V256_RESTORE:
1220 case AMDGPU::SI_SPILL_A256_SAVE:
1221 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
1222 case AMDGPU::SI_SPILL_A256_RESTORE:
1223 case AMDGPU::SI_SPILL_AV256_SAVE:
1224 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
1225 case AMDGPU::SI_SPILL_AV256_RESTORE:
1226 return 8;
1227 case AMDGPU::SI_SPILL_S224_SAVE:
1228 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
1229 case AMDGPU::SI_SPILL_S224_RESTORE:
1230 case AMDGPU::SI_SPILL_V224_SAVE:
1231 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
1232 case AMDGPU::SI_SPILL_V224_RESTORE:
1233 case AMDGPU::SI_SPILL_A224_SAVE:
1234 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
1235 case AMDGPU::SI_SPILL_A224_RESTORE:
1236 case AMDGPU::SI_SPILL_AV224_SAVE:
1237 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
1238 case AMDGPU::SI_SPILL_AV224_RESTORE:
1239 return 7;
1240 case AMDGPU::SI_SPILL_S192_SAVE:
1241 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
1242 case AMDGPU::SI_SPILL_S192_RESTORE:
1243 case AMDGPU::SI_SPILL_V192_SAVE:
1244 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
1245 case AMDGPU::SI_SPILL_V192_RESTORE:
1246 case AMDGPU::SI_SPILL_A192_SAVE:
1247 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
1248 case AMDGPU::SI_SPILL_A192_RESTORE:
1249 case AMDGPU::SI_SPILL_AV192_SAVE:
1250 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
1251 case AMDGPU::SI_SPILL_AV192_RESTORE:
1252 return 6;
1253 case AMDGPU::SI_SPILL_S160_SAVE:
1254 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
1255 case AMDGPU::SI_SPILL_S160_RESTORE:
1256 case AMDGPU::SI_SPILL_V160_SAVE:
1257 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
1258 case AMDGPU::SI_SPILL_V160_RESTORE:
1259 case AMDGPU::SI_SPILL_A160_SAVE:
1260 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
1261 case AMDGPU::SI_SPILL_A160_RESTORE:
1262 case AMDGPU::SI_SPILL_AV160_SAVE:
1263 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
1264 case AMDGPU::SI_SPILL_AV160_RESTORE:
1265 return 5;
1266 case AMDGPU::SI_SPILL_S128_SAVE:
1267 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
1268 case AMDGPU::SI_SPILL_S128_RESTORE:
1269 case AMDGPU::SI_SPILL_V128_SAVE:
1270 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
1271 case AMDGPU::SI_SPILL_V128_RESTORE:
1272 case AMDGPU::SI_SPILL_A128_SAVE:
1273 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
1274 case AMDGPU::SI_SPILL_A128_RESTORE:
1275 case AMDGPU::SI_SPILL_AV128_SAVE:
1276 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
1277 case AMDGPU::SI_SPILL_AV128_RESTORE:
1278 return 4;
1279 case AMDGPU::SI_SPILL_S96_SAVE:
1280 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
1281 case AMDGPU::SI_SPILL_S96_RESTORE:
1282 case AMDGPU::SI_SPILL_V96_SAVE:
1283 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
1284 case AMDGPU::SI_SPILL_V96_RESTORE:
1285 case AMDGPU::SI_SPILL_A96_SAVE:
1286 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
1287 case AMDGPU::SI_SPILL_A96_RESTORE:
1288 case AMDGPU::SI_SPILL_AV96_SAVE:
1289 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
1290 case AMDGPU::SI_SPILL_AV96_RESTORE:
1291 return 3;
1292 case AMDGPU::SI_SPILL_S64_SAVE:
1293 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
1294 case AMDGPU::SI_SPILL_S64_RESTORE:
1295 case AMDGPU::SI_SPILL_V64_SAVE:
1296 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
1297 case AMDGPU::SI_SPILL_V64_RESTORE:
1298 case AMDGPU::SI_SPILL_A64_SAVE:
1299 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
1300 case AMDGPU::SI_SPILL_A64_RESTORE:
1301 case AMDGPU::SI_SPILL_AV64_SAVE:
1302 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
1303 case AMDGPU::SI_SPILL_AV64_RESTORE:
1304 return 2;
1305 case AMDGPU::SI_SPILL_S32_SAVE:
1306 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
1307 case AMDGPU::SI_SPILL_S32_RESTORE:
1308 case AMDGPU::SI_SPILL_V32_SAVE:
1309 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
1310 case AMDGPU::SI_SPILL_V32_RESTORE:
1311 case AMDGPU::SI_SPILL_A32_SAVE:
1312 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
1313 case AMDGPU::SI_SPILL_A32_RESTORE:
1314 case AMDGPU::SI_SPILL_AV32_SAVE:
1315 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
1316 case AMDGPU::SI_SPILL_AV32_RESTORE:
1317 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1318 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1319 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1320 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1321 case AMDGPU::SI_SPILL_V16_SAVE:
1322 case AMDGPU::SI_SPILL_V16_RESTORE:
1323 return 1;
1324 default: llvm_unreachable("Invalid spill opcode");
1325 }
1326}
1327
1328static int getOffsetMUBUFStore(unsigned Opc) {
1329 switch (Opc) {
1330 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1331 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1332 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1333 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1334 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1335 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1336 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1337 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1338 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1339 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1340 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1341 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1342 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1343 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1344 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1345 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1346 default:
1347 return -1;
1348 }
1349}
1350
1351static int getOffsetMUBUFLoad(unsigned Opc) {
1352 switch (Opc) {
1353 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1354 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1355 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1356 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1357 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1358 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1359 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1360 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1361 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1362 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1363 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1364 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1365 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1366 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1367 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1368 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1369 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1370 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1371 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1372 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1373 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1374 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1375 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1376 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1377 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1378 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1379 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1380 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1381 default:
1382 return -1;
1383 }
1384}
1385
1386static int getOffenMUBUFStore(unsigned Opc) {
1387 switch (Opc) {
1388 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1389 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1390 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1391 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1392 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1393 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1394 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1395 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1396 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1397 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1398 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1399 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1400 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1401 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1402 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1403 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1404 default:
1405 return -1;
1406 }
1407}
1408
1409static int getOffenMUBUFLoad(unsigned Opc) {
1410 switch (Opc) {
1411 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1412 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1413 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1414 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1415 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1416 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1417 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1418 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1419 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1420 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1421 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1422 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1423 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1424 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1425 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1426 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1427 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1428 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1429 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1430 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1431 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1432 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1433 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1434 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1435 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1436 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1437 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1438 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1439 default:
1440 return -1;
1441 }
1442}
1443
1446 MachineBasicBlock::iterator MI, int Index, unsigned Lane,
1447 unsigned ValueReg, bool IsKill, bool NeedsCFI) {
1448 MachineFunction *MF = MBB.getParent();
1450 const SIInstrInfo *TII = ST.getInstrInfo();
1451 const SIFrameLowering *TFL = ST.getFrameLowering();
1452
1453 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1454
1455 if (Reg == AMDGPU::NoRegister)
1456 return MachineInstrBuilder();
1457
1458 bool IsStore = MI->mayStore();
1459 MachineRegisterInfo &MRI = MF->getRegInfo();
1460 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1461
1462 unsigned Dst = IsStore ? Reg : ValueReg;
1463 unsigned Src = IsStore ? ValueReg : Reg;
1464 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1465 const DebugLoc &DL = MI->getDebugLoc();
1466 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1467 // Spiller during regalloc may restore a spilled register to its superclass.
1468 // It could result in AGPR spills restored to VGPRs or the other way around,
1469 // making the src and dst with identical regclasses at this point. It just
1470 // needs a copy in such cases.
1471 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1472 .addReg(Src, getKillRegState(IsKill));
1474 if (NeedsCFI)
1475 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1476 return CopyMIB;
1477 }
1478 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1479 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1480
1481 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1482 .addReg(Src, getKillRegState(IsKill));
1484 if (NeedsCFI)
1485 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1486 return MIB;
1487}
1488
1489// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1490// need to handle the case where an SGPR may need to be spilled while spilling.
1492 MachineFrameInfo &MFI,
1494 int Index,
1495 int64_t Offset) {
1496 const SIInstrInfo *TII = ST.getInstrInfo();
1497 MachineBasicBlock *MBB = MI->getParent();
1498 const DebugLoc &DL = MI->getDebugLoc();
1499 bool IsStore = MI->mayStore();
1500
1501 unsigned Opc = MI->getOpcode();
1502 int LoadStoreOp = IsStore ?
1504 if (LoadStoreOp == -1)
1505 return false;
1506
1507 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1508 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false, false)
1509 .getInstr())
1510 return true;
1511
1512 MachineInstrBuilder NewMI =
1513 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1514 .add(*Reg)
1515 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1516 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1517 .addImm(Offset)
1518 .addImm(0) // cpol
1519 .addImm(0) // swz
1520 .cloneMemRefs(*MI);
1521
1522 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1523 AMDGPU::OpName::vdata_in);
1524 if (VDataIn)
1525 NewMI.add(*VDataIn);
1526 return true;
1527}
1528
1530 unsigned LoadStoreOp,
1531 unsigned EltSize) {
1532 bool IsStore = TII->get(LoadStoreOp).mayStore();
1533 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1534 bool UseST =
1535 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1536
1537 // Handle block load/store first.
1538 if (TII->isBlockLoadStore(LoadStoreOp))
1539 return LoadStoreOp;
1540
1541 switch (EltSize) {
1542 case 4:
1543 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1544 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1545 break;
1546 case 8:
1547 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1548 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1549 break;
1550 case 12:
1551 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1552 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1553 break;
1554 case 16:
1555 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1556 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1557 break;
1558 default:
1559 llvm_unreachable("Unexpected spill load/store size!");
1560 }
1561
1562 if (HasVAddr)
1563 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1564 else if (UseST)
1565 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1566
1567 return LoadStoreOp;
1568}
1569
1572 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1573 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1574 RegScavenger *RS, LiveRegUnits *LiveUnits, bool NeedsCFI) const {
1575 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1576
1577 MachineFunction *MF = MBB.getParent();
1578 const SIInstrInfo *TII = ST.getInstrInfo();
1579 const MachineFrameInfo &MFI = MF->getFrameInfo();
1580 const SIFrameLowering *TFL = ST.getFrameLowering();
1581 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1582
1583 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1584 bool IsStore = Desc->mayStore();
1585 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1586 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1587
1588 bool CanClobberSCC = false;
1589 bool Scavenged = false;
1590 MCRegister SOffset = ScratchOffsetReg;
1591
1592 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1593 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1594 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1595 unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1596
1597 // On targets with register tuple alignment requirements,
1598 // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
1599 // and spill the rest as a regular aligned tuple.
1600 // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1601 // will be spilt as:
1602 // SPILL_SCRATCH_DWORD $vgpr1
1603 // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
1604 // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
1605 bool IsRegMisaligned = false;
1606 if (!IsBlock && !IsAGPR && RegWidth > 4) {
1607 unsigned SpillOpcode =
1608 getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
1609 int VDataIdx =
1610 IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
1611 : 0; // Restore Ops have data reg as the first (output) operand.
1612 const TargetRegisterClass *ExpectedRC =
1613 TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
1614 if (!ExpectedRC->contains(ValueReg)) {
1615 unsigned NumRegs = std::min(AMDGPU::getRegBitWidth(*ExpectedRC) / 4, 4u);
1616 unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
1617 const TargetRegisterClass *MatchRC =
1618 getMatchingSuperRegClass(RC, ExpectedRC, SubIdx);
1619 if (!MatchRC || !MatchRC->contains(ValueReg))
1620 IsRegMisaligned = true;
1621 }
1622 }
1623 // The first sub-register will be spilled as a 32-bit value
1624 if (IsRegMisaligned)
1625 RegWidth -= 4u;
1626 // Always use 4 byte operations for AGPRs because we need to scavenge
1627 // a temporary VGPR.
1628 // If we're using a block operation, the element should be the whole block.
1629 unsigned EltSize = IsBlock ? RegWidth
1630 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1631 : 4u;
1632 unsigned NumSubRegs = RegWidth / EltSize;
1633 unsigned Size = NumSubRegs * EltSize;
1634 unsigned RemSize = RegWidth - Size;
1635 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1636 // An additional sub-register is needed to spill the misaligned component.
1637 if (IsRegMisaligned)
1638 NumSubRegs += 1;
1639 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1640 int64_t MaterializedOffset = Offset;
1641
1642 // Maxoffset is the starting offset for the last chunk to be spilled.
1643 // In case of non-zero remainder element, max offset will be the
1644 // last address(offset + Size) after spilling all the EltSize chunks.
1645 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1646 int64_t ScratchOffsetRegDelta = 0;
1647 int64_t AdditionalCFIOffset = 0;
1648
1649 if (IsFlat && EltSize > 4) {
1650 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1651 Desc = &TII->get(LoadStoreOp);
1652 }
1653
1654 Align Alignment = MFI.getObjectAlign(Index);
1655 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1656
1657 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1658 "unexpected VGPR spill offset");
1659
1660 // Track a VGPR to use for a constant offset we need to materialize.
1661 Register TmpOffsetVGPR;
1662
1663 // Track a VGPR to use as an intermediate value.
1664 Register TmpIntermediateVGPR;
1665 bool UseVGPROffset = false;
1666
1667 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1668 // combination.
1669 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1670 int64_t VOffset) {
1671 // We are using a VGPR offset
1672 if (IsFlat && SGPRBase) {
1673 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1674 // SGPR, so perform the add as vector.
1675 // We don't need a base SGPR in the kernel.
1676
1677 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1678 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1679 .addReg(SGPRBase)
1680 .addImm(VOffset)
1681 .addImm(0); // clamp
1682 } else {
1683 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1684 .addReg(SGPRBase);
1685 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1686 .addImm(VOffset)
1687 .addReg(TmpOffsetVGPR);
1688 }
1689 } else {
1690 assert(TmpOffsetVGPR);
1691 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1692 .addImm(VOffset);
1693 }
1694 };
1695
1696 bool IsOffsetLegal =
1697 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1699 : TII->isLegalMUBUFImmOffset(MaxOffset);
1700 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1701 SOffset = MCRegister();
1702
1703 // We don't have access to the register scavenger if this function is called
1704 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1705 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1706 // entry.
1707 if (RS) {
1708 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1709
1710 // Piggy back on the liveness scan we just did see if SCC is dead.
1711 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1712 } else if (LiveUnits) {
1713 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1714 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1715 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1716 SOffset = Reg;
1717 break;
1718 }
1719 }
1720 }
1721
1722 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1723 SOffset = Register();
1724
1725 if (!SOffset) {
1726 UseVGPROffset = true;
1727
1728 if (RS) {
1729 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1730 } else {
1731 assert(LiveUnits);
1732 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1733 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1734 TmpOffsetVGPR = Reg;
1735 break;
1736 }
1737 }
1738 }
1739
1740 assert(TmpOffsetVGPR);
1741 } else if (!SOffset && CanClobberSCC) {
1742 // There are no free SGPRs, and since we are in the process of spilling
1743 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1744 // on SI/CI and on VI it is true until we implement spilling using scalar
1745 // stores), we have no way to free up an SGPR. Our solution here is to
1746 // add the offset directly to the ScratchOffset or StackPtrOffset
1747 // register, and then subtract the offset after the spill to return the
1748 // register to its original value.
1749
1750 // TODO: If we don't have to do an emergency stack slot spill, converting
1751 // to use the VGPR offset is fewer instructions.
1752 if (!ScratchOffsetReg)
1753 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1754 SOffset = ScratchOffsetReg;
1755 ScratchOffsetRegDelta = Offset;
1756 } else {
1757 Scavenged = true;
1758 }
1759
1760 AdditionalCFIOffset = Offset;
1761 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1762 // we can simplify the adjustment of Offset here to just scale with
1763 // WavefrontSize.
1764 if (!IsFlat && !UseVGPROffset)
1765 Offset *= ST.getWavefrontSize();
1766
1767 if (!UseVGPROffset && !SOffset)
1768 report_fatal_error("could not scavenge SGPR to spill in entry function");
1769
1770 if (UseVGPROffset) {
1771 // We are using a VGPR offset
1772 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1773 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1774 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1775 } else {
1776 assert(Offset != 0);
1777 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1778 .addReg(ScratchOffsetReg)
1779 .addImm(Offset);
1780 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1781 }
1782
1783 Offset = 0;
1784 }
1785
1786 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1787 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1788 && "Unexpected vaddr for flat scratch with a FI operand");
1789
1790 if (UseVGPROffset) {
1791 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1792 } else {
1793 assert(ST.hasFlatScratchSTMode());
1794 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1795 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1796 }
1797
1798 Desc = &TII->get(LoadStoreOp);
1799 }
1800
1801 // Save a copy of the original element size before it's potentially changed for
1802 // misaligned tuples.
1803 unsigned OrigEltSize = EltSize;
1804 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1805 ++i, RegOffset += EltSize) {
1806 if (IsRegMisaligned) {
1807 if (i == 0) {
1808 // For misaligned register tuples, spill only the first sub-reg in the
1809 // first iteration.
1810 EltSize = 4u;
1811 } else {
1812 // The misaligned register was spilt. Now the rest of the tuple is
1813 // properly aligned.
1814 IsRegMisaligned = false;
1815 EltSize = OrigEltSize;
1816 }
1817 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1818 }
1819 if (i == NumSubRegs) {
1820 EltSize = RemSize;
1821 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1822 }
1823 Desc = &TII->get(LoadStoreOp);
1824
1825 if (!IsFlat && UseVGPROffset) {
1826 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1827 : getOffenMUBUFLoad(LoadStoreOp);
1828 Desc = &TII->get(NewLoadStoreOp);
1829 }
1830
1831 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1832 // If we are spilling an AGPR beyond the range of the memory instruction
1833 // offset and need to use a VGPR offset, we ideally have at least 2
1834 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1835 // recycle the VGPR used for the offset which requires resetting after
1836 // each subregister.
1837
1838 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1839 }
1840
1841 unsigned NumRegs = EltSize / 4;
1842 Register SubReg = e == 1
1843 ? ValueReg
1844 : Register(getSubReg(ValueReg,
1845 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1846
1847 RegState SOffsetRegState = {};
1848 RegState SrcDstRegState = getDefRegState(!IsStore);
1849 const bool IsLastSubReg = i + 1 == e;
1850 const bool IsFirstSubReg = i == 0;
1851 if (IsLastSubReg) {
1852 SOffsetRegState |= getKillRegState(Scavenged);
1853 // The last implicit use carries the "Kill" flag.
1854 SrcDstRegState |= getKillRegState(IsKill);
1855 }
1856
1857 // Make sure the whole register is defined if there are undef components by
1858 // adding an implicit def of the super-reg on the first instruction.
1859 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1860 bool NeedSuperRegImpOperand = e > 1;
1861
1862 // Remaining element size to spill into memory after some parts of it
1863 // spilled into either AGPRs or VGPRs.
1864 unsigned RemEltSize = EltSize;
1865
1866 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1867 // starting from the last lane. In case if a register cannot be completely
1868 // spilled into another register that will ensure its alignment does not
1869 // change. For targets with VGPR alignment requirement this is important
1870 // in case of flat scratch usage as we might get a scratch_load or
1871 // scratch_store of an unaligned register otherwise.
1872 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1873 LaneE = RegOffset / 4;
1874 Lane >= LaneE; --Lane) {
1875 bool IsSubReg = e > 1 || EltSize > 4;
1876 Register Sub = IsSubReg
1877 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1878 : ValueReg;
1879 auto MIB =
1880 spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill, NeedsCFI);
1881 if (!MIB.getInstr())
1882 break;
1883 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1884 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1885 NeedSuperRegDef = false;
1886 }
1887 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1888 NeedSuperRegImpOperand = true;
1889 RegState State = SrcDstRegState;
1890 if (!IsLastSubReg || (Lane != LaneE))
1891 State &= ~RegState::Kill;
1892 if (!IsFirstSubReg || (Lane != LaneS))
1893 State &= ~RegState::Define;
1894 MIB.addReg(ValueReg, RegState::Implicit | State);
1895 }
1896 RemEltSize -= 4;
1897 }
1898
1899 if (!RemEltSize) // Fully spilled into AGPRs.
1900 continue;
1901
1902 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1903 assert(IsFlat && EltSize > 4);
1904
1905 unsigned NumRegs = RemEltSize / 4;
1906 SubReg = Register(getSubReg(ValueReg,
1907 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1908 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1909 Desc = &TII->get(Opc);
1910 }
1911
1912 unsigned FinalReg = SubReg;
1913
1914 if (IsAGPR) {
1915 assert(EltSize == 4);
1916
1917 if (!TmpIntermediateVGPR) {
1918 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1919 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1920 }
1921 if (IsStore) {
1922 auto AccRead = BuildMI(MBB, MI, DL,
1923 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1924 TmpIntermediateVGPR)
1925 .addReg(SubReg, getKillRegState(IsKill));
1926 if (NeedSuperRegDef)
1927 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1928 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1929 AccRead.addReg(ValueReg, RegState::Implicit);
1931 }
1932 SubReg = TmpIntermediateVGPR;
1933 } else if (UseVGPROffset) {
1934 if (!TmpOffsetVGPR) {
1935 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1936 MI, false, 0);
1937 RS->setRegUsed(TmpOffsetVGPR);
1938 }
1939 }
1940
1941 Register FinalValueReg = ValueReg;
1942 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1943 // If we are loading a 16-bit value with SRAMECC enabled we need a temp
1944 // 32-bit VGPR to load and extract 16-bits into the final register.
1945 ValueReg =
1946 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1947 SubReg = ValueReg;
1948 IsKill = false;
1949 }
1950
1951 // Create the MMO, additionally set the NonVolatile flag as scratch memory
1952 // used for spills will not be used outside the thread.
1953 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1955 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1956 commonAlignment(Alignment, RegOffset));
1957
1958 auto MIB =
1959 BuildMI(MBB, MI, DL, *Desc)
1960 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1961
1962 if (UseVGPROffset) {
1963 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1964 // intermediate accvgpr_write.
1965 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1966 }
1967
1968 if (!IsFlat)
1969 MIB.addReg(FuncInfo->getScratchRSrcReg());
1970
1971 if (SOffset == AMDGPU::NoRegister) {
1972 if (!IsFlat) {
1973 if (UseVGPROffset && ScratchOffsetReg) {
1974 MIB.addReg(ScratchOffsetReg);
1975 } else {
1976 assert(FuncInfo->isBottomOfStack());
1977 MIB.addImm(0);
1978 }
1979 }
1980 } else {
1981 MIB.addReg(SOffset, SOffsetRegState);
1982 }
1983
1984 MIB.addImm(Offset + RegOffset);
1985
1986 bool LastUse = MMO->getFlags() & MOLastUse;
1987 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1988
1989 if (!IsFlat)
1990 MIB.addImm(0); // swz
1991 MIB.addMemOperand(NewMMO);
1992
1993 if (FinalValueReg != ValueReg) {
1994 // Extract 16-bit from the loaded 32-bit value.
1995 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1996 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1997 .addReg(FinalValueReg, getDefRegState(true))
1998 .addImm(0)
1999 .addReg(ValueReg, getKillRegState(true))
2000 .addImm(0);
2001 ValueReg = FinalValueReg;
2002 }
2003
2004 if (IsStore && NeedsCFI) {
2005 if (TII->isBlockLoadStore(LoadStoreOp)) {
2006 assert(RegOffset == 0 &&
2007 "expected whole register block to be treated as single element");
2009 } else {
2011 MBB, MI, DebugLoc(), SubReg,
2012 (Offset + RegOffset) * ST.getWavefrontSize() + AdditionalCFIOffset);
2013 }
2014 }
2015
2016 if (!IsAGPR && NeedSuperRegDef)
2017 MIB.addReg(ValueReg, RegState::ImplicitDefine);
2018
2019 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
2020 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
2021 FinalReg)
2022 .addReg(TmpIntermediateVGPR, RegState::Kill);
2024 }
2025
2026 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
2027 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
2028 if (NeedSuperRegImpOperand &&
2029 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
2030 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
2031 if (PartialReloadCopy)
2032 MIB.addReg(ValueReg, RegState::Implicit);
2033 }
2034
2035 // The epilog restore of a wwm-scratch register can cause undesired
2036 // optimization during machine-cp post PrologEpilogInserter if the same
2037 // register was assigned for return value ABI lowering with a COPY
2038 // instruction. As given below, with the epilog reload, the earlier COPY
2039 // appeared to be dead during machine-cp.
2040 // ...
2041 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
2042 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
2043 // ...
2044 // Epilog block:
2045 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
2046 // ...
2047 // WWM spill restore to preserve the inactive lanes of v0.
2048 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
2049 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
2050 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
2051 // ...
2052 // SI_RETURN implicit $vgpr0
2053 // ...
2054 // To fix it, mark the same reg as a tied op for such restore instructions
2055 // so that it marks a usage for the preceding COPY.
2056 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
2057 MI->readsRegister(SubReg, this)) {
2058 MIB.addReg(SubReg, RegState::Implicit);
2059 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2060 }
2061
2062 // If we're building a block load, we should add artificial uses for the
2063 // CSR VGPRs that are *not* being transferred. This is because liveness
2064 // analysis is not aware of the mask, so we need to somehow inform it that
2065 // those registers are not available before the load and they should not be
2066 // scavenged.
2067 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
2068 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
2069 }
2070
2071 if (ScratchOffsetRegDelta != 0) {
2072 // Subtract the offset we added to the ScratchOffset register.
2073 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
2074 .addReg(SOffset)
2075 .addImm(-ScratchOffsetRegDelta);
2076 }
2077}
2078
2080 Register BlockReg) const {
// Appends implicit-use operands to MIB for the callee-saved VGPRs inside the
// 32-register block BlockReg whose bit is clear in the block-op mask, i.e. the
// registers that the masked block load does not transfer.
//
// MIB      - builder of the instruction receiving the extra operands.
// BlockReg - register tuple; its sub0 is used as the base VGPR of the block.
//
// NOTE(review): the first line(s) of this signature are missing from this
// listing (gap in the embedded line numbers before 2080).
2081 const MachineFunction *MF = MIB->getMF();
2082 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
// One bit per register of the block, as recorded for BlockReg.
2083 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2084 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// The scan starts at offset 1, so BaseVGPR itself (mask bit 0) is never added
// as an implicit use here.
2085 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
2086 if (!(Mask & (1 << RegOffset)) &&
2087 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
2088 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
2089}
2090
2093 Register BlockReg,
2094 int64_t Offset) const {
// Emits CFI for every callee-saved VGPR in the 32-register block BlockReg:
//  - registers whose mask bit is set get a VGPR-to-VMEM spill CFI entry;
//  - callee-saved registers whose mask bit is clear get a "same value" entry.
//
// BlockReg - register tuple; its sub0 is used as the base VGPR of the block.
// Offset   - added to the per-register index before scaling by the wavefront
//            size (see the NOTE below about units).
//
// NOTE(review): the leading signature lines (which declare MBB and MBBI) are
// missing from this listing (gap in the embedded line numbers before 2093).
2095 const MachineFunction *MF = MBB.getParent();
2096 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2097 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2098 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// Unlike the implicit-use helper for block loads, this walk includes offset 0.
2099 for (unsigned RegOffset = 0; RegOffset < 32; ++RegOffset) {
2100 Register VGPR = BaseVGPR + RegOffset;
2101 if (Mask & (1 << RegOffset)) {
// A register selected by the mask is expected to be callee-saved.
2102 assert(isCalleeSavedPhysReg(VGPR, *MF));
// NOTE(review): RegOffset is a register index (0..31) here and is added to
// Offset before multiplying by the wavefront size. Confirm that Offset is
// expressed in the same unit (the non-block spill path adds a byte offset).
2103 ST.getFrameLowering()->buildCFIForVGPRToVMEMSpill(
2104 MBB, MBBI, DebugLoc(), VGPR,
2105 (Offset + RegOffset) * ST.getWavefrontSize());
2106 } else if (isCalleeSavedPhysReg(VGPR, *MF)) {
2107 // FIXME: This is a workaround for the fact that FrameLowering's
2108 // emitPrologueEntryCFI considers the block load to clobber all registers
2109 // in the block.
// BaseVGPR + RegOffset is the same register as the local VGPR above.
2110 ST.getFrameLowering()->buildCFIForSameValue(MBB, MBBI, DebugLoc(),
2111 BaseVGPR + RegOffset);
2112 }
2113 }
2114}
2115
2117 int Offset, bool IsLoad,
2118 bool IsKill) const {
// Loads or stores the spill builder's temporary VGPR (SB.TmpVGPR) from/to the
// stack object Index, delegating the instruction emission to
// buildSpillLoadStore.
//
// Offset - element index; it is multiplied by SB.EltSize to form the offset
//          passed to buildSpillLoadStore.
// IsLoad - true to reload SB.TmpVGPR, false to store it.
// IsKill - forwarded only for stores; loads always pass false.
//
// NOTE(review): this listing omits some original lines (gaps in the embedded
// line numbers at 2116 and 2129-2131); the declarations of SB, Index and MMO
// are among the lines that are not visible.
2119 // Load/store VGPR
2120 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// Slots with the SGPRSpill stack ID must not reach this memory path.
2121 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
2122
// Fixed objects are addressed through the base register when the function has
// a base pointer; everything else goes through the frame register.
2123 Register FrameReg =
2124 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
2125 ? getBaseRegister()
2126 : getFrameRegister(SB.MF);
2127
2128 Align Alignment = FrameInfo.getObjectAlign(Index);
2132 SB.EltSize, Alignment);
2133
// The opcode is the flat-scratch form when flat scratch is enabled, otherwise
// the buffer (MUBUF) form.
2134 if (IsLoad) {
2135 unsigned Opc = ST.hasFlatScratchEnabled()
2136 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2137 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2138 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
2139 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2140 } else {
2141 unsigned Opc = ST.hasFlatScratchEnabled()
2142 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2143 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2144 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2145 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2146 // This only ever adds one VGPR spill
2147 SB.MFI.addToSpilledVGPRs(1);
2148 }
2149}
2150
2152 RegScavenger *RS, SlotIndexes *Indexes,
2153 LiveIntervals *LIS, bool OnlyToVGPR,
2154 bool SpillToPhysVGPRLane, bool NeedsCFI) const {
// Lowers the SGPR spill pseudo MI for frame index Index.
//
// Two strategies are implemented:
//  * VGPR-lane spill: each 32-bit sub-register is written to a pre-assigned
//    (VGPR, lane) pair with SI_SPILL_S32_TO_VGPR.
//  * Memory spill: sub-registers are packed into lanes of SB.TmpVGPR, which
//    is then written out through SB.readWriteTmpVGPR.
//
// Indexes             - if non-null, slot index maps are updated for the new
//                       instructions.
// OnlyToVGPR          - if true and no VGPR lanes are assigned to Index, the
//                       function returns false early.
// SpillToPhysVGPRLane - selects the physical-VGPR lane table for Index.
// NeedsCFI            - request CFI for the emitted spill.
//
// Returns true after MI has been erased; false only on the OnlyToVGPR
// early-out.
//
// NOTE(review): this listing omits some original lines (gaps in the embedded
// line numbers at 2151, 2162, 2304 and 2307). That includes the start of the
// signature, the second arm of the VGPRSpills conditional and the statement
// guarded by the trailing `if (LIS)`.
2155 assert(!MI->getOperand(0).isUndef() &&
2156 "undef spill should have been deleted earlier");
2157
2158 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2159
2160 ArrayRef<SpilledReg> VGPRSpills =
2161 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2163 bool SpillToVGPR = !VGPRSpills.empty();
2164 if (OnlyToVGPR && !SpillToVGPR)
2165 return false;
2166
2167 const SIFrameLowering *TFL = ST.getFrameLowering();
2168
// The memory path must not be used for the stack pointer offset register or
// the frame offset register.
2169 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2170 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2171
2172 if (SpillToVGPR) {
2173
2174 // Since stack slot coloring pass is trying to optimize SGPR spills,
2175 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
2176 // spills of different sizes. This accounts for number of VGPR lanes allotted
2177 // equal to the largest SGPR being spilled in them.
2178 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2179 "Num of SGPRs spilled should be less than or equal to num of "
2180 "the VGPR lanes.");
2181
2182 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2183 Register SubReg =
2184 SB.NumSubRegs == 1
2185 ? SB.SuperReg
2186 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2187 SpilledReg Spill = VGPRSpills[i];
2188
2189 bool IsFirstSubreg = i == 0;
2190 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the final sub-register write may carry the kill flag.
2191 bool UseKill = SB.IsKill && IsLastSubreg;
2192
2193
2194 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2195 // spill to this specific vgpr in the first basic block.
// NOTE(review): the Spill.VGPR input operand below is added without any
// Undef flag, so the comment above may be stale. Confirm.
2196 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2197 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2198 .addReg(SubReg, getKillRegState(UseKill))
2199 .addImm(Spill.Lane)
2200 .addReg(Spill.VGPR);
2201
// For the return address register a single CFI entry for PC_REG is emitted
// on the last sub-register, covering all of VGPRSpills. Any other register
// gets one CFI entry per sub-register.
2202 MachineInstr *CFI = nullptr;
2203 if (NeedsCFI) {
2204 if (SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2205 if (i == e - 1)
2206 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(),
2207 AMDGPU::PC_REG, VGPRSpills);
2208 } else {
2209 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(), SubReg,
2210 Spill.VGPR, Spill.Lane);
2211 }
2212 }
2213
// The first emitted instruction takes over MI's slot index; later
// instructions (and any CFI) are inserted as new entries.
2214 if (Indexes) {
2215 if (IsFirstSubreg)
2216 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2217 else
2218 Indexes->insertMachineInstrInMaps(*MIB);
2219
2220 if (CFI)
2221 Indexes->insertMachineInstrInMaps(*CFI);
2222 }
2223
2224 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2225 // We may be spilling a super-register which is only partially defined,
2226 // and need to ensure later spills think the value is defined.
2227 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2228 }
2229
// Implicit use of the super-register on the first and last writes only; the
// kill flag can only be set on the last one (see UseKill).
2230 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2231 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2232
2233 // FIXME: Since this spills to another register instead of an actual
2234 // frame index, we should delete the frame index when all references to
2235 // it are fixed.
2236 }
2237 } else {
2238 SB.prepare();
2239
2240 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2241 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2242
2243 // Per VGPR helper data
2244 auto PVD = SB.getPerVGPRData();
2245
2246 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each temporary VGPR reads it as undef; the flag is
// cleared after that first write.
2247 RegState TmpVGPRFlags = RegState::Undef;
2248
2249 // Write sub registers into the VGPR
2250 for (unsigned i = Offset * PVD.PerVGPR,
2251 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2252 i < e; ++i) {
2253 Register SubReg =
2254 SB.NumSubRegs == 1
2255 ? SB.SuperReg
2256 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2257
2258 MachineInstrBuilder WriteLane =
2259 BuildMI(*SB.MBB, MI, SB.DL,
2260 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2261 .addReg(SubReg, SubKillState)
2262 .addImm(i % PVD.PerVGPR)
2263 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2264 TmpVGPRFlags = {};
2265
2266 if (Indexes) {
2267 if (i == 0)
2268 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2269 else
2270 Indexes->insertMachineInstrInMaps(*WriteLane);
2271 }
2272
2273 // There could be undef components of a spilled super register.
2274 // TODO: Can we detect this and skip the spill?
2275 if (SB.NumSubRegs > 1) {
2276 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2277 RegState SuperKillState = {};
2278 if (i + 1 == SB.NumSubRegs)
2279 SuperKillState |= getKillRegState(SB.IsKill);
2280 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2281 }
2282 }
2283
2284 // Write out VGPR
2285 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2286
2287 // TODO: Implement CFI for SpillToVMEM for all scenarios.
// On this path CFI is only produced for the return address register.
2288 MachineInstr *CFI = nullptr;
2289 if (NeedsCFI && SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2290 int64_t CFIOffset = (Offset * SB.EltSize +
2291 SB.MF.getFrameInfo().getObjectOffset(Index)) *
2292 ST.getWavefrontSize();
2293 CFI = TFL->buildCFIForSGPRToVMEMSpill(*SB.MBB, MI, DebugLoc(),
2294 AMDGPU::PC_REG, CFIOffset);
2295 }
2296 if (Indexes && CFI)
2297 Indexes->insertMachineInstrInMaps(*CFI);
2298 }
2299
2300 SB.restore();
2301 }
2302
// The pseudo has been fully expanded at this point.
2303 MI->eraseFromParent();
2305
2306 if (LIS)
2308
2309 return true;
2310}
2311
2313 RegScavenger *RS, SlotIndexes *Indexes,
2314 LiveIntervals *LIS, bool OnlyToVGPR,
2315 bool SpillToPhysVGPRLane) const {
// Lowers the SGPR restore pseudo MI for frame index Index. Mirrors spillSGPR:
// either each 32-bit sub-register is read back from its assigned (VGPR, lane)
// pair, or SB.TmpVGPR is reloaded through SB.readWriteTmpVGPR and unpacked
// lane by lane.
//
// Indexes             - if non-null, slot index maps are updated; here the
//                       *last* emitted instruction takes over MI's slot.
// OnlyToVGPR          - if true and no VGPR lanes are assigned to Index, the
//                       function returns false early.
// SpillToPhysVGPRLane - selects the physical-VGPR lane table for Index.
//
// Returns true after MI has been erased; false only on the OnlyToVGPR
// early-out.
//
// NOTE(review): this listing omits some original lines (gaps in the embedded
// line numbers at 2312, 2320, 2338, 2371 and 2387). That includes the start
// of the signature and the statements guarded by
// `if (SB.NumSubRegs > 1 && i == 0)` and by the trailing `if (LIS)`.
2316 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2317
2318 ArrayRef<SpilledReg> VGPRSpills =
2319 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2321 bool SpillToVGPR = !VGPRSpills.empty();
2322 if (OnlyToVGPR && !SpillToVGPR)
2323 return false;
2324
2325 if (SpillToVGPR) {
2326 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2327 Register SubReg =
2328 SB.NumSubRegs == 1
2329 ? SB.SuperReg
2330 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2331
2332 SpilledReg Spill = VGPRSpills[i];
2333 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2334 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2335 .addReg(Spill.VGPR)
2336 .addImm(Spill.Lane);
2337 if (SB.NumSubRegs > 1 && i == 0)
2339 if (Indexes) {
2340 if (i == e - 1)
2341 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2342 else
2343 Indexes->insertMachineInstrInMaps(*MIB);
2344 }
2345 }
2346 } else {
2347 SB.prepare();
2348
2349 // Per VGPR helper data
2350 auto PVD = SB.getPerVGPRData();
2351
2352 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2353 // Load in VGPR data
2354 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2355
2356 // Unpack lanes
2357 for (unsigned i = Offset * PVD.PerVGPR,
2358 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2359 i < e; ++i) {
2360 Register SubReg =
2361 SB.NumSubRegs == 1
2362 ? SB.SuperReg
2363 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2364
// SB.TmpVGPR is killed by the last read of the current chunk.
2365 bool LastSubReg = (i + 1 == e);
// NOTE(review): the lane operand is `i`, whereas spillSGPR writes lane
// `i % PVD.PerVGPR`. The two agree only while Offset == 0. Confirm that
// PVD.NumVGPRs cannot exceed 1 here.
2366 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2367 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2368 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2369 .addImm(i);
2370 if (SB.NumSubRegs > 1 && i == 0)
2372 if (Indexes) {
2373 if (i == e - 1)
2374 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2375 else
2376 Indexes->insertMachineInstrInMaps(*MIB);
2377 }
2378 }
2379 }
2380
2381 SB.restore();
2382 }
2383
// The pseudo has been fully expanded at this point.
2384 MI->eraseFromParent();
2385
2386 if (LIS)
2388
2389 return true;
2390}
2391
2393 MachineBasicBlock &RestoreMBB,
2394 Register SGPR, RegScavenger *RS) const {
// Saves SGPR into lanes of the spill builder's temporary VGPR at MI, and emits
// the matching lane reads at the end of RestoreMBB. The VGPR is never written
// to or read from memory here, and plain V_WRITELANE_B32 / V_READLANE_B32 are
// used rather than the SI_SPILL_S32_TO_VGPR / SI_RESTORE_S32_FROM_VGPR
// pseudos.
//
// RestoreMBB - block whose end receives the restore sequence.
// SGPR       - register to preserve (may be a multi-dword tuple; it is split
//              through SB.SplitParts).
//
// Always returns false.
//
// NOTE(review): this listing omits some original lines (gaps in the embedded
// line numbers at 2392, 2454 and 2459). That includes the start of the
// signature, which declares MI, and the statement guarded by
// `if (SB.NumSubRegs > 1 && i == 0)`.
2395 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2396 RS);
2397 SB.prepare();
2398 // Generate the spill of SGPR to SB.TmpVGPR.
2399 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2400 auto PVD = SB.getPerVGPRData();
2401 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each temporary VGPR reads it as undef; the flag is
// cleared after that first write.
2402 RegState TmpVGPRFlags = RegState::Undef;
2403 // Write sub registers into the VGPR
2404 for (unsigned i = Offset * PVD.PerVGPR,
2405 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2406 i < e; ++i) {
2407 Register SubReg =
2408 SB.NumSubRegs == 1
2409 ? SB.SuperReg
2410 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2411
2412 MachineInstrBuilder WriteLane =
2413 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2414 SB.TmpVGPR)
2415 .addReg(SubReg, SubKillState)
2416 .addImm(i % PVD.PerVGPR)
2417 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2418 TmpVGPRFlags = {};
2419 // There could be undef components of a spilled super register.
2420 // TODO: Can we detect this and skip the spill?
2421 if (SB.NumSubRegs > 1) {
2422 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2423 RegState SuperKillState = {};
2424 if (i + 1 == SB.NumSubRegs)
2425 SuperKillState |= getKillRegState(SB.IsKill);
2426 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2427 }
2428 }
2429 // Don't need to write VGPR out.
2430 }
2431
2432 // Restore clobbered registers in the specified restore block.
// From here on both MI and the builder's insertion point refer to the end of
// RestoreMBB.
2433 MI = RestoreMBB.end();
2434 SB.setMI(&RestoreMBB, MI);
2435 // Generate the restore of SGPR from SB.TmpVGPR.
2436 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2437 // Don't need to load VGPR in.
2438 // Unpack lanes
2439 for (unsigned i = Offset * PVD.PerVGPR,
2440 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2441 i < e; ++i) {
2442 Register SubReg =
2443 SB.NumSubRegs == 1
2444 ? SB.SuperReg
2445 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2446
2447 assert(SubReg.isPhysical());
2448 bool LastSubReg = (i + 1 == e);
// NOTE(review): the lane operand is `i`, whereas the write loop above uses
// `i % PVD.PerVGPR`. The two agree only while Offset == 0. Confirm that
// PVD.NumVGPRs cannot exceed 1 here.
2449 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2450 SubReg)
2451 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2452 .addImm(i);
2453 if (SB.NumSubRegs > 1 && i == 0)
2455 }
2456 }
2457 SB.restore();
2458
2460 return false;
2461}
2462
2463/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2464/// a VGPR and the stack slot can be safely eliminated when all other users are
2465/// handled.
2468 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
// Dispatches on MI's opcode:
//  - SI_SPILL_S*_CFI_SAVE sets NeedsCFI and falls through to the plain SAVE
//    handling;
//  - SI_SPILL_S*_SAVE forwards to spillSGPR;
//  - SI_SPILL_S*_RESTORE forwards to restoreSGPR.
// Both helpers are called with OnlyToVGPR == true, so they return false when
// no VGPR lanes are assigned to the frame index. Any other opcode is a
// programming error (llvm_unreachable).
//
// NOTE(review): the leading signature lines (which declare MI, FI and RS) are
// missing from this listing (gap in the embedded line numbers at 2466-2467).
2469 bool NeedsCFI = false;
2470 switch (MI->getOpcode()) {
// NOTE(review): there are no CFI_SAVE cases for the 288/320/352/384-bit sizes
// that the plain SAVE list below has. Presumably those opcodes do not exist;
// confirm.
2471 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2472 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2473 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2474 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2475 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2476 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2477 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2478 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2479 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2480 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
2481 NeedsCFI = true;
2482 [[fallthrough]];
2483 case AMDGPU::SI_SPILL_S1024_SAVE:
2484 case AMDGPU::SI_SPILL_S512_SAVE:
2485 case AMDGPU::SI_SPILL_S384_SAVE:
2486 case AMDGPU::SI_SPILL_S352_SAVE:
2487 case AMDGPU::SI_SPILL_S320_SAVE:
2488 case AMDGPU::SI_SPILL_S288_SAVE:
2489 case AMDGPU::SI_SPILL_S256_SAVE:
2490 case AMDGPU::SI_SPILL_S224_SAVE:
2491 case AMDGPU::SI_SPILL_S192_SAVE:
2492 case AMDGPU::SI_SPILL_S160_SAVE:
2493 case AMDGPU::SI_SPILL_S128_SAVE:
2494 case AMDGPU::SI_SPILL_S96_SAVE:
2495 case AMDGPU::SI_SPILL_S64_SAVE:
2496 case AMDGPU::SI_SPILL_S32_SAVE:
// The literal `true` is the OnlyToVGPR argument.
2497 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane,
2498 NeedsCFI);
2499 case AMDGPU::SI_SPILL_S1024_RESTORE:
2500 case AMDGPU::SI_SPILL_S512_RESTORE:
2501 case AMDGPU::SI_SPILL_S384_RESTORE:
2502 case AMDGPU::SI_SPILL_S352_RESTORE:
2503 case AMDGPU::SI_SPILL_S320_RESTORE:
2504 case AMDGPU::SI_SPILL_S288_RESTORE:
2505 case AMDGPU::SI_SPILL_S256_RESTORE:
2506 case AMDGPU::SI_SPILL_S224_RESTORE:
2507 case AMDGPU::SI_SPILL_S192_RESTORE:
2508 case AMDGPU::SI_SPILL_S160_RESTORE:
2509 case AMDGPU::SI_SPILL_S128_RESTORE:
2510 case AMDGPU::SI_SPILL_S96_RESTORE:
2511 case AMDGPU::SI_SPILL_S64_RESTORE:
2512 case AMDGPU::SI_SPILL_S32_RESTORE:
// The literal `true` is the OnlyToVGPR argument.
2513 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2514 default:
2515 llvm_unreachable("not an SGPR spill instruction");
2516 }
2517}
2518
2520 int SPAdj, unsigned FIOperandNum,
2521 RegScavenger *RS) const {
2522 MachineFunction *MF = MI->getMF();
2523 MachineBasicBlock *MBB = MI->getParent();
2525 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2526 const SIInstrInfo *TII = ST.getInstrInfo();
2527 const DebugLoc &DL = MI->getDebugLoc();
2528
2529 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2530
2532 "unreserved scratch RSRC register");
2533
2534 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2535 int Index = MI->getOperand(FIOperandNum).getIndex();
2536
2537 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2538 ? getBaseRegister()
2539 : getFrameRegister(*MF);
2540
2541 bool NeedsCFI = false;
2542
2543 switch (MI->getOpcode()) {
2544 // SGPR register spill
2545 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2546 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2547 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2548 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2549 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2550 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2551 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2552 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2553 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2554 case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
2555 NeedsCFI = true;
2556 [[fallthrough]];
2557 }
2558 case AMDGPU::SI_SPILL_S1024_SAVE:
2559 case AMDGPU::SI_SPILL_S512_SAVE:
2560 case AMDGPU::SI_SPILL_S384_SAVE:
2561 case AMDGPU::SI_SPILL_S352_SAVE:
2562 case AMDGPU::SI_SPILL_S320_SAVE:
2563 case AMDGPU::SI_SPILL_S288_SAVE:
2564 case AMDGPU::SI_SPILL_S256_SAVE:
2565 case AMDGPU::SI_SPILL_S224_SAVE:
2566 case AMDGPU::SI_SPILL_S192_SAVE:
2567 case AMDGPU::SI_SPILL_S160_SAVE:
2568 case AMDGPU::SI_SPILL_S128_SAVE:
2569 case AMDGPU::SI_SPILL_S96_SAVE:
2570 case AMDGPU::SI_SPILL_S64_SAVE:
2571 case AMDGPU::SI_SPILL_S32_SAVE: {
2572 return spillSGPR(MI, Index, RS, nullptr, nullptr,
2573 FrameInfo.getStackID(Index) == TargetStackID::SGPRSpill,
2574 false, NeedsCFI);
2575 }
2576
2577 // SGPR register restore
2578 case AMDGPU::SI_SPILL_S1024_RESTORE:
2579 case AMDGPU::SI_SPILL_S512_RESTORE:
2580 case AMDGPU::SI_SPILL_S384_RESTORE:
2581 case AMDGPU::SI_SPILL_S352_RESTORE:
2582 case AMDGPU::SI_SPILL_S320_RESTORE:
2583 case AMDGPU::SI_SPILL_S288_RESTORE:
2584 case AMDGPU::SI_SPILL_S256_RESTORE:
2585 case AMDGPU::SI_SPILL_S224_RESTORE:
2586 case AMDGPU::SI_SPILL_S192_RESTORE:
2587 case AMDGPU::SI_SPILL_S160_RESTORE:
2588 case AMDGPU::SI_SPILL_S128_RESTORE:
2589 case AMDGPU::SI_SPILL_S96_RESTORE:
2590 case AMDGPU::SI_SPILL_S64_RESTORE:
2591 case AMDGPU::SI_SPILL_S32_RESTORE: {
2592 return restoreSGPR(MI, Index, RS, nullptr, nullptr,
2593 FrameInfo.getStackID(Index) ==
2595 }
2596
2597 // VGPR register spill
2598 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
2599 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
2600 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
2601 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
2602 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
2603 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
2604 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
2605 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
2606 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
2607 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
2608 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
2609 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
2610 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
2611 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
2612 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
2613 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
2614 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
2615 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
2616 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
2617 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
2618 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
2619 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
2620 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
2621 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
2622 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
2623 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
2624 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
2625 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
2626 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
2627 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
2628 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
2629 NeedsCFI = true;
2630 [[fallthrough]];
2631 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
2632 case AMDGPU::SI_SPILL_V1024_SAVE:
2633 case AMDGPU::SI_SPILL_V512_SAVE:
2634 case AMDGPU::SI_SPILL_V384_SAVE:
2635 case AMDGPU::SI_SPILL_V352_SAVE:
2636 case AMDGPU::SI_SPILL_V320_SAVE:
2637 case AMDGPU::SI_SPILL_V288_SAVE:
2638 case AMDGPU::SI_SPILL_V256_SAVE:
2639 case AMDGPU::SI_SPILL_V224_SAVE:
2640 case AMDGPU::SI_SPILL_V192_SAVE:
2641 case AMDGPU::SI_SPILL_V160_SAVE:
2642 case AMDGPU::SI_SPILL_V128_SAVE:
2643 case AMDGPU::SI_SPILL_V96_SAVE:
2644 case AMDGPU::SI_SPILL_V64_SAVE:
2645 case AMDGPU::SI_SPILL_V32_SAVE:
2646 case AMDGPU::SI_SPILL_V16_SAVE:
2647 case AMDGPU::SI_SPILL_A1024_SAVE:
2648 case AMDGPU::SI_SPILL_A512_SAVE:
2649 case AMDGPU::SI_SPILL_A384_SAVE:
2650 case AMDGPU::SI_SPILL_A352_SAVE:
2651 case AMDGPU::SI_SPILL_A320_SAVE:
2652 case AMDGPU::SI_SPILL_A288_SAVE:
2653 case AMDGPU::SI_SPILL_A256_SAVE:
2654 case AMDGPU::SI_SPILL_A224_SAVE:
2655 case AMDGPU::SI_SPILL_A192_SAVE:
2656 case AMDGPU::SI_SPILL_A160_SAVE:
2657 case AMDGPU::SI_SPILL_A128_SAVE:
2658 case AMDGPU::SI_SPILL_A96_SAVE:
2659 case AMDGPU::SI_SPILL_A64_SAVE:
2660 case AMDGPU::SI_SPILL_A32_SAVE:
2661 case AMDGPU::SI_SPILL_AV1024_SAVE:
2662 case AMDGPU::SI_SPILL_AV512_SAVE:
2663 case AMDGPU::SI_SPILL_AV384_SAVE:
2664 case AMDGPU::SI_SPILL_AV352_SAVE:
2665 case AMDGPU::SI_SPILL_AV320_SAVE:
2666 case AMDGPU::SI_SPILL_AV288_SAVE:
2667 case AMDGPU::SI_SPILL_AV256_SAVE:
2668 case AMDGPU::SI_SPILL_AV224_SAVE:
2669 case AMDGPU::SI_SPILL_AV192_SAVE:
2670 case AMDGPU::SI_SPILL_AV160_SAVE:
2671 case AMDGPU::SI_SPILL_AV128_SAVE:
2672 case AMDGPU::SI_SPILL_AV96_SAVE:
2673 case AMDGPU::SI_SPILL_AV64_SAVE:
2674 case AMDGPU::SI_SPILL_AV32_SAVE:
2675 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2676 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2677 assert(
2678 MI->getOpcode() != AMDGPU::SI_BLOCK_SPILL_V1024_SAVE &&
2679 "block spill does not currenty support spilling non-CSR registers");
2680
2681 if (MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE)
2682 // Put mask into M0.
2683 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2684 AMDGPU::M0)
2685 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2686
2687 const MachineOperand *VData = TII->getNamedOperand(*MI,
2688 AMDGPU::OpName::vdata);
2689 if (VData->isUndef()) {
2690 MI->eraseFromParent();
2691 return true;
2692 }
2693
2694 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2695 MFI->getStackPtrOffsetReg());
2696
2697 unsigned Opc;
2698 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2699 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2700 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2701 } else {
2702 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE
2703 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2704 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2705 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2706 }
2707
2708 auto *MBB = MI->getParent();
2709 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2710 if (IsWWMRegSpill) {
2711 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2712 RS->isRegUsed(AMDGPU::SCC));
2713 }
2715 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2716 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2717 *MI->memoperands_begin(), RS, nullptr, NeedsCFI);
2719 if (IsWWMRegSpill)
2720 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2721
2722 MI->eraseFromParent();
2723 return true;
2724 }
2725 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2726 // Put mask into M0.
2727 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2728 AMDGPU::M0)
2729 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2730 [[fallthrough]];
2731 }
2732 case AMDGPU::SI_SPILL_V16_RESTORE:
2733 case AMDGPU::SI_SPILL_V32_RESTORE:
2734 case AMDGPU::SI_SPILL_V64_RESTORE:
2735 case AMDGPU::SI_SPILL_V96_RESTORE:
2736 case AMDGPU::SI_SPILL_V128_RESTORE:
2737 case AMDGPU::SI_SPILL_V160_RESTORE:
2738 case AMDGPU::SI_SPILL_V192_RESTORE:
2739 case AMDGPU::SI_SPILL_V224_RESTORE:
2740 case AMDGPU::SI_SPILL_V256_RESTORE:
2741 case AMDGPU::SI_SPILL_V288_RESTORE:
2742 case AMDGPU::SI_SPILL_V320_RESTORE:
2743 case AMDGPU::SI_SPILL_V352_RESTORE:
2744 case AMDGPU::SI_SPILL_V384_RESTORE:
2745 case AMDGPU::SI_SPILL_V512_RESTORE:
2746 case AMDGPU::SI_SPILL_V1024_RESTORE:
2747 case AMDGPU::SI_SPILL_A32_RESTORE:
2748 case AMDGPU::SI_SPILL_A64_RESTORE:
2749 case AMDGPU::SI_SPILL_A96_RESTORE:
2750 case AMDGPU::SI_SPILL_A128_RESTORE:
2751 case AMDGPU::SI_SPILL_A160_RESTORE:
2752 case AMDGPU::SI_SPILL_A192_RESTORE:
2753 case AMDGPU::SI_SPILL_A224_RESTORE:
2754 case AMDGPU::SI_SPILL_A256_RESTORE:
2755 case AMDGPU::SI_SPILL_A288_RESTORE:
2756 case AMDGPU::SI_SPILL_A320_RESTORE:
2757 case AMDGPU::SI_SPILL_A352_RESTORE:
2758 case AMDGPU::SI_SPILL_A384_RESTORE:
2759 case AMDGPU::SI_SPILL_A512_RESTORE:
2760 case AMDGPU::SI_SPILL_A1024_RESTORE:
2761 case AMDGPU::SI_SPILL_AV32_RESTORE:
2762 case AMDGPU::SI_SPILL_AV64_RESTORE:
2763 case AMDGPU::SI_SPILL_AV96_RESTORE:
2764 case AMDGPU::SI_SPILL_AV128_RESTORE:
2765 case AMDGPU::SI_SPILL_AV160_RESTORE:
2766 case AMDGPU::SI_SPILL_AV192_RESTORE:
2767 case AMDGPU::SI_SPILL_AV224_RESTORE:
2768 case AMDGPU::SI_SPILL_AV256_RESTORE:
2769 case AMDGPU::SI_SPILL_AV288_RESTORE:
2770 case AMDGPU::SI_SPILL_AV320_RESTORE:
2771 case AMDGPU::SI_SPILL_AV352_RESTORE:
2772 case AMDGPU::SI_SPILL_AV384_RESTORE:
2773 case AMDGPU::SI_SPILL_AV512_RESTORE:
2774 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2775 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2776 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2777 const MachineOperand *VData = TII->getNamedOperand(*MI,
2778 AMDGPU::OpName::vdata);
2779 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2780 MFI->getStackPtrOffsetReg());
2781
2782 unsigned Opc;
2783 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2784 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2785 Opc = ST.d16PreservesUnusedBits()
2786 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2787 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2788 } else {
2789 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2790 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2791 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2792 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2793 }
2794
2795 auto *MBB = MI->getParent();
2796 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2797 if (IsWWMRegSpill) {
2798 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2799 RS->isRegUsed(AMDGPU::SCC));
2800 }
2801
2803 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2804 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2805 *MI->memoperands_begin(), RS);
2806
2807 if (IsWWMRegSpill)
2808 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2809
2810 MI->eraseFromParent();
2811 return true;
2812 }
2813 case AMDGPU::V_ADD_U32_e32:
2814 case AMDGPU::V_ADD_U32_e64:
2815 case AMDGPU::V_ADD_CO_U32_e32:
2816 case AMDGPU::V_ADD_CO_U32_e64: {
2817 // TODO: Handle sub, and, or.
2818 unsigned NumDefs = MI->getNumExplicitDefs();
2819 unsigned Src0Idx = NumDefs;
2820
2821 bool HasClamp = false;
2822 MachineOperand *VCCOp = nullptr;
2823
2824 switch (MI->getOpcode()) {
2825 case AMDGPU::V_ADD_U32_e32:
2826 break;
2827 case AMDGPU::V_ADD_U32_e64:
2828 HasClamp = MI->getOperand(3).getImm();
2829 break;
2830 case AMDGPU::V_ADD_CO_U32_e32:
2831 VCCOp = &MI->getOperand(3);
2832 break;
2833 case AMDGPU::V_ADD_CO_U32_e64:
2834 VCCOp = &MI->getOperand(1);
2835 HasClamp = MI->getOperand(4).getImm();
2836 break;
2837 default:
2838 break;
2839 }
2840 bool DeadVCC = !VCCOp || VCCOp->isDead();
2841 MachineOperand &DstOp = MI->getOperand(0);
2842 Register DstReg = DstOp.getReg();
2843
2844 unsigned OtherOpIdx =
2845 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2846 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2847
2848 unsigned Src1Idx = Src0Idx + 1;
2849 Register MaterializedReg = FrameReg;
2850 Register ScavengedVGPR;
2851
2852 int64_t Offset = FrameInfo.getObjectOffset(Index);
2853 // For the non-immediate case, we could fall through to the default
2854 // handling, but we do an in-place update of the result register here to
2855 // avoid scavenging another register.
2856 if (OtherOp->isImm()) {
2857 int64_t TotalOffset = OtherOp->getImm() + Offset;
2858
2859 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2860 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2861 // If we can't support a VOP3 literal in the VALU instruction, we
2862 // can't specially fold into the add.
2863 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2864 break;
2865 }
2866
2867 OtherOp->setImm(TotalOffset);
2868 Offset = 0;
2869 }
2870
2871 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2872 // We should just do an in-place update of the result register. However,
2873 // the value there may also be used by the add, in which case we need a
2874 // temporary register.
2875 //
2876 // FIXME: The scavenger is not finding the result register in the
2877 // common case where the add does not read the register.
2878
2879 ScavengedVGPR = RS->scavengeRegisterBackwards(
2880 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2881
2882 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2883 // shift.
2884 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2885 .addDef(ScavengedVGPR, RegState::Renamable)
2886 .addImm(ST.getWavefrontSizeLog2())
2887 .addReg(FrameReg);
2888 MaterializedReg = ScavengedVGPR;
2889 }
2890
2891 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2892 if (ST.hasFlatScratchEnabled() &&
2893 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2894 // We didn't need the shift above, so we have an SGPR for the frame
2895 // register, but may have a VGPR only operand.
2896 //
2897 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2898 // and use the higher constant bus restriction to avoid this copy.
2899
2900 if (!ScavengedVGPR) {
2901 ScavengedVGPR = RS->scavengeRegisterBackwards(
2902 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2903 /*SPAdj=*/0);
2904 }
2905
2906 assert(ScavengedVGPR != DstReg);
2907
2908 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2909 .addReg(MaterializedReg,
2910 getKillRegState(MaterializedReg != FrameReg));
2911 MaterializedReg = ScavengedVGPR;
2912 }
2913
2914 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2915 // is not live, we could use a scalar add + vector add instead of 2
2916 // vector adds.
2917 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2918 .addDef(DstReg, RegState::Renamable);
2919 if (NumDefs == 2)
2920 AddI32.add(MI->getOperand(1));
2921
2922 RegState MaterializedRegFlags =
2923 getKillRegState(MaterializedReg != FrameReg);
2924
2925 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2926 // If we know we have a VGPR already, it's more likely the other
2927 // operand is a legal vsrc0.
2928 AddI32
2929 .add(*OtherOp)
2930 .addReg(MaterializedReg, MaterializedRegFlags);
2931 } else {
2932 // Commute operands to avoid violating VOP2 restrictions. This will
2933 // typically happen when using scratch.
2934 AddI32
2935 .addReg(MaterializedReg, MaterializedRegFlags)
2936 .add(*OtherOp);
2937 }
2938
2939 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2940 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2941 AddI32.addImm(0); // clamp
2942
2943 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2944 AddI32.setOperandDead(3); // Dead vcc
2945
2946 MaterializedReg = DstReg;
2947
2948 OtherOp->ChangeToRegister(MaterializedReg, false);
2949 OtherOp->setIsKill(true);
2951 Offset = 0;
2952 } else if (Offset != 0) {
2953 assert(!MaterializedReg);
2955 Offset = 0;
2956 } else {
2957 if (DeadVCC && !HasClamp) {
2958 assert(Offset == 0);
2959
2960 // TODO: Losing kills and implicit operands. Just mutate to copy and
2961 // let lowerCopy deal with it?
2962 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2963 // Folded to an identity copy.
2964 MI->eraseFromParent();
2965 return true;
2966 }
2967
2968 // The immediate value should be in OtherOp
2969 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2970 MI->removeOperand(FIOperandNum);
2971
2972 unsigned NumOps = MI->getNumOperands();
2973 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2974 MI->removeOperand(I);
2975
2976 if (NumDefs == 2)
2977 MI->removeOperand(1);
2978
2979 // The code below can't deal with a mov.
2980 return true;
2981 }
2982
2983 // This folded to a constant, but we have to keep the add around for
2984 // pointless implicit defs or clamp modifier.
2985 FIOp->ChangeToImmediate(0);
2986 }
2987
2988 // Try to improve legality by commuting.
2989 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2990 std::swap(FIOp, OtherOp);
2991 std::swap(FIOperandNum, OtherOpIdx);
2992 }
2993
2994 // We need at most one mov to satisfy the operand constraints. Prefer to
2995 // move the FI operand first, as it may be a literal in a VOP3
2996 // instruction.
2997 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2998 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2999 // If commuting didn't make the operands legal, we need to materialize
3000 // in a register.
3001 // TODO: Can use SGPR on gfx10+ in some cases.
3002 if (!ScavengedVGPR) {
3003 ScavengedVGPR = RS->scavengeRegisterBackwards(
3004 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
3005 /*SPAdj=*/0);
3006 }
3007
3008 assert(ScavengedVGPR != DstReg);
3009
3010 MachineOperand &Src = MI->getOperand(SrcIdx);
3011 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
3012 .add(Src);
3013
3014 Src.ChangeToRegister(ScavengedVGPR, false);
3015 Src.setIsKill(true);
3016 break;
3017 }
3018 }
3019
3020 // Fold out add of 0 case that can appear in kernels.
3021 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
3022 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
3023 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
3024 }
3025
3026 MI->eraseFromParent();
3027 }
3028
3029 return true;
3030 }
3031 case AMDGPU::S_ADD_I32:
3032 case AMDGPU::S_ADD_U32: {
3033 // TODO: Handle s_or_b32, s_and_b32.
3034 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
3035 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
3036
3037 assert(FrameReg || MFI->isBottomOfStack());
3038
3039 MachineOperand &DstOp = MI->getOperand(0);
3040 const DebugLoc &DL = MI->getDebugLoc();
3041 Register MaterializedReg = FrameReg;
3042
3043 // Defend against live scc, which should never happen in practice.
3044 bool DeadSCC = MI->getOperand(3).isDead();
3045
3046 Register TmpReg;
3047
3048 // FIXME: Scavenger should figure out that the result register is
3049 // available. Also should do this for the v_add case.
3050 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
3051 TmpReg = DstOp.getReg();
3052
3053 if (FrameReg && !ST.hasFlatScratchEnabled()) {
3054 // FIXME: In the common case where the add does not also read its result
3055 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
3056 // available.
3057 if (!TmpReg)
3058 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3059 MI, /*RestoreAfter=*/false, 0,
3060 /*AllowSpill=*/false);
3061 if (TmpReg) {
3062 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
3063 .addDef(TmpReg, RegState::Renamable)
3064 .addReg(FrameReg)
3065 .addImm(ST.getWavefrontSizeLog2())
3066 .setOperandDead(3); // Set SCC dead
3067 }
3068 MaterializedReg = TmpReg;
3069 }
3070
3071 int64_t Offset = FrameInfo.getObjectOffset(Index);
3072
3073 // For the non-immediate case, we could fall through to the default
3074 // handling, but we do an in-place update of the result register here to
3075 // avoid scavenging another register.
3076 if (OtherOp.isImm()) {
3077 OtherOp.setImm(OtherOp.getImm() + Offset);
3078 Offset = 0;
3079
3080 if (MaterializedReg)
3081 FIOp->ChangeToRegister(MaterializedReg, false);
3082 else
3083 FIOp->ChangeToImmediate(0);
3084 } else if (MaterializedReg) {
3085 // If we can't fold the other operand, do another increment.
3086 Register DstReg = DstOp.getReg();
3087
3088 if (!TmpReg && MaterializedReg == FrameReg) {
3089 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3090 MI, /*RestoreAfter=*/false, 0,
3091 /*AllowSpill=*/false);
3092 DstReg = TmpReg;
3093 }
3094
3095 if (TmpReg) {
3096 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
3097 .addDef(DstReg, RegState::Renamable)
3098 .addReg(MaterializedReg, RegState::Kill)
3099 .add(OtherOp);
3100 if (DeadSCC)
3101 AddI32.setOperandDead(3);
3102
3103 MaterializedReg = DstReg;
3104
3105 OtherOp.ChangeToRegister(MaterializedReg, false);
3106 OtherOp.setIsKill(true);
3107 OtherOp.setIsRenamable(true);
3108 }
3110 } else {
3111 // If we don't have any other offset to apply, we can just directly
3112 // interpret the frame index as the offset.
3114 }
3115
3116 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
3117 assert(Offset == 0);
3118 MI->removeOperand(3);
3119 MI->removeOperand(OtherOpIdx);
3120 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3121 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
3122 assert(Offset == 0);
3123 MI->removeOperand(3);
3124 MI->removeOperand(FIOperandNum);
3125 MI->setDesc(
3126 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3127 }
3128
3129 assert(!FIOp->isFI());
3130 return true;
3131 }
3132 default: {
3133 break;
3134 }
3135 }
3136
3137 int64_t Offset = FrameInfo.getObjectOffset(Index);
3138 if (ST.hasFlatScratchEnabled()) {
3139 if (TII->isFLATScratch(*MI)) {
3140 assert(
3141 (int16_t)FIOperandNum ==
3142 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
3143
3144 // The offset is always swizzled, just replace it
3145 if (FrameReg)
3146 FIOp->ChangeToRegister(FrameReg, false);
3147
3149 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
3150 int64_t NewOffset = Offset + OffsetOp->getImm();
3151 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
3153 OffsetOp->setImm(NewOffset);
3154 if (FrameReg)
3155 return false;
3156 Offset = 0;
3157 }
3158
3159 if (!Offset) {
3160 unsigned Opc = MI->getOpcode();
3161 int NewOpc = -1;
3162 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
3164 } else if (ST.hasFlatScratchSTMode()) {
3165 // On GFX10 we have ST mode to use no registers for an address.
3166 // Otherwise we need to materialize 0 into an SGPR.
3168 }
3169
3170 if (NewOpc != -1) {
3171 // removeOperand doesn't fixup tied operand indexes as it goes, so
3172 // it asserts. Untie vdst_in for now and retie them afterwards.
3173 int VDstIn =
3174 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
3175 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
3176 MI->getOperand(VDstIn).isTied();
3177 if (TiedVDst)
3178 MI->untieRegOperand(VDstIn);
3179
3180 MI->removeOperand(
3181 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
3182
3183 if (TiedVDst) {
3184 int NewVDst =
3185 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
3186 int NewVDstIn =
3187 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
3188 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
3189 MI->tieOperands(NewVDst, NewVDstIn);
3190 }
3191 MI->setDesc(TII->get(NewOpc));
3192 return false;
3193 }
3194 }
3195 }
3196
3197 if (!FrameReg) {
3199 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
3200 return false;
3201 }
3202
3203 // We need to use register here. Check if we can use an SGPR or need
3204 // a VGPR.
3205 FIOp->ChangeToRegister(AMDGPU::M0, false);
3206 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
3207
3208 if (!Offset && FrameReg && UseSGPR) {
3209 FIOp->setReg(FrameReg);
3210 return false;
3211 }
3212
3213 const TargetRegisterClass *RC =
3214 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
3215
3216 Register TmpReg =
3217 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
3218 FIOp->setReg(TmpReg);
3219 FIOp->setIsKill();
3220
3221 if ((!FrameReg || !Offset) && TmpReg) {
3222 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3223 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3224 if (FrameReg)
3225 MIB.addReg(FrameReg);
3226 else
3227 MIB.addImm(Offset);
3228
3229 return false;
3230 }
3231
3232 bool NeedSaveSCC = (RS->isRegUsed(AMDGPU::SCC) &&
3233 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr)) ||
3234 MI->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3235
3236 Register TmpSReg =
3237 UseSGPR ? TmpReg
3238 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3239 MI, false, 0, !UseSGPR);
3240
3241 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
3242 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
3243 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
3244 Register TmpVGPR = RS->scavengeRegisterBackwards(
3245 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3246
3247 // Materialize the frame register.
3248 auto MIB =
3249 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3250 if (FrameReg)
3251 MIB.addReg(FrameReg);
3252 else
3253 MIB.addImm(Offset);
3254
3255 // Add the offset to the frame register.
3256 if (FrameReg && Offset)
3257 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3258 .addReg(FrameReg, RegState::Kill)
3259 .addImm(Offset);
3260
3261 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3262 .add(MI->getOperand(0)) // $vdata
3263 .addReg(TmpVGPR) // $vaddr
3264 .addImm(0) // Offset
3265 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3266 MI->eraseFromParent();
3267 return true;
3268 }
3269 report_fatal_error("Cannot scavenge register in FI elimination!");
3270 }
3271
3272 if (!TmpSReg) {
3273 // Use frame register and restore it after.
3274 TmpSReg = FrameReg;
3275 FIOp->setReg(FrameReg);
3276 FIOp->setIsKill(false);
3277 }
3278
3279 if (NeedSaveSCC) {
3280 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3281 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3282 .addReg(FrameReg)
3283 .addImm(Offset);
3284 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3285 .addReg(TmpSReg)
3286 .addImm(0);
3287 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3288 .addImm(0)
3289 .addReg(TmpSReg);
3290 } else {
3291 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3292 .addReg(FrameReg)
3293 .addImm(Offset);
3294 }
3295
3296 if (!UseSGPR)
3297 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3298 .addReg(TmpSReg, RegState::Kill);
3299
3300 if (TmpSReg == FrameReg) {
3301 // Undo frame register modification.
3302 if (NeedSaveSCC &&
3303 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3305 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3306 TmpSReg)
3307 .addReg(FrameReg)
3308 .addImm(-Offset);
3309 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3310 .addReg(TmpSReg)
3311 .addImm(0);
3312 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3313 TmpSReg)
3314 .addImm(0)
3315 .addReg(TmpSReg);
3316 } else {
3317 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3318 FrameReg)
3319 .addReg(FrameReg)
3320 .addImm(-Offset);
3321 }
3322 }
3323
3324 return false;
3325 }
3326
3327 bool IsMUBUF = TII->isMUBUF(*MI);
3328
3329 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3330 // Convert to a swizzled stack address by scaling by the wave size.
3331 // In an entry function/kernel the offset is already swizzled.
3332 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3333 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3334 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3335 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3336 ? &AMDGPU::SReg_32RegClass
3337 : &AMDGPU::VGPR_32RegClass;
3338 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3339 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3340 MI->getOpcode() == AMDGPU::S_MOV_B32;
3341 Register ResultReg =
3342 IsCopy ? MI->getOperand(0).getReg()
3343 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3344
3345 int64_t Offset = FrameInfo.getObjectOffset(Index);
3346 if (Offset == 0) {
3347 unsigned OpCode =
3348 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3349 Register TmpResultReg = ResultReg;
3350 if (IsSALU && LiveSCC) {
3351 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3352 MI, false, 0);
3353 }
3354
3355 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3356 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3357 // For V_LSHRREV, the operands are reversed (the shift count goes
3358 // first).
3359 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3360 else
3361 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3362 if (IsSALU && !LiveSCC)
3363 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3364 if (IsSALU && LiveSCC) {
3365 Register NewDest;
3366 if (IsCopy) {
3367 assert(ResultReg.isPhysical());
3368 NewDest = ResultReg;
3369 } else {
3370 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3371 Shift, false, 0);
3372 }
3373 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3374 .addReg(TmpResultReg);
3375 ResultReg = NewDest;
3376 }
3377 } else {
3379 if (!IsSALU) {
3380 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3381 nullptr) {
3382 // Reuse ResultReg in intermediate step.
3383 Register ScaledReg = ResultReg;
3384
3385 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3386 ScaledReg)
3387 .addImm(ST.getWavefrontSizeLog2())
3388 .addReg(FrameReg);
3389
3390 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3391
3392 // TODO: Fold if use instruction is another add of a constant.
3393 if (IsVOP2 ||
3394 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3395 // FIXME: This can fail
3396 MIB.addImm(Offset);
3397 MIB.addReg(ScaledReg, RegState::Kill);
3398 if (!IsVOP2)
3399 MIB.addImm(0); // clamp bit
3400 } else {
3401 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3402 "Need to reuse carry out register");
3403
3404 // Use scavenged unused carry out as offset register.
3405 Register ConstOffsetReg;
3406 if (!isWave32)
3407 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3408 else
3409 ConstOffsetReg = MIB.getReg(1);
3410
3411 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3412 ConstOffsetReg)
3413 .addImm(Offset);
3414 MIB.addReg(ConstOffsetReg, RegState::Kill);
3415 MIB.addReg(ScaledReg, RegState::Kill);
3416 MIB.addImm(0); // clamp bit
3417 }
3418 }
3419 }
3420 if (!MIB || IsSALU) {
3421 // We have to produce a carry out, and there isn't a free SGPR pair
3422 // for it. We can keep the whole computation on the SALU to avoid
3423 // clobbering an additional register at the cost of an extra mov.
3424
3425 // We may have 1 free scratch SGPR even though a carry out is
3426 // unavailable. Only one additional mov is needed.
3427 Register TmpScaledReg = IsCopy && IsSALU
3428 ? ResultReg
3429 : RS->scavengeRegisterBackwards(
3430 AMDGPU::SReg_32_XM0RegClass, MI,
3431 false, 0, /*AllowSpill=*/false);
3432 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3433 Register TmpResultReg = ScaledReg;
3434
3435 if (!LiveSCC) {
3436 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3437 .addReg(FrameReg)
3438 .addImm(ST.getWavefrontSizeLog2());
3439 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3440 .addReg(TmpResultReg, RegState::Kill)
3441 .addImm(Offset);
3442 } else {
3443 TmpResultReg = RS->scavengeRegisterBackwards(
3444 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3445
3447 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3448 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3449 TmpResultReg)
3450 .addImm(ST.getWavefrontSizeLog2())
3451 .addReg(FrameReg);
3452 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3453 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3454 .addImm(Offset);
3455 Add.addReg(ResultReg, RegState::Kill)
3456 .addReg(TmpResultReg, RegState::Kill)
3457 .addImm(0);
3458 } else
3459 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3460 } else {
3461 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3462 "offset is unsafe for v_mad_u32_u24");
3463
3464 // We start with a frame pointer with a wave space value, and
3465 // an offset in lane-space. We are materializing a lane space
3466 // value. We can either do a right shift of the frame pointer
3467 // to get to lane space, or a left shift of the offset to get
3468 // to wavespace. We can right shift after the computation to
3469 // get back to the desired per-lane value. We are using the
3470 // mad_u32_u24 primarily as an add with no carry out clobber.
3471 bool IsInlinableLiteral =
3472 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3473 if (!IsInlinableLiteral) {
3474 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3475 TmpResultReg)
3476 .addImm(Offset);
3477 }
3478
3479 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3480 TmpResultReg);
3481
3482 if (!IsInlinableLiteral) {
3483 Add.addReg(TmpResultReg, RegState::Kill);
3484 } else {
3485 // We fold the offset into mad itself if it's inlinable.
3486 Add.addImm(Offset);
3487 }
3488 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3489 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3490 TmpResultReg)
3491 .addImm(ST.getWavefrontSizeLog2())
3492 .addReg(TmpResultReg);
3493 }
3494
3495 Register NewDest;
3496 if (IsCopy) {
3497 NewDest = ResultReg;
3498 } else {
3499 NewDest = RS->scavengeRegisterBackwards(
3500 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3501 /*AllowSpill=*/true);
3502 }
3503
3504 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3505 NewDest)
3506 .addReg(TmpResultReg);
3507 ResultReg = NewDest;
3508 }
3509 if (!IsSALU)
3510 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3511 .addReg(TmpResultReg, RegState::Kill);
3512 // If there were truly no free SGPRs, we need to undo everything.
3513 if (!TmpScaledReg.isValid()) {
3514 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3515 .addReg(ScaledReg, RegState::Kill)
3516 .addImm(-Offset);
3517 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3518 .addReg(FrameReg)
3519 .addImm(ST.getWavefrontSizeLog2());
3520 }
3521 }
3522 }
3523
3524 // Don't introduce an extra copy if we're just materializing in a mov.
3525 if (IsCopy) {
3526 MI->eraseFromParent();
3527 return true;
3528 }
3529 FIOp->ChangeToRegister(ResultReg, false, false, true);
3530 return false;
3531 }
3532
3533 if (IsMUBUF) {
3534 // Disable offen so we don't need a 0 vgpr base.
3535 assert(
3536 static_cast<int>(FIOperandNum) ==
3537 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3538
3539 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3540 assert((SOffset.isImm() && SOffset.getImm() == 0));
3541
3542 if (FrameReg != AMDGPU::NoRegister)
3543 SOffset.ChangeToRegister(FrameReg, false);
3544
3545 int64_t Offset = FrameInfo.getObjectOffset(Index);
3546 int64_t OldImm =
3547 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3548 int64_t NewOffset = OldImm + Offset;
3549
3550 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3551 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3552 MI->eraseFromParent();
3553 return true;
3554 }
3555 }
3556
3557 // If the offset is simply too big, don't convert to a scratch wave offset
3558 // relative index.
3559
3561 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3562 Register TmpReg =
3563 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3564 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3565 .addImm(Offset);
3566 FIOp->ChangeToRegister(TmpReg, false, false, true);
3567 }
3568
3569 return false;
3570}
3571
3575
// Returns the encoding value of Reg masked with
// AMDGPU::HWEncoding::REG_IDX_MASK.
// NOTE(review): the declarator line of this definition is not present in this
// listing (the embedded numbering jumps from 3575 to 3577) - confirm the
// function name and signature against the original file.
3577 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3578 }
3579
// Returns the result of getRegBitWidth() applied to the ID of register class
// RC.
// NOTE(review): the declarator line of this definition is not present in this
// listing (the embedded numbering jumps from 3579 to 3581) - confirm the
// function name and signature against the original file.
3581 return getRegBitWidth(RC.getID());
3582 }
3583
// File-local helper: maps an exact bit width to the matching VReg_<width>
// register class (the variants without the _Align2 suffix). The listed widths
// are 64 through 384 in 32-bit steps, plus 512 and 1024. Any other width
// returns nullptr.
// NOTE(review): the declarator line (function name and parameter list) is
// missing from this listing between the return type and the body (embedded
// line 3585) - confirm it against the original file.
3584 static const TargetRegisterClass *
3586 if (BitWidth == 64)
3587 return &AMDGPU::VReg_64RegClass;
3588 if (BitWidth == 96)
3589 return &AMDGPU::VReg_96RegClass;
3590 if (BitWidth == 128)
3591 return &AMDGPU::VReg_128RegClass;
3592 if (BitWidth == 160)
3593 return &AMDGPU::VReg_160RegClass;
3594 if (BitWidth == 192)
3595 return &AMDGPU::VReg_192RegClass;
3596 if (BitWidth == 224)
3597 return &AMDGPU::VReg_224RegClass;
3598 if (BitWidth == 256)
3599 return &AMDGPU::VReg_256RegClass;
3600 if (BitWidth == 288)
3601 return &AMDGPU::VReg_288RegClass;
3602 if (BitWidth == 320)
3603 return &AMDGPU::VReg_320RegClass;
3604 if (BitWidth == 352)
3605 return &AMDGPU::VReg_352RegClass;
3606 if (BitWidth == 384)
3607 return &AMDGPU::VReg_384RegClass;
3608 if (BitWidth == 512)
3609 return &AMDGPU::VReg_512RegClass;
3610 if (BitWidth == 1024)
3611 return &AMDGPU::VReg_1024RegClass;
3612
// No listed class matches this width exactly.
3613 return nullptr;
3614 }
3615
// File-local helper: maps an exact bit width to the matching
// VReg_<width>_Align2 register class. The listed widths are 64 through 384 in
// 32-bit steps, plus 512 and 1024. Any other width returns nullptr.
// NOTE(review): the declarator line (function name and parameter list) is
// missing from this listing (embedded line 3617). Presumably this is the
// getAlignedVGPRClassForBitWidth helper that is called further down in this
// file - confirm against the original file.
3616 static const TargetRegisterClass *
3618 if (BitWidth == 64)
3619 return &AMDGPU::VReg_64_Align2RegClass;
3620 if (BitWidth == 96)
3621 return &AMDGPU::VReg_96_Align2RegClass;
3622 if (BitWidth == 128)
3623 return &AMDGPU::VReg_128_Align2RegClass;
3624 if (BitWidth == 160)
3625 return &AMDGPU::VReg_160_Align2RegClass;
3626 if (BitWidth == 192)
3627 return &AMDGPU::VReg_192_Align2RegClass;
3628 if (BitWidth == 224)
3629 return &AMDGPU::VReg_224_Align2RegClass;
3630 if (BitWidth == 256)
3631 return &AMDGPU::VReg_256_Align2RegClass;
3632 if (BitWidth == 288)
3633 return &AMDGPU::VReg_288_Align2RegClass;
3634 if (BitWidth == 320)
3635 return &AMDGPU::VReg_320_Align2RegClass;
3636 if (BitWidth == 352)
3637 return &AMDGPU::VReg_352_Align2RegClass;
3638 if (BitWidth == 384)
3639 return &AMDGPU::VReg_384_Align2RegClass;
3640 if (BitWidth == 512)
3641 return &AMDGPU::VReg_512_Align2RegClass;
3642 if (BitWidth == 1024)
3643 return &AMDGPU::VReg_1024_Align2RegClass;
3644
// No listed class matches this width exactly.
3645 return nullptr;
3646 }
3647
// Selects a VGPR register class for the given bit width. Widths 1, 16 and 32
// map directly to VReg_1, VGPR_16 and VGPR_32. For every other width the
// result depends on ST.needsAlignedVGPRs(): when it is true the lookup is
// delegated to getAlignedVGPRClassForBitWidth().
// NOTE(review): two lines are missing from this listing - the declarator
// (embedded line 3649) and the ':' arm of the conditional expression
// (embedded line 3657). Confirm both against the original file.
3648 const TargetRegisterClass *
3650 if (BitWidth == 1)
3651 return &AMDGPU::VReg_1RegClass;
3652 if (BitWidth == 16)
3653 return &AMDGPU::VGPR_16RegClass;
3654 if (BitWidth == 32)
3655 return &AMDGPU::VGPR_32RegClass;
3656 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3658 }
3659
// Selects a "Lo256" VGPR register class for the given bit width. Unlike the
// exact-match lookups nearby, the comparisons here use <=, so the result is
// the first listed class whose width is at least BitWidth: VGPR_32_Lo256 for
// widths up to 32, then VReg_<width>_Lo256_Align2 for 64 through 384 in 32-bit
// steps, 512 and 1024. Widths above 1024 return nullptr.
// NOTE(review): the declarator line (function name and parameter list) is
// missing from this listing (embedded line 3661) - confirm it against the
// original file.
3660 const TargetRegisterClass *
3662 if (BitWidth <= 32)
3663 return &AMDGPU::VGPR_32_Lo256RegClass;
3664 if (BitWidth <= 64)
3665 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3666 if (BitWidth <= 96)
3667 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3668 if (BitWidth <= 128)
3669 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3670 if (BitWidth <= 160)
3671 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3672 if (BitWidth <= 192)
3673 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3674 if (BitWidth <= 224)
3675 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3676 if (BitWidth <= 256)
3677 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3678 if (BitWidth <= 288)
3679 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3680 if (BitWidth <= 320)
3681 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3682 if (BitWidth <= 352)
3683 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3684 if (BitWidth <= 384)
3685 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3686 if (BitWidth <= 512)
3687 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3688 if (BitWidth <= 1024)
3689 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3690
// The requested width exceeds the largest listed class.
3691 return nullptr;
3692 }
3693
// File-local helper: maps an exact bit width to the matching AReg_<width>
// register class (the variants without the _Align2 suffix). The listed widths
// are 64 through 384 in 32-bit steps, plus 512 and 1024. Any other width
// returns nullptr.
// NOTE(review): the declarator line (function name and parameter list) is
// missing from this listing between the return type and the body (embedded
// line 3695) - confirm it against the original file.
3694 static const TargetRegisterClass *
3696 if (BitWidth == 64)
3697 return &AMDGPU::AReg_64RegClass;
3698 if (BitWidth == 96)
3699 return &AMDGPU::AReg_96RegClass;
3700 if (BitWidth == 128)
3701 return &AMDGPU::AReg_128RegClass;
3702 if (BitWidth == 160)
3703 return &AMDGPU::AReg_160RegClass;
3704 if (BitWidth == 192)
3705 return &AMDGPU::AReg_192RegClass;
3706 if (BitWidth == 224)
3707 return &AMDGPU::AReg_224RegClass;
3708 if (BitWidth == 256)
3709 return &AMDGPU::AReg_256RegClass;
3710 if (BitWidth == 288)
3711 return &AMDGPU::AReg_288RegClass;
3712 if (BitWidth == 320)
3713 return &AMDGPU::AReg_320RegClass;
3714 if (BitWidth == 352)
3715 return &AMDGPU::AReg_352RegClass;
3716 if (BitWidth == 384)
3717 return &AMDGPU::AReg_384RegClass;
3718 if (BitWidth == 512)
3719 return &AMDGPU::AReg_512RegClass;
3720 if (BitWidth == 1024)
3721 return &AMDGPU::AReg_1024RegClass;
3722
// No listed class matches this width exactly.
3723 return nullptr;
3724 }
3725
// File-local helper: maps an exact bit width to the matching
// AReg_<width>_Align2 register class. The listed widths are 64 through 384 in
// 32-bit steps, plus 512 and 1024. Any other width returns nullptr.
// NOTE(review): the declarator line (function name and parameter list) is
// missing from this listing (embedded line 3727). Presumably this is the
// getAlignedAGPRClassForBitWidth helper that is called further down in this
// file - confirm against the original file.
3726 static const TargetRegisterClass *
3728 if (BitWidth == 64)
3729 return &AMDGPU::AReg_64_Align2RegClass;
3730 if (BitWidth == 96)
3731 return &AMDGPU::AReg_96_Align2RegClass;
3732 if (BitWidth == 128)
3733 return &AMDGPU::AReg_128_Align2RegClass;
3734 if (BitWidth == 160)
3735 return &AMDGPU::AReg_160_Align2RegClass;
3736 if (BitWidth == 192)
3737 return &AMDGPU::AReg_192_Align2RegClass;
3738 if (BitWidth == 224)
3739 return &AMDGPU::AReg_224_Align2RegClass;
3740 if (BitWidth == 256)
3741 return &AMDGPU::AReg_256_Align2RegClass;
3742 if (BitWidth == 288)
3743 return &AMDGPU::AReg_288_Align2RegClass;
3744 if (BitWidth == 320)
3745 return &AMDGPU::AReg_320_Align2RegClass;
3746 if (BitWidth == 352)
3747 return &AMDGPU::AReg_352_Align2RegClass;
3748 if (BitWidth == 384)
3749 return &AMDGPU::AReg_384_Align2RegClass;
3750 if (BitWidth == 512)
3751 return &AMDGPU::AReg_512_Align2RegClass;
3752 if (BitWidth == 1024)
3753 return &AMDGPU::AReg_1024_Align2RegClass;
3754
// No listed class matches this width exactly.
3755 return nullptr;
3756 }
3757
/// Returns the AGPR register class for a given width in bits.
///
/// 16- and 32-bit widths map directly to AGPR_LO16 and AGPR_32. For any other
/// width the choice of lookup helper depends on ST.needsAlignedVGPRs().
3758const TargetRegisterClass *
3760 if (BitWidth == 16)
3761 return &AMDGPU::AGPR_LO16RegClass;
3762 if (BitWidth == 32)
3763 return &AMDGPU::AGPR_32RegClass;
// Wider widths are tuples: use the aligned lookup when the subtarget requires
// aligned VGPRs.
3764 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3766}
3767
/// Maps a register width in bits to the plain AV_<N> register class of that
/// width (the classes without the _Align2 suffix).
///
/// Every comparison is an exact match, so only the widths listed below are
/// recognised; any other BitWidth falls through to the final nullptr.
3768static const TargetRegisterClass *
3770 if (BitWidth == 64)
3771 return &AMDGPU::AV_64RegClass;
3772 if (BitWidth == 96)
3773 return &AMDGPU::AV_96RegClass;
3774 if (BitWidth == 128)
3775 return &AMDGPU::AV_128RegClass;
3776 if (BitWidth == 160)
3777 return &AMDGPU::AV_160RegClass;
3778 if (BitWidth == 192)
3779 return &AMDGPU::AV_192RegClass;
3780 if (BitWidth == 224)
3781 return &AMDGPU::AV_224RegClass;
3782 if (BitWidth == 256)
3783 return &AMDGPU::AV_256RegClass;
3784 if (BitWidth == 288)
3785 return &AMDGPU::AV_288RegClass;
3786 if (BitWidth == 320)
3787 return &AMDGPU::AV_320RegClass;
3788 if (BitWidth == 352)
3789 return &AMDGPU::AV_352RegClass;
3790 if (BitWidth == 384)
3791 return &AMDGPU::AV_384RegClass;
3792 if (BitWidth == 512)
3793 return &AMDGPU::AV_512RegClass;
3794 if (BitWidth == 1024)
3795 return &AMDGPU::AV_1024RegClass;
3796
// No AV class is listed for this width.
3797 return nullptr;
3798}
3799
/// Maps a register width in bits to the AV_<N>_Align2 register class of that
/// width.
///
/// Every comparison is an exact match, so only the widths listed below are
/// recognised; any other BitWidth falls through to the final nullptr.
3800static const TargetRegisterClass *
3802 if (BitWidth == 64)
3803 return &AMDGPU::AV_64_Align2RegClass;
3804 if (BitWidth == 96)
3805 return &AMDGPU::AV_96_Align2RegClass;
3806 if (BitWidth == 128)
3807 return &AMDGPU::AV_128_Align2RegClass;
3808 if (BitWidth == 160)
3809 return &AMDGPU::AV_160_Align2RegClass;
3810 if (BitWidth == 192)
3811 return &AMDGPU::AV_192_Align2RegClass;
3812 if (BitWidth == 224)
3813 return &AMDGPU::AV_224_Align2RegClass;
3814 if (BitWidth == 256)
3815 return &AMDGPU::AV_256_Align2RegClass;
3816 if (BitWidth == 288)
3817 return &AMDGPU::AV_288_Align2RegClass;
3818 if (BitWidth == 320)
3819 return &AMDGPU::AV_320_Align2RegClass;
3820 if (BitWidth == 352)
3821 return &AMDGPU::AV_352_Align2RegClass;
3822 if (BitWidth == 384)
3823 return &AMDGPU::AV_384_Align2RegClass;
3824 if (BitWidth == 512)
3825 return &AMDGPU::AV_512_Align2RegClass;
3826 if (BitWidth == 1024)
3827 return &AMDGPU::AV_1024_Align2RegClass;
3828
// No aligned AV class is listed for this width.
3829 return nullptr;
3830}
3831
/// Returns the AV register class for a given width in bits.
///
/// A 32-bit width maps directly to AV_32; for every other width the result
/// depends on ST.needsAlignedVGPRs().
3832const TargetRegisterClass *
3834 if (BitWidth == 32)
3835 return &AMDGPU::AV_32RegClass;
// Wider widths: the subtarget's alignment requirement selects the lookup.
3836 return ST.needsAlignedVGPRs()
3839}
3840
/// Returns the default vector register class for a given width in bits.
///
/// When the subtarget reports hasGFX90AInsts(), the result comes from
/// getVectorSuperClassForBitWidth(BitWidth).
3841const TargetRegisterClass *
3843 // TODO: In principle this should use AV classes for gfx908 too. This is
3844 // limited to 90a+ to avoid regressing special case copy optimizations which
3845 // need new handling. The core issue is that it's not possible to directly
3846 // copy between AGPRs on gfx908, and the current optimizations around that
3847 // expect to see copies to VGPR.
3848 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3850}
3851
/// Maps a register width in bits to a scalar register class.
///
/// 16 and 32 bits both map to SReg_32, 64 bits maps to SReg_64, and the wider
/// listed widths map to the SGPR_<N> class of that width. Every comparison is
/// an exact match; any width not listed yields nullptr.
3852const TargetRegisterClass *
3854 if (BitWidth == 16 || BitWidth == 32)
3855 return &AMDGPU::SReg_32RegClass;
3856 if (BitWidth == 64)
3857 return &AMDGPU::SReg_64RegClass;
3858 if (BitWidth == 96)
3859 return &AMDGPU::SGPR_96RegClass;
3860 if (BitWidth == 128)
3861 return &AMDGPU::SGPR_128RegClass;
3862 if (BitWidth == 160)
3863 return &AMDGPU::SGPR_160RegClass;
3864 if (BitWidth == 192)
3865 return &AMDGPU::SGPR_192RegClass;
3866 if (BitWidth == 224)
3867 return &AMDGPU::SGPR_224RegClass;
3868 if (BitWidth == 256)
3869 return &AMDGPU::SGPR_256RegClass;
3870 if (BitWidth == 288)
3871 return &AMDGPU::SGPR_288RegClass;
3872 if (BitWidth == 320)
3873 return &AMDGPU::SGPR_320RegClass;
3874 if (BitWidth == 352)
3875 return &AMDGPU::SGPR_352RegClass;
3876 if (BitWidth == 384)
3877 return &AMDGPU::SGPR_384RegClass;
3878 if (BitWidth == 512)
3879 return &AMDGPU::SGPR_512RegClass;
3880 if (BitWidth == 1024)
3881 return &AMDGPU::SGPR_1024RegClass;
3882
// No scalar class is listed for this width.
3883 return nullptr;
3884}
3885
3887 Register Reg) const {
// Reports whether Reg belongs to an SGPR register class.
// Virtual registers take their class from MRI; physical registers use their
// base class.
3888 const TargetRegisterClass *RC;
3889 if (Reg.isVirtual())
3890 RC = MRI.getRegClass(Reg);
3891 else
3892 RC = getPhysRegBaseClass(Reg);
// A register with no class (null RC) is reported as not an SGPR.
3893 return RC && isSGPRClass(RC);
3894}
3895
/// Returns an allocatable VGPR class with the same bit size as \p SRC.
///
/// VS_32_Lo256 and VS_64_Lo256 are special-cased to the aligned Lo256 VGPR
/// class of that size; every other class goes through
/// getVGPRClassForBitWidth(). Only the general path asserts that a class was
/// found.
3896const TargetRegisterClass *
3898 unsigned Size = getRegSizeInBits(*SRC);
3899
3900 switch (SRC->getID()) {
3901 default:
3902 break;
3903 case AMDGPU::VS_32_Lo256RegClassID:
3904 case AMDGPU::VS_64_Lo256RegClassID:
3905 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3906 }
3907
3908 const TargetRegisterClass *VRC =
3909 getAllocatableClass(getVGPRClassForBitWidth(Size));
3910 assert(VRC && "Invalid register class size");
3911 return VRC;
3912}
3913
/// Returns the register class ARC chosen for the bit size of \p SRC.
/// A null ARC is treated as a programming error and trips the assert.
3914const TargetRegisterClass *
3916 unsigned Size = getRegSizeInBits(*SRC);
3918 assert(ARC && "Invalid register class size");
3919 return ARC;
3920}
3921
/// Returns the register class ARC chosen for the bit size of \p SRC.
/// A null ARC is treated as a programming error and trips the assert.
3922const TargetRegisterClass *
3924 unsigned Size = getRegSizeInBits(*SRC);
3926 assert(ARC && "Invalid register class size");
3927 return ARC;
3928}
3929
/// Returns a scalar register class with the same bit size as \p VRC.
///
/// A 32-bit input always maps to SGPR_32. For other sizes a null result is
/// treated as a programming error and trips the assert.
3930const TargetRegisterClass *
3932 unsigned Size = getRegSizeInBits(*VRC);
3933 if (Size == 32)
3934 return &AMDGPU::SGPR_32RegClass;
3936 assert(SRC && "Invalid register class size");
3937 return SRC;
3938}
3939
/// Looks up the super-register class matching (\p SuperRC, \p SubRC,
/// \p SubIdx) and returns it only when it passes the hasSubClassEq(SuperRC)
/// check; otherwise returns nullptr.
3940const TargetRegisterClass *
3942 const TargetRegisterClass *SubRC,
3943 unsigned SubIdx) const {
3944 // Ensure this subregister index is aligned in the super register.
3945 const TargetRegisterClass *MatchRC =
3946 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
// Reject a missing match, or one for which SuperRC is not MatchRC itself or a
// sub-class of it.
3947 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3948}
3949
/// Reports whether an operand of type \p OpType may hold an inline constant.
3950bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
// Special case: the answer is governed by the subtarget's MFMA inline-literal
// bug flag.
3953 return !ST.hasMFMAInlineLiteralBug();
3954
// General case: any operand type inside the inclusive OPERAND_SRC_FIRST ..
// OPERAND_SRC_LAST range qualifies.
3955 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3956 OpType <= AMDGPU::OPERAND_SRC_LAST;
3957}
3958
/// Reports whether an operand of type \p OpType may hold a literal constant,
/// decided by a range check on OpType whose lower bound is
/// OPERAND_REG_IMM_FIRST.
3959bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3960 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3961 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3963}
3964
3965/// Returns the first register of \p RC, in class order, that is allocatable,
3966/// not used at any point in the function, and not VCC, VCC_LO or VCC_HI.
3967/// If no such register exists, returns a default-constructed MCRegister.
3968/// If \p ReserveHighestRegister = true, \p RC is scanned in reverse order.
3970 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3971 const MachineFunction &MF, bool ReserveHighestRegister) const {
3972 // Never offer VCC as an unused register.
3973 auto isVCC = [](MCRegister Reg) {
3974 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3975 };
3976
// Both loops apply the same filter; only the iteration direction differs.
3977 if (ReserveHighestRegister) {
3978 for (MCRegister Reg : reverse(*RC))
3979 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3980 return Reg;
3981 } else {
3982 for (MCRegister Reg : *RC)
3983 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3984 return Reg;
3985 }
// Every register in RC was rejected by the filter.
3986 return MCRegister();
3987}
3988
// Reports whether Reg is uniform, judged by its register bank: a register with
// no bank is reported as not uniform; otherwise it is uniform exactly when RBI
// does not classify the bank as divergent.
3990 const RegisterBankInfo &RBI,
3991 Register Reg) const {
3992 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3993 if (!RB)
3994 return false;
3995
3996 return !RBI.isDivergentRegBank(RB);
3997}
3998
4000 unsigned EltSize) const {
// Returns the leading entries of the RegSplitParts row for this element size,
// one entry per EltSize-sized piece of RC. EltSize is divided by 2 to get a
// count of 16-bit halves, so it is presumably a size in bytes - confirm
// against callers.
4001 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
4002 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
4003
// Work in units of 16-bit halves.
4004 const unsigned RegHalves = RegBitWidth / 16;
4005 const unsigned EltHalves = EltSize / 2;
// NOTE(review): this assert still passes when EltHalves == size() + 1, yet the
// index used below is EltHalves - 1, which would then equal size() and be out
// of range. The tight bound looks like EltHalves <= RegSplitParts.size() -
// confirm and tighten.
4006 assert(RegSplitParts.size() + 1 >= EltHalves);
4007
// Rows are indexed by EltHalves - 1.
4008 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
4009 const unsigned NumParts = RegHalves / EltHalves;
4010
// Expose only the first NumParts entries of the row.
4011 return ArrayRef(Parts.data(), NumParts);
4012}
4013
4016 Register Reg) const {
// Virtual registers take their class from MRI; physical registers use their
// base class.
4017 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
4018}
4019
/// Returns the register class of the value \p MO refers to: the class of the
/// operand's register, narrowed by getSubRegisterClass() using the operand's
/// sub-register index.
4020const TargetRegisterClass *
4022 const MachineOperand &MO) const {
4023 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
4024 return getSubRegisterClass(SrcRC, MO.getSubReg());
4025}
4026
4028 Register Reg) const {
// Reports whether Reg's register class is a VGPR class.
4029 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4030 // Registers without classes are unaddressable, SGPR-like registers.
4031 return RC && isVGPRClass(RC);
4032}
4033
4035 Register Reg) const {
// Reports whether Reg's register class is an AGPR class.
4036 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4037
4038 // Registers without classes are unaddressable, SGPR-like registers.
4039 return RC && isAGPRClass(RC);
4040}
4041
4043 MachineFunction &MF) const {
// Returns the register-pressure limit for RC in MF. Only VGPR_32, SGPR_32 and
// SGPR_LO16 are handled here; every other class defers to the generated
// implementation.
// MinOcc is the first element of the occupancy pair computed from the
// work-group sizes.
4044 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
4045 switch (RC->getID()) {
4046 default:
4047 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
4048 case AMDGPU::VGPR_32RegClassID:
// Smaller of the VGPR budget at MinOcc and the function's own maximum.
4049 return std::min(
4050 ST.getMaxNumVGPRs(
4051 MinOcc,
4053 ST.getMaxNumVGPRs(MF));
4054 case AMDGPU::SGPR_32RegClassID:
4055 case AMDGPU::SGPR_LO16RegClassID:
// Smaller of the SGPR budget at MinOcc and the function's own maximum.
4056 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
4057 }
4058}
4059
4061 unsigned Idx) const {
// Returns the limit for pressure set Idx by delegating to the per-class limit:
// the VGPR_32 and AGPR_32 sets both use the VGPR_32 class limit, and the
// SReg_32 set uses the SGPR_32 class limit.
// The const_cast is needed because the per-class query called here is passed a
// non-const MachineFunction.
4062 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
4063 case AMDGPU::RegisterPressureSets::VGPR_32:
4064 case AMDGPU::RegisterPressureSets::AGPR_32:
4065 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
4066 const_cast<MachineFunction &>(MF));
4067 case AMDGPU::RegisterPressureSets::SReg_32:
4068 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
4069 const_cast<MachineFunction &>(MF));
4070 }
4071
// Any other pressure-set index is a caller bug.
4072 llvm_unreachable("Unexpected register pressure set!");
4073}
4074
4075const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
4076 static const int Empty[] = { -1 };
4077
4078 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
4079 return Empty;
4080
4081 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
4082}
4083
4085 ArrayRef<MCPhysReg> Order,
4087 const MachineFunction &MF,
4088 const VirtRegMap *VRM,
4089 const LiveRegMatrix *Matrix) const {
// Appends allocation hints for VirtReg to Hints.
// The Size32 and Size16 hint kinds pair VirtReg with another register
// (Hint.second) so that the two can end up in a 32-bit register and its lo16
// sub-register; both cases return false. Every other hint kind is forwarded to
// the TargetRegisterInfo implementation. Matrix is not referenced in this
// body.
4090
4091 const MachineRegisterInfo &MRI = MF.getRegInfo();
4092 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4093
4094 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
4095
4096 switch (Hint.first) {
4097 case AMDGPURI::Size32: {
// Find the VGPR_32 whose lo16 sub-register is the physical register of the
// paired register. A virtual pair is resolved through VRM, if it has been
// assigned.
4098 Register Paired = Hint.second;
4099 assert(Paired);
4100 Register PairedPhys;
4101 if (Paired.isPhysical()) {
4102 PairedPhys =
4103 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
4104 } else if (VRM && VRM->hasPhys(Paired)) {
4105 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
4106 &AMDGPU::VGPR_32RegClass);
4107 }
4108
4109 // Prefer the paired physreg.
4110 if (PairedPhys)
4111 // isLo(Paired) is implicitly true here from the API of
4112 // getMatchingSuperReg.
4113 Hints.push_back(PairedPhys);
4114 return false;
4115 }
4116 case AMDGPURI::Size16: {
// Take the lo16 sub-register of the physical register of the paired register.
// A virtual pair is resolved through VRM, if it has been assigned.
4117 Register Paired = Hint.second;
4118 assert(Paired);
4119 Register PairedPhys;
4120 if (Paired.isPhysical()) {
4121 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
4122 } else if (VRM && VRM->hasPhys(Paired)) {
4123 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
4124 }
4125
4126 // First prefer the paired physreg.
4127 if (PairedPhys)
4128 Hints.push_back(PairedPhys);
4129 else {
4130 // Add all the lo16 physregs.
4131 // When the Paired operand has not yet been assigned a physreg it is
4132 // better to try putting VirtReg in a lo16 register, because possibly
4133 // later Paired can be assigned to the overlapping register and the COPY
4134 // can be eliminated.
4135 for (MCPhysReg PhysReg : Order) {
// NOTE(review): PairedPhys tested false to reach this branch, so the
// `PhysReg == PairedPhys` comparison looks vestigial here - confirm.
4136 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
4137 continue;
4138 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
4139 !MRI.isReserved(PhysReg))
4140 Hints.push_back(PhysReg);
4141 }
4142 }
4143 return false;
4144 }
4145 default:
4146 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
4147 VRM);
4148 }
4149}
4150
// The return address register is always SGPR30_SGPR31.
4152 // Not a callee saved register.
4153 return AMDGPU::SGPR30_SGPR31;
4154}
4155
/// Returns the register class to use for a value of \p Size bits on register
/// bank \p RB.
///
/// For the SGPR and AGPR banks, sizes below 32 are raised to 32 before the
/// class lookup. For the VGPR bank the minimum is 16 when the subtarget uses
/// real true16 instructions and 32 otherwise. The VCC bank accepts only
/// Size == 1 and yields the wave mask class. Any other bank is unreachable.
4156const TargetRegisterClass *
4158 const RegisterBank &RB) const {
4159 switch (RB.getID()) {
4160 case AMDGPU::VGPRRegBankID:
4162 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
4163 case AMDGPU::VCCRegBankID:
4164 assert(Size == 1);
4165 return getWaveMaskRegClass();
4166 case AMDGPU::SGPRRegBankID:
4167 return getSGPRClassForBitWidth(std::max(32u, Size));
4168 case AMDGPU::AGPRRegBankID:
4169 return getAGPRClassForBitWidth(std::max(32u, Size));
4170 default:
4171 llvm_unreachable("unknown register bank");
4172 }
4173}
4174
/// Returns the register class to constrain the register of \p MO to.
///
/// If the register currently carries only a register bank, the class is
/// derived from the register's LLT type and that bank. If it already carries a
/// register class, the allocatable form of that class is returned. If it
/// carries neither, the result is nullptr.
4175const TargetRegisterClass *
4177 const MachineRegisterInfo &MRI) const {
4178 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
4179 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
4180 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
4181
4182 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
4183 return getAllocatableClass(RC);
4184
4185 return nullptr;
4186}
4187
// VCC_LO when isWave32 is set, VCC otherwise.
4189 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
4190}
4191
// EXEC_LO when isWave32 is set, EXEC otherwise.
4193 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4194}
4195
// Returns the 64-bit VGPR class: the Align2 variant when the subtarget needs
// aligned VGPRs, the plain VReg_64 class otherwise.
4197 // VGPR tuples have an alignment requirement on gfx90a variants.
4198 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
4199 : &AMDGPU::VReg_64RegClass;
4200}
4201
4202// Find the instruction that defines the value of Reg (optionally restricted
// to sub-register SubReg) that is live at Use, using LiveIntervals.
// Returns nullptr when no live value is found at Use, when the defining slot
// has no instruction, or when that instruction does not dominate Use.
4206 LiveIntervals *LIS) const {
4207 auto &MDT = LIS->getDomTree();
4208 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
4209 SlotIndex DefIdx;
4210
4211 if (Reg.isVirtual()) {
4212 if (!LIS->hasInterval(Reg))
4213 return nullptr;
4214 LiveInterval &LI = LIS->getInterval(Reg);
// Lanes being asked about: those of SubReg, or all lanes of Reg when no
// sub-register is given.
4215 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
4216 : MRI.getMaxLaneMaskForVReg(Reg);
4217 VNInfo *V = nullptr;
4218 if (LI.hasSubRanges()) {
// Use the first subrange whose lane mask covers every requested lane. If none
// does, V stays null and the function returns nullptr below.
4219 for (auto &S : LI.subranges()) {
4220 if ((S.LaneMask & SubLanes) == SubLanes) {
4221 V = S.getVNInfoAt(UseIdx);
4222 break;
4223 }
4224 }
4225 } else {
4226 V = LI.getVNInfoAt(UseIdx);
4227 }
4228 if (!V)
4229 return nullptr;
4230 DefIdx = V->def;
4231 } else {
4232 // Find last def.
// Every register unit must have a live value at Use. Among their defs, DefIdx
// is replaced whenever the current pick dominates the new candidate, so the
// def that is dominated by the others is kept.
4233 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
4234 LiveRange &LR = LIS->getRegUnit(Unit);
4235 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
4236 if (!DefIdx.isValid() ||
4237 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
4238 LIS->getInstructionFromIndex(V->def)))
4239 DefIdx = V->def;
4240 } else {
4241 return nullptr;
4242 }
4243 }
4244 }
4245
4246 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
4247
// The def is only useful to callers if it is a real instruction that
// dominates the use.
4248 if (!Def || !MDT.dominates(Def, &Use))
4249 return nullptr;
4250
4251 assert(Def->modifiesRegister(Reg, this));
4252
4253 return Def;
4254}
4255
// Returns the 32-bit register that has Reg as a 16-bit sub-register, or
// AMDGPU::NoRegister if none is found. Reg's base class must be at most 32
// bits wide.
4257 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4258
// First try Reg as the lo16 half of a VGPR_32, SReg_32 or AGPR_32 register, in
// that order.
4259 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4260 AMDGPU::SReg_32RegClass,
4261 AMDGPU::AGPR_32RegClass } ) {
4262 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4263 return Super;
4264 }
// The hi16 half is only looked up in VGPR_32.
4265 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4266 &AMDGPU::VGPR_32RegClass)) {
4267 return Super;
4268 }
4269
4270 return AMDGPU::NoRegister;
4271}
4272
// Reports whether RC satisfies the subtarget's VGPR alignment requirement.
// Subtargets that do not need aligned VGPRs accept every class.
4274 if (!ST.needsAlignedVGPRs())
4275 return true;
4276
// For VGPR, AGPR and AV classes, RC passes when the class the subtarget would
// pick for RC's bit size is RC itself or one of RC's super-classes.
4277 if (isVGPRClass(&RC))
4278 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4279 if (isAGPRClass(&RC))
4280 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4281 if (isVectorSuperClass(&RC))
4282 return RC.hasSuperClassEq(
4283 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4284
4285 assert(&RC != &AMDGPU::VS_64RegClass);
4286
// Every remaining class is accepted.
4287 return true;
4288}
4289
// The first getMaxNumSGPRs(MF) / 4 registers of SGPR_128, in class order.
4292 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4293}
4294
// The first getMaxNumSGPRs(MF) / 2 registers of SGPR_64, in class order.
4297 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4298}
4299
// The first getMaxNumSGPRs(MF) registers of SGPR_32, in class order.
4302 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4303}
4304
/// Returns the size of sub-register index \p SubReg, capped according to the
/// register kind encoded in \p RC's TSFlags: at most 128 for SGPR classes and
/// at most 32 for the VGPR/AGPR cases. Any other kind yields 0.
4305unsigned
4307 unsigned SubReg) const {
4308 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4309 case SIRCFlags::HasSGPR:
4310 return std::min(128u, getSubRegIdxSize(SubReg));
4311 case SIRCFlags::HasAGPR:
4312 case SIRCFlags::HasVGPR:
4314 return std::min(32u, getSubRegIdxSize(SubReg));
4315 default:
4316 break;
4317 }
4318 return 0;
4319}
4320
// Returns one more than the hardware index of the highest register of RC that
// MRI reports as used, or 0 when none is used. VCC_LO and VCC_HI are never
// counted. When IncludeCalls is false, the register-mask test is skipped.
4322 const TargetRegisterClass &RC,
4323 bool IncludeCalls) const {
4324 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
// For VGPR_32 only the first NumArchVGPRs registers are considered.
4326 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4327 ? RC.getRegisters().take_front(NumArchVGPRs)
4328 : RC.getRegisters();
// Scan from the top so the first hit is the highest used register.
4329 for (MCPhysReg Reg : reverse(Registers)) {
4330 if (Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
4331 MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4332 return getHWRegIndex(Reg) + 1;
4333 }
4334 return 0;
4335}
4336
// Collects the flag names that apply to Reg in MF. The only flag reported here
// is "WWM_REG", added when the function info has the WWM_REG flag set for Reg.
4339 const MachineFunction &MF) const {
4341 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4342 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4343 RegFlags.push_back("WWM_REG");
4344 return RegFlags;
4345}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill, bool NeedsCFI)
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static cl::opt< bool > EnableSpillCFISavedRegs("amdgpu-spill-cfi-saved-regs", cl::desc("Enable spilling the registers required for CFI emission"), cl::ReallyHidden, cl::init(false), cl::ZeroOrMore)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
bool test(unsigned Idx) const
Returns true if bit Idx is set.
Definition BitVector.h:482
bool empty() const
Returns whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(AsmPrinterFlagTy Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified register class.
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been reserved.
std::pair< unsigned, Register > getRegAllocationHint(Register VReg) const
getRegAllocationHint - Return the register allocation hint for the specified virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual register Reg.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
MachineInstr * buildCFIForSGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const
Create a CFI index describing a spill of a SGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForVRegToVRegSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const
Create a CFI index describing a spill of the VGPR/AGPR Reg to another VGPR/AGPR RegCopy and build a MachineInstr around it.
MachineInstr * buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const
Create a CFI index describing a spill of a VGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR, const int Lane) const
Create a CFI index describing a spill of an SGPR to a single lane of a VGPR and build a MachineInstr around it.
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class SubRC with subregister index SubIdx.
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns the lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr, bool NeedsCFI=false) const
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
void buildCFIForBlockCSRStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register BlockReg, int64_t Offset) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false, bool NeedsCFI=false) const
If OnlyToVGPR is true, this will only succeed if this manages to find a free VGPR lane to spill.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
bool isCFISavedRegsSpillEnabled() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocator.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same spill size.
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal calling convention calls for.
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physical register for the virtual register VirtReg.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
returns true if the specified virtual register is mapped to a physical register
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:255
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:261
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:262
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:256
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:27
@ HasVGPR
Definition SIDefines.h:25
@ RegKindMask
Definition SIDefines.h:30
@ HasAGPR
Definition SIDefines.h:26
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
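This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.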
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67