LLVM 23.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
39 "amdgpu-spill-cfi-saved-regs",
40 cl::desc("Enable spilling the registers required for CFI emission"),
42
43std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
44std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
45
46// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
47// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
48// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
49// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    /* 0 DWORDs      */ 0,
    /* 1 - 8 DWORDs  */ 1, 2, 3, 4, 5, 6, 7, 8,
    /* 9 - 15 DWORDs */ 0, 0, 0, 0, 0, 0, 0,
    /* 16 DWORDs     */ 9};
52
53static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
54 const Twine &ErrMsg) {
56 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
57}
58
59namespace llvm {
60
61// A temporary struct to spill SGPRs.
62// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
63// just v_writelane and v_readlane.
64//
65// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
66// is saved to scratch (or the other way around for loads).
67// For this, a VGPR is required where the needed lanes can be clobbered. The
68// RegScavenger can provide a VGPR where currently active lanes can be
69// clobbered, but we still need to save inactive lanes.
70// The high-level steps are:
71// - Try to scavenge SGPR(s) to save exec
72// - Try to scavenge VGPR
73// - Save needed, all or inactive lanes of a TmpVGPR
74// - Spill/Restore SGPRs using TmpVGPR
75// - Restore TmpVGPR
76//
77// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
78// cannot scavenge temporary SGPRs to save exec, we use the following code:
79// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
80// s_not exec, exec
81// buffer_store_dword TmpVGPR ; save inactive lanes
82// s_not exec, exec
84 struct PerVGPRData {
85 unsigned PerVGPR;
86 unsigned NumVGPRs;
87 int64_t VGPRLanes;
88 };
89
90 // The SGPR to save
94 unsigned NumSubRegs;
95 bool IsKill;
96 const DebugLoc &DL;
97
98 /* When spilling to stack */
99 // The SGPRs are written into this VGPR, which is then written to scratch
100 // (or vice versa for loads).
101 Register TmpVGPR = AMDGPU::NoRegister;
102 // Temporary spill slot to save TmpVGPR to.
104 // If TmpVGPR is live before the spill or if it is scavenged.
105 bool TmpVGPRLive = false;
106 // Scavenged SGPR to save EXEC.
107 Register SavedExecReg = AMDGPU::NoRegister;
108 // Stack index to write the SGPRs to.
109 int Index;
110 unsigned EltSize = 4;
111
120 unsigned MovOpc;
121 unsigned NotOpc;
122
126 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
127 MI->getOperand(0).isKill(), Index, RS) {}
128
131 bool IsKill, int Index, RegScavenger *RS)
132 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
133 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
134 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
136 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
137 SplitParts = TRI.getRegSplitParts(RC, EltSize);
138 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
139
140 if (IsWave32) {
141 ExecReg = AMDGPU::EXEC_LO;
142 MovOpc = AMDGPU::S_MOV_B32;
143 NotOpc = AMDGPU::S_NOT_B32;
144 } else {
145 ExecReg = AMDGPU::EXEC;
146 MovOpc = AMDGPU::S_MOV_B64;
147 NotOpc = AMDGPU::S_NOT_B64;
148 }
149
150 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
151 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
152 SuperReg != AMDGPU::EXEC && "exec should never spill");
153 }
154
157 Data.PerVGPR = IsWave32 ? 32 : 64;
158 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
159 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
160 return Data;
161 }
162
163 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
164 // free.
165 // Writes these instructions if an SGPR can be scavenged:
166 // s_mov_b64 s[6:7], exec ; Save exec
167 // s_mov_b64 exec, 3 ; Wanted lanemask
168 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
169 //
170 // Writes these instructions if no SGPR can be scavenged:
171 // buffer_store_dword v0 ; Only if no free VGPR was found
172 // s_not_b64 exec, exec
173 // buffer_store_dword v0 ; Save inactive lanes
174 // ; exec stays inverted, it is flipped back in
175 // ; restore.
176 void prepare() {
177 // Scavenged temporary VGPR to use. It must be scavenged once for any number
178 // of spilled subregs.
179 // FIXME: The liveness analysis is limited and does not tell if a register
180 // is in use in lanes that are currently inactive. We can never be sure if
181 // a register is actually in use in another lane, so we need to save all
182 // used lanes of the chosen VGPR.
183 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
184 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
185 0, false);
186
187 // Reserve temporary stack slot
188 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
189 if (TmpVGPR) {
190 // Found a register that is dead in the currently active lanes, we only
191 // need to spill inactive lanes.
192 TmpVGPRLive = false;
193 } else {
194 // Pick v0 because it doesn't make a difference.
195 TmpVGPR = AMDGPU::VGPR0;
196 TmpVGPRLive = true;
197 }
198
199 if (TmpVGPRLive) {
200 // We need to inform the scavenger that this index is already in use until
201 // we're done with the custom emergency spill.
202 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
203 }
204
205 // We may end up recursively calling the scavenger, and don't want to re-use
206 // the same register.
207 RS->setRegUsed(TmpVGPR);
208
209 // Try to scavenge SGPRs to save exec
210 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
211 const TargetRegisterClass &RC =
212 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
213 RS->setRegUsed(SuperReg);
214 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
215
216 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
217
218 if (SavedExecReg) {
219 RS->setRegUsed(SavedExecReg);
220 // Set exec to needed lanes
222 auto I =
223 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
224 if (!TmpVGPRLive)
226 // Spill needed lanes
227 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
228 } else {
229 // The modify and restore of exec clobber SCC, which we would have to save
230 // and restore. FIXME: We probably would need to reserve a register for
231 // this.
232 if (RS->isRegUsed(AMDGPU::SCC))
233 emitUnsupportedError(MF.getFunction(), *MI,
234 "unhandled SGPR spill to memory");
235
236 // Spill active lanes
237 if (TmpVGPRLive)
238 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
239 /*IsKill*/ false);
240 // Spill inactive lanes
241 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
242 if (!TmpVGPRLive)
244 I->getOperand(2).setIsDead(); // Mark SCC as dead.
245 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
246 }
247 }
248
249 // Writes these instructions if an SGPR can be scavenged:
250 // buffer_load_dword v1 ; Restore scavenged VGPR from emergency slot
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_mov_b64 exec, s[6:7] ; Restore exec
253 //
254 // Writes these instructions if no SGPR can be scavenged:
255 // buffer_load_dword v0 ; Restore inactive lanes
256 // s_waitcnt vmcnt(0) ; If a free VGPR was found
257 // s_not_b64 exec, exec
258 // buffer_load_dword v0 ; Only if no free VGPR was found
259 void restore() {
260 if (SavedExecReg) {
261 // Restore used lanes
262 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 // Restore exec
265 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
267 // Add an implicit use of the load so it is not dead.
268 // FIXME This inserts an unnecessary waitcnt
269 if (!TmpVGPRLive) {
271 }
272 } else {
273 // Restore inactive lanes
274 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
275 /*IsKill*/ false);
276 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
277 if (!TmpVGPRLive)
279 I->getOperand(2).setIsDead(); // Mark SCC as dead.
280
281 // Restore active lanes
282 if (TmpVGPRLive)
283 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
284 }
285
286 // Inform the scavenger where we're releasing our custom scavenged register.
287 if (TmpVGPRLive) {
288 MachineBasicBlock::iterator RestorePt = std::prev(MI);
289 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
290 }
291 }
292
293 // Write TmpVGPR to memory or read TmpVGPR from memory.
294 // Either using a single buffer_load/store if exec is set to the needed mask
295 // or using
296 // buffer_load
297 // s_not exec, exec
298 // buffer_load
299 // s_not exec, exec
300 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
301 if (SavedExecReg) {
302 // Spill needed lanes
303 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
304 } else {
305 // The modify and restore of exec clobber SCC, which we would have to save
306 // and restore. FIXME: We probably would need to reserve a register for
307 // this.
308 if (RS->isRegUsed(AMDGPU::SCC))
309 emitUnsupportedError(MF.getFunction(), *MI,
310 "unhandled SGPR spill to memory");
311
312 // Spill active lanes
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
314 /*IsKill*/ false);
315 // Spill inactive lanes
316 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
317 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
318 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
319 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
320 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
321 }
322 }
323
325 assert(MBB->getParent() == &MF);
326 MI = NewMI;
327 MBB = NewMBB;
328 }
329};
330
331} // namespace llvm
332
334 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
335 ST.getAMDGPUDwarfFlavour(),
336 /*PC=*/0,
337 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
338 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
339
340 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
341 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
342 (getSubRegIndexLaneMask(AMDGPU::lo16) |
343 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
344 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
345 "getNumCoveredRegs() will not work with generated subreg masks!");
346
347 RegPressureIgnoredUnits.resize(getNumRegUnits());
348 RegPressureIgnoredUnits.set(
349 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
350 for (auto Reg : AMDGPU::VGPR_16RegClass) {
351 if (AMDGPU::isHi16Reg(Reg, *this))
352 RegPressureIgnoredUnits.set(
353 static_cast<unsigned>(*regunits(Reg).begin()));
354 }
355
356 // HACK: Until this is fully tablegen'd.
357 static llvm::once_flag InitializeRegSplitPartsFlag;
358
359 static auto InitializeRegSplitPartsOnce = [this]() {
360 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
361 unsigned Size = getSubRegIdxSize(Idx);
362 if (Size & 15)
363 continue;
364 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
365 unsigned Pos = getSubRegIdxOffset(Idx);
366 if (Pos % Size)
367 continue;
368 Pos /= Size;
369 if (Vec.empty()) {
370 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
371 Vec.resize(MaxNumParts);
372 }
373 Vec[Pos] = Idx;
374 }
375 };
376
377 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
378
379 static auto InitializeSubRegFromChannelTableOnce = [this]() {
380 for (auto &Row : SubRegFromChannelTable)
381 Row.fill(AMDGPU::NoSubRegister);
382 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
383 unsigned Width = getSubRegIdxSize(Idx) / 32;
384 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
386 Width = SubRegFromChannelTableWidthMap[Width];
387 if (Width == 0)
388 continue;
389 unsigned TableIdx = Width - 1;
390 assert(TableIdx < SubRegFromChannelTable.size());
391 assert(Offset < SubRegFromChannelTable[TableIdx].size());
392 SubRegFromChannelTable[TableIdx][Offset] = Idx;
393 }
394 };
395
396 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
397 llvm::call_once(InitializeSubRegFromChannelTableFlag,
398 InitializeSubRegFromChannelTableOnce);
399}
400
401void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
402 MCRegister Reg) const {
403 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
404 Reserved.set(*R);
405}
406
407// Forced to be here by one .inc
409 const MachineFunction *MF) const {
411 switch (CC) {
412 case CallingConv::C:
415 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
416 : CSR_AMDGPU_SaveList;
419 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
420 : CSR_AMDGPU_SI_Gfx_SaveList;
422 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
423 default: {
424 // Dummy to not crash RegisterClassInfo.
425 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
426 return &NoCalleeSavedReg;
427 }
428 }
429}
430
431const MCPhysReg *
433 return nullptr;
434}
435
437 CallingConv::ID CC) const {
438 switch (CC) {
439 case CallingConv::C:
442 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
443 : CSR_AMDGPU_RegMask;
446 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
447 : CSR_AMDGPU_SI_Gfx_RegMask;
450 // Calls to these functions never return, so we can pretend everything is
451 // preserved.
452 return AMDGPU_AllVGPRs_RegMask;
453 default:
454 return nullptr;
455 }
456}
457
459 return CSR_AMDGPU_NoRegs_RegMask;
460}
461
463 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
464}
465
468 const MachineFunction &MF) const {
469 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
470 // equivalent AV class. If used one, the verifier will crash after
471 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
472 // until Instruction selection.
473 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
474 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
475 return &AMDGPU::AV_32RegClass;
476 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
477 return &AMDGPU::AV_64RegClass;
478 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
479 RC == &AMDGPU::AReg_64_Align2RegClass)
480 return &AMDGPU::AV_64_Align2RegClass;
481 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
482 return &AMDGPU::AV_96RegClass;
483 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
484 RC == &AMDGPU::AReg_96_Align2RegClass)
485 return &AMDGPU::AV_96_Align2RegClass;
486 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
487 return &AMDGPU::AV_128RegClass;
488 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
489 RC == &AMDGPU::AReg_128_Align2RegClass)
490 return &AMDGPU::AV_128_Align2RegClass;
491 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
492 return &AMDGPU::AV_160RegClass;
493 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
494 RC == &AMDGPU::AReg_160_Align2RegClass)
495 return &AMDGPU::AV_160_Align2RegClass;
496 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
497 return &AMDGPU::AV_192RegClass;
498 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
499 RC == &AMDGPU::AReg_192_Align2RegClass)
500 return &AMDGPU::AV_192_Align2RegClass;
501 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
502 return &AMDGPU::AV_256RegClass;
503 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
504 RC == &AMDGPU::AReg_256_Align2RegClass)
505 return &AMDGPU::AV_256_Align2RegClass;
506 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
507 return &AMDGPU::AV_512RegClass;
508 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
509 RC == &AMDGPU::AReg_512_Align2RegClass)
510 return &AMDGPU::AV_512_Align2RegClass;
511 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
512 return &AMDGPU::AV_1024RegClass;
513 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
514 RC == &AMDGPU::AReg_1024_Align2RegClass)
515 return &AMDGPU::AV_1024_Align2RegClass;
516 }
517
519}
520
522 const SIFrameLowering *TFI = ST.getFrameLowering();
524
525 // During ISel lowering we always reserve the stack pointer in entry and chain
526 // functions, but never actually want to reference it when accessing our own
527 // frame. If we need a frame pointer we use it, but otherwise we can just use
528 // an immediate "0" which we represent by returning NoRegister.
529 if (FuncInfo->isBottomOfStack()) {
530 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
531 }
532 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
533 : FuncInfo->getStackPtrOffsetReg();
534}
535
537 // When we need stack realignment, we can't reference off of the
538 // stack pointer, so we reserve a base pointer.
539 return shouldRealignStack(MF);
540}
541
542Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
543
545 return AMDGPU_AllVGPRs_RegMask;
546}
547
549 return AMDGPU_AllAGPRs_RegMask;
550}
551
553 return AMDGPU_AllVectorRegs_RegMask;
554}
555
557 return AMDGPU_AllAllocatableSRegs_RegMask;
558}
559
560unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
561 unsigned NumRegs) {
562 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
563 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
564 assert(NumRegIndex && "Not implemented");
565 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
566 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
567}
568
572
575 const unsigned Align,
576 const TargetRegisterClass *RC) const {
577 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
578 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
579 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
580}
581
583 const MachineFunction &MF) const {
584 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
585}
586
588 BitVector Reserved(getNumRegs());
589 Reserved.set(AMDGPU::MODE);
590
592
593 // Reserve special purpose registers.
594 //
595 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
596 // this seems likely to result in bugs, so I'm marking them as reserved.
597 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
598 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
599
600 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
601 reserveRegisterTuples(Reserved, AMDGPU::M0);
602
603 // Reserve src_vccz, src_execz, src_scc.
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
606 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
607
608 // Reserve the memory aperture registers
609 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
610 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
611 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
613 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
614 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
615
616 // Reserve async counters pseudo registers
617 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
618 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
619
620 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
622
623 // Reserve xnack_mask registers - support is not implemented in Codegen.
624 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
625
626 // Reserve lds_direct register - support is not implemented in Codegen.
627 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
628
629 // Reserve Trap Handler registers - support is not implemented in Codegen.
630 reserveRegisterTuples(Reserved, AMDGPU::TBA);
631 reserveRegisterTuples(Reserved, AMDGPU::TMA);
632 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
633 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
634 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
635 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
636 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
637 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
638 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
639 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
640
641 // Reserve null register - it shall never be allocated
642 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
643
644 // Reserve SGPRs.
645 //
646 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
647 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
648 for (const TargetRegisterClass *RC : regclasses()) {
649 if (RC->isBaseClass() && isSGPRClass(RC)) {
650 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
651 for (MCPhysReg Reg : *RC) {
652 unsigned Index = getHWRegIndex(Reg);
653 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs &&
654 Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
655 Reg != AMDGPU::VCC)
656 Reserved.set(Reg);
657 }
658 }
659 }
660
661 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
662 if (ScratchRSrcReg != AMDGPU::NoRegister) {
663 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
664 // need to spill.
665 // TODO: May need to reserve a VGPR if doing LDS spilling.
666 reserveRegisterTuples(Reserved, ScratchRSrcReg);
667 }
668
669 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
670 if (LongBranchReservedReg)
671 reserveRegisterTuples(Reserved, LongBranchReservedReg);
672
673 // We have to assume the SP is needed in case there are calls in the function,
674 // which is detected after the function is lowered. If we aren't really going
675 // to need SP, don't bother reserving it.
676 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
677 if (StackPtrReg) {
678 reserveRegisterTuples(Reserved, StackPtrReg);
679 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
680 }
681
682 MCRegister FrameReg = MFI->getFrameOffsetReg();
683 if (FrameReg) {
684 reserveRegisterTuples(Reserved, FrameReg);
685 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
686 }
687
688 if (hasBasePointer(MF)) {
689 MCRegister BasePtrReg = getBaseRegister();
690 reserveRegisterTuples(Reserved, BasePtrReg);
691 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
692 }
693
694 // FIXME: Use same reserved register introduced in D149775
695 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
696 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
697 if (ExecCopyReg)
698 reserveRegisterTuples(Reserved, ExecCopyReg);
699
700 // Reserve VGPRs/AGPRs.
701 //
702 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
703
704 for (const TargetRegisterClass *RC : regclasses()) {
705 if (RC->isBaseClass() && isVGPRClass(RC)) {
706 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
707 for (MCPhysReg Reg : *RC) {
708 unsigned Index = getHWRegIndex(Reg);
709 if (Index + NumRegs > MaxNumVGPRs)
710 Reserved.set(Reg);
711 }
712 }
713 }
714
715 // Reserve all the AGPRs if there are no instructions to use it.
716 if (!ST.hasMAIInsts())
717 MaxNumAGPRs = 0;
718 for (const TargetRegisterClass *RC : regclasses()) {
719 if (RC->isBaseClass() && isAGPRClass(RC)) {
720 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
721 for (MCPhysReg Reg : *RC) {
722 unsigned Index = getHWRegIndex(Reg);
723 if (Index + NumRegs > MaxNumAGPRs)
724 Reserved.set(Reg);
725 }
726 }
727 }
728
729 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
730 // VGPR available at all times.
731 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
732 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
733 }
734
735 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
736 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
737 // wwm-regalloc and it would be empty otherwise.
738 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
739 if (!NonWWMRegMask.empty()) {
740 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
741 RegI < RegE; ++RegI) {
742 if (NonWWMRegMask.test(RegI))
743 reserveRegisterTuples(Reserved, RegI);
744 }
745 }
746
747 for (Register Reg : MFI->getWWMReservedRegs())
748 reserveRegisterTuples(Reserved, Reg);
749
750 // FIXME: Stop using reserved registers for this.
751 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
752 reserveRegisterTuples(Reserved, Reg);
753
754 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
755 reserveRegisterTuples(Reserved, Reg);
756
757 return Reserved;
758}
759
761 MCRegister PhysReg) const {
762 return !MF.getRegInfo().isReserved(PhysReg);
763}
764
767 // On entry or in chain functions, the base address is 0, so it can't possibly
768 // need any more alignment.
769
770 // FIXME: Should be able to specify the entry frame alignment per calling
771 // convention instead.
772 if (Info->isBottomOfStack())
773 return false;
774
776}
777
780 if (Info->isEntryFunction()) {
781 const MachineFrameInfo &MFI = Fn.getFrameInfo();
782 return MFI.hasStackObjects() || MFI.hasCalls();
783 }
784
785 // May need scavenger for dealing with callee saved registers.
786 return true;
787}
788
790 const MachineFunction &MF) const {
791 // Do not use frame virtual registers. They used to be used for SGPRs, but
792 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
793 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
794 // spill.
795 return false;
796}
797
799 const MachineFunction &MF) const {
800 const MachineFrameInfo &MFI = MF.getFrameInfo();
801 return MFI.hasStackObjects();
802}
803
805 const MachineFunction &) const {
806 // There are no special dedicated stack or frame pointers.
807 return true;
808}
809
812
813 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
814 AMDGPU::OpName::offset);
815 return MI->getOperand(OffIdx).getImm();
816}
817
819 int Idx) const {
820 switch (MI->getOpcode()) {
821 case AMDGPU::V_ADD_U32_e32:
822 case AMDGPU::V_ADD_U32_e64:
823 case AMDGPU::V_ADD_CO_U32_e32: {
824 int OtherIdx = Idx == 1 ? 2 : 1;
825 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
826 return OtherOp.isImm() ? OtherOp.getImm() : 0;
827 }
828 case AMDGPU::V_ADD_CO_U32_e64: {
829 int OtherIdx = Idx == 2 ? 3 : 2;
830 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
831 return OtherOp.isImm() ? OtherOp.getImm() : 0;
832 }
833 default:
834 break;
835 }
836
838 return 0;
839
840 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
841 AMDGPU::OpName::vaddr) ||
842 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
843 AMDGPU::OpName::saddr))) &&
844 "Should never see frame index on non-address operand");
845
847}
848
850 const MachineInstr &MI) {
851 assert(MI.getDesc().isAdd());
852 const MachineOperand &Src0 = MI.getOperand(1);
853 const MachineOperand &Src1 = MI.getOperand(2);
854
855 if (Src0.isFI()) {
856 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
857 Src1.getReg()));
858 }
859
860 if (Src1.isFI()) {
861 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
862 Src0.getReg()));
863 }
864
865 return false;
866}
867
869 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
870 switch (MI->getOpcode()) {
871 case AMDGPU::V_ADD_U32_e32: {
872 // TODO: We could handle this but it requires work to avoid violating
873 // operand restrictions.
874 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
875 !isFIPlusImmOrVGPR(*this, *MI))
876 return false;
877 [[fallthrough]];
878 }
879 case AMDGPU::V_ADD_U32_e64:
880 // FIXME: This optimization is barely profitable as-is when flat scratch
881 // is enabled.
882 //
883 // Much of the benefit with the MUBUF handling is we avoid duplicating the
884 // shift of the frame register, which isn't needed with scratch.
885 //
886 // materializeFrameBaseRegister doesn't know the register classes of the
887 // uses, and unconditionally uses an s_add_i32, which will end up using a
888 // copy for the vector uses.
889 return !ST.hasFlatScratchEnabled();
890 case AMDGPU::V_ADD_CO_U32_e32:
891 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
892 !isFIPlusImmOrVGPR(*this, *MI))
893 return false;
894 // We can't deal with the case where the carry out has a use (though this
895 // should never happen)
896 return MI->getOperand(3).isDead();
897 case AMDGPU::V_ADD_CO_U32_e64:
898 // TODO: Should we check use_empty instead?
899 return MI->getOperand(1).isDead();
900 default:
901 break;
902 }
903
905 return false;
906
907 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
908
909 const SIInstrInfo *TII = ST.getInstrInfo();
911 return !TII->isLegalMUBUFImmOffset(FullOffset);
912
913 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
915}
916
918 int FrameIdx,
919 int64_t Offset) const {
920 MachineBasicBlock::iterator Ins = MBB->begin();
921 DebugLoc DL; // Defaults to "unknown"
922
923 if (Ins != MBB->end())
924 DL = Ins->getDebugLoc();
925
926 MachineFunction *MF = MBB->getParent();
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 MachineRegisterInfo &MRI = MF->getRegInfo();
929 unsigned MovOpc =
930 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
931
932 Register BaseReg = MRI.createVirtualRegister(
933 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
934 : &AMDGPU::VGPR_32RegClass);
935
936 if (Offset == 0) {
937 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
938 .addFrameIndex(FrameIdx);
939 return BaseReg;
940 }
941
942 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
943
944 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
945 ? &AMDGPU::SReg_32_XM0RegClass
946 : &AMDGPU::VGPR_32RegClass);
947
948 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
949 .addImm(Offset);
950 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
951 .addFrameIndex(FrameIdx);
952
953 if (ST.hasFlatScratchEnabled()) {
954 // FIXME: Make sure scc isn't live in.
955 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
956 .addReg(OffsetReg, RegState::Kill)
957 .addReg(FIReg)
958 .setOperandDead(3); // scc
959 return BaseReg;
960 }
961
962 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
963 .addReg(OffsetReg, RegState::Kill)
964 .addReg(FIReg)
965 .addImm(0); // clamp bit
966
967 return BaseReg;
968}
969
971 int64_t Offset) const {
// Rewrites the frame-index operand of MI so that it refers to BaseReg, folding
// Offset into the immediate that the instruction already carries.
// NOTE(review): the first line of this signature (function name and the MI /
// BaseReg parameters) is not present in this listing.
972 const SIInstrInfo *TII = ST.getInstrInfo();
973
// VALU adds that take a frame index directly are handled per encoding first;
// everything else is treated as a scratch memory access further down.
974 switch (MI.getOpcode()) {
975 case AMDGPU::V_ADD_U32_e32:
976 case AMDGPU::V_ADD_CO_U32_e32: {
// Assume the frame index is operand 2 and the other source operand 1, and
// swap the two pointers when that guess is wrong.
977 MachineOperand *FIOp = &MI.getOperand(2);
978 MachineOperand *ImmOp = &MI.getOperand(1);
979 if (!FIOp->isFI())
980 std::swap(FIOp, ImmOp);
981
// The other source is not an immediate, so there is nothing to fold Offset
// into: it must be zero. Substitute the register and re-legalize operands.
982 if (!ImmOp->isImm()) {
983 assert(Offset == 0);
984 FIOp->ChangeToRegister(BaseReg, false);
985 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
986 return;
987 }
988
// When the combined constant is zero the add degenerates into a plain COPY of
// BaseReg: drop every operand after index 1 and rewrite operand 1.
989 int64_t TotalOffset = ImmOp->getImm() + Offset;
990 if (TotalOffset == 0) {
991 MI.setDesc(TII->get(AMDGPU::COPY));
992 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
993 MI.removeOperand(I);
994
995 MI.getOperand(1).ChangeToRegister(BaseReg, false);
996 return;
997 }
998
999 ImmOp->setImm(TotalOffset);
1000
1001 MachineBasicBlock *MBB = MI.getParent();
1002 MachineFunction *MF = MBB->getParent();
1003 MachineRegisterInfo &MRI = MF->getRegInfo();
1004
1005 // FIXME: materializeFrameBaseRegister does not know the register class of
1006 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
1007 // Emit a copy so we have a legal operand and hope the register coalescer
1008 // can clean it up.
// NOTE(review): operand 2 is rewritten by index here rather than through FIOp;
// this presumably relies on the immediate always being operand 1 in the e32
// form -- confirm.
1009 if (isSGPRReg(MRI, BaseReg)) {
1010 Register BaseRegVGPR =
1011 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1013 .addReg(BaseReg);
1014 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1015 } else {
1016 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1017 }
1018 return;
1019 }
1020 case AMDGPU::V_ADD_U32_e64:
1021 case AMDGPU::V_ADD_CO_U32_e64: {
// The first source operand follows the explicit defs; the frame index may be
// in either source slot, so swap the pointers when needed.
1022 int Src0Idx = MI.getNumExplicitDefs();
1023 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1024 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1025 if (!FIOp->isFI())
1026 std::swap(FIOp, ImmOp);
1027
// NOTE(review): unlike the e32 path above, Offset is not asserted to be zero
// on this early exit, so a non-zero Offset would be dropped -- confirm that
// callers never reach here with one.
1028 if (!ImmOp->isImm()) {
1029 FIOp->ChangeToRegister(BaseReg, false);
1030 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1031 return;
1032 }
1033
// Zero combined constant: turn the add into a COPY of BaseReg. Otherwise
// substitute the register and store the folded constant.
1034 int64_t TotalOffset = ImmOp->getImm() + Offset;
1035 if (TotalOffset == 0) {
1036 MI.setDesc(TII->get(AMDGPU::COPY));
1037
1038 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1039 MI.removeOperand(I);
1040
1041 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1042 } else {
1043 FIOp->ChangeToRegister(BaseReg, false);
1044 ImmOp->setImm(TotalOffset);
1045 }
1046
1047 return;
1048 }
1049 default:
1050 break;
1051 }
1052
// Remaining instructions are asserted below to be MUBUF or flat-scratch
// accesses.
1053 bool IsFlat = TII->isFLATScratch(MI);
1054
// Debug-only sanity check: at most one operand of MI may be a frame index.
1055#ifndef NDEBUG
1056 // FIXME: Is it possible to be storing a frame index to itself?
1057 bool SeenFI = false;
1058 for (const MachineOperand &MO: MI.operands()) {
1059 if (MO.isFI()) {
1060 if (SeenFI)
1061 llvm_unreachable("should not see multiple frame indices");
1062
1063 SeenFI = true;
1064 }
1065 }
1066#endif
1067
// The frame index is looked up in saddr for flat scratch and in vaddr
// otherwise.
1068 MachineOperand *FIOp =
1069 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1070 : AMDGPU::OpName::vaddr);
1071
// The instruction's existing immediate offset is combined with Offset.
1072 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1073 int64_t NewOffset = OffsetOp->getImm() + Offset;
1074
1075 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1076 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1077
// Flat scratch: the combined offset is asserted to be legal, then the address
// register and the immediate are both updated.
// NOTE(review): one line of the assert below is missing from this listing.
1078 if (IsFlat) {
1079 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1081 "offset should be legal");
1082 FIOp->ChangeToRegister(BaseReg, false);
1083 OffsetOp->setImm(NewOffset);
1084 return;
1085 }
1086
// MUBUF: soffset is expected to be the immediate 0 (checked in debug builds
// only) and the combined offset must be a legal MUBUF immediate.
1087#ifndef NDEBUG
1088 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1089 assert(SOffset->isImm() && SOffset->getImm() == 0);
1090#endif
1091
1092 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1093
1094 FIOp->ChangeToRegister(BaseReg, false);
1095 OffsetOp->setImm(NewOffset);
1096}
1097
1099 Register BaseReg,
1100 int64_t Offset) const {
// Reports whether Offset can be folded into MI.
// NOTE(review): the first line of this signature and several lines of the
// body (the conditions guarding the early returns and the last argument of
// the final call) are not present in this listing.
1101
// The e32 adds always accept the offset. The e64 adds accept it when the
// subtarget supports VOP3 literals or Offset is an inlinable integer literal.
1102 switch (MI->getOpcode()) {
1103 case AMDGPU::V_ADD_U32_e32:
1104 case AMDGPU::V_ADD_CO_U32_e32:
1105 return true;
1106 case AMDGPU::V_ADD_U32_e64:
1107 case AMDGPU::V_ADD_CO_U32_e64:
1108 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1109 default:
1110 break;
1111 }
1112
// NOTE(review): the condition for this early return is missing from the
// listing.
1114 return false;
1115
// Combine Offset with the offset already encoded in the instruction.
1116 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1117
// The combined offset is validated with the MUBUF or the flat (private
// address space) legality query.
// NOTE(review): the condition selecting between the two returns is missing
// from the listing.
1118 const SIInstrInfo *TII = ST.getInstrInfo();
1120 return TII->isLegalMUBUFImmOffset(NewOffset);
1121
1122 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1124}
1125
// Unconditionally returns the 32-bit VGPR register class.
// NOTE(review): the line carrying the function name and parameters is not
// present in this listing.
1126const TargetRegisterClass *
1128 // This is inaccurate. It depends on the instruction and address space. The
1129 // only place where we should hit this is for dealing with frame indexes /
1130 // private accesses, so this is correct in that case.
1131 return &AMDGPU::VGPR_32RegClass;
1132}
1133
// Maps the SCC register class to SReg_32; any other class RC is returned
// unchanged.
// NOTE(review): the line carrying the function name and parameters is not
// present in this listing.
1134const TargetRegisterClass *
1136 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1137}
1138
1140 const SIInstrInfo *TII) {
// Returns a count derived from the spill pseudo opcode of MI:
//  - block spills: the number of set bits in the instruction's mask operand;
//  - SI_SPILL_<class><width>_* pseudos: the bit width in the opcode name
//    divided by 32 (presumably the number of 32-bit sub-registers -- confirm);
//  - the 16-bit VGPR spill pseudos are counted as 1.
// Any other opcode hits llvm_unreachable.
// NOTE(review): the first line of this signature is not present in this
// listing.
1141
1142 unsigned Op = MI.getOpcode();
1143 switch (Op) {
1144 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1145 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
1146 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1147 // FIXME: This assumes the mask is statically known and not computed at
1148 // runtime. However, some ABIs may want to compute the mask dynamically and
1149 // this will need to be updated.
1150 return llvm::popcount(
1151 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1152 case AMDGPU::SI_SPILL_S1024_SAVE:
1153 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
1154 case AMDGPU::SI_SPILL_S1024_RESTORE:
1155 case AMDGPU::SI_SPILL_V1024_SAVE:
1156 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
1157 case AMDGPU::SI_SPILL_V1024_RESTORE:
1158 case AMDGPU::SI_SPILL_A1024_SAVE:
1159 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
1160 case AMDGPU::SI_SPILL_A1024_RESTORE:
1161 case AMDGPU::SI_SPILL_AV1024_SAVE:
1162 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
1163 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1164 return 32;
1165 case AMDGPU::SI_SPILL_S512_SAVE:
1166 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
1167 case AMDGPU::SI_SPILL_S512_RESTORE:
1168 case AMDGPU::SI_SPILL_V512_SAVE:
1169 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
1170 case AMDGPU::SI_SPILL_V512_RESTORE:
1171 case AMDGPU::SI_SPILL_A512_SAVE:
1172 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
1173 case AMDGPU::SI_SPILL_A512_RESTORE:
1174 case AMDGPU::SI_SPILL_AV512_SAVE:
1175 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
1176 case AMDGPU::SI_SPILL_AV512_RESTORE:
1177 return 16;
// The 288- through 384-bit groups below list only SAVE/RESTORE variants; no
// CFI_SAVE opcode appears for these widths.
1178 case AMDGPU::SI_SPILL_S384_SAVE:
1179 case AMDGPU::SI_SPILL_S384_RESTORE:
1180 case AMDGPU::SI_SPILL_V384_SAVE:
1181 case AMDGPU::SI_SPILL_V384_RESTORE:
1182 case AMDGPU::SI_SPILL_A384_SAVE:
1183 case AMDGPU::SI_SPILL_A384_RESTORE:
1184 case AMDGPU::SI_SPILL_AV384_SAVE:
1185 case AMDGPU::SI_SPILL_AV384_RESTORE:
1186 return 12;
1187 case AMDGPU::SI_SPILL_S352_SAVE:
1188 case AMDGPU::SI_SPILL_S352_RESTORE:
1189 case AMDGPU::SI_SPILL_V352_SAVE:
1190 case AMDGPU::SI_SPILL_V352_RESTORE:
1191 case AMDGPU::SI_SPILL_A352_SAVE:
1192 case AMDGPU::SI_SPILL_A352_RESTORE:
1193 case AMDGPU::SI_SPILL_AV352_SAVE:
1194 case AMDGPU::SI_SPILL_AV352_RESTORE:
1195 return 11;
1196 case AMDGPU::SI_SPILL_S320_SAVE:
1197 case AMDGPU::SI_SPILL_S320_RESTORE:
1198 case AMDGPU::SI_SPILL_V320_SAVE:
1199 case AMDGPU::SI_SPILL_V320_RESTORE:
1200 case AMDGPU::SI_SPILL_A320_SAVE:
1201 case AMDGPU::SI_SPILL_A320_RESTORE:
1202 case AMDGPU::SI_SPILL_AV320_SAVE:
1203 case AMDGPU::SI_SPILL_AV320_RESTORE:
1204 return 10;
1205 case AMDGPU::SI_SPILL_S288_SAVE:
1206 case AMDGPU::SI_SPILL_S288_RESTORE:
1207 case AMDGPU::SI_SPILL_V288_SAVE:
1208 case AMDGPU::SI_SPILL_V288_RESTORE:
1209 case AMDGPU::SI_SPILL_A288_SAVE:
1210 case AMDGPU::SI_SPILL_A288_RESTORE:
1211 case AMDGPU::SI_SPILL_AV288_SAVE:
1212 case AMDGPU::SI_SPILL_AV288_RESTORE:
1213 return 9;
1214 case AMDGPU::SI_SPILL_S256_SAVE:
1215 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
1216 case AMDGPU::SI_SPILL_S256_RESTORE:
1217 case AMDGPU::SI_SPILL_V256_SAVE:
1218 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
1219 case AMDGPU::SI_SPILL_V256_RESTORE:
1220 case AMDGPU::SI_SPILL_A256_SAVE:
1221 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
1222 case AMDGPU::SI_SPILL_A256_RESTORE:
1223 case AMDGPU::SI_SPILL_AV256_SAVE:
1224 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
1225 case AMDGPU::SI_SPILL_AV256_RESTORE:
1226 return 8;
1227 case AMDGPU::SI_SPILL_S224_SAVE:
1228 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
1229 case AMDGPU::SI_SPILL_S224_RESTORE:
1230 case AMDGPU::SI_SPILL_V224_SAVE:
1231 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
1232 case AMDGPU::SI_SPILL_V224_RESTORE:
1233 case AMDGPU::SI_SPILL_A224_SAVE:
1234 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
1235 case AMDGPU::SI_SPILL_A224_RESTORE:
1236 case AMDGPU::SI_SPILL_AV224_SAVE:
1237 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
1238 case AMDGPU::SI_SPILL_AV224_RESTORE:
1239 return 7;
1240 case AMDGPU::SI_SPILL_S192_SAVE:
1241 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
1242 case AMDGPU::SI_SPILL_S192_RESTORE:
1243 case AMDGPU::SI_SPILL_V192_SAVE:
1244 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
1245 case AMDGPU::SI_SPILL_V192_RESTORE:
1246 case AMDGPU::SI_SPILL_A192_SAVE:
1247 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
1248 case AMDGPU::SI_SPILL_A192_RESTORE:
1249 case AMDGPU::SI_SPILL_AV192_SAVE:
1250 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
1251 case AMDGPU::SI_SPILL_AV192_RESTORE:
1252 return 6;
1253 case AMDGPU::SI_SPILL_S160_SAVE:
1254 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
1255 case AMDGPU::SI_SPILL_S160_RESTORE:
1256 case AMDGPU::SI_SPILL_V160_SAVE:
1257 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
1258 case AMDGPU::SI_SPILL_V160_RESTORE:
1259 case AMDGPU::SI_SPILL_A160_SAVE:
1260 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
1261 case AMDGPU::SI_SPILL_A160_RESTORE:
1262 case AMDGPU::SI_SPILL_AV160_SAVE:
1263 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
1264 case AMDGPU::SI_SPILL_AV160_RESTORE:
1265 return 5;
1266 case AMDGPU::SI_SPILL_S128_SAVE:
1267 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
1268 case AMDGPU::SI_SPILL_S128_RESTORE:
1269 case AMDGPU::SI_SPILL_V128_SAVE:
1270 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
1271 case AMDGPU::SI_SPILL_V128_RESTORE:
1272 case AMDGPU::SI_SPILL_A128_SAVE:
1273 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
1274 case AMDGPU::SI_SPILL_A128_RESTORE:
1275 case AMDGPU::SI_SPILL_AV128_SAVE:
1276 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
1277 case AMDGPU::SI_SPILL_AV128_RESTORE:
1278 return 4;
1279 case AMDGPU::SI_SPILL_S96_SAVE:
1280 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
1281 case AMDGPU::SI_SPILL_S96_RESTORE:
1282 case AMDGPU::SI_SPILL_V96_SAVE:
1283 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
1284 case AMDGPU::SI_SPILL_V96_RESTORE:
1285 case AMDGPU::SI_SPILL_A96_SAVE:
1286 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
1287 case AMDGPU::SI_SPILL_A96_RESTORE:
1288 case AMDGPU::SI_SPILL_AV96_SAVE:
1289 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
1290 case AMDGPU::SI_SPILL_AV96_RESTORE:
1291 return 3;
1292 case AMDGPU::SI_SPILL_S64_SAVE:
1293 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
1294 case AMDGPU::SI_SPILL_S64_RESTORE:
1295 case AMDGPU::SI_SPILL_V64_SAVE:
1296 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
1297 case AMDGPU::SI_SPILL_V64_RESTORE:
1298 case AMDGPU::SI_SPILL_A64_SAVE:
1299 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
1300 case AMDGPU::SI_SPILL_A64_RESTORE:
1301 case AMDGPU::SI_SPILL_AV64_SAVE:
1302 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
1303 case AMDGPU::SI_SPILL_AV64_RESTORE:
1304 return 2;
// Everything that occupies a single 32-bit register, including the WWM and
// 16-bit VGPR spill pseudos.
1305 case AMDGPU::SI_SPILL_S32_SAVE:
1306 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
1307 case AMDGPU::SI_SPILL_S32_RESTORE:
1308 case AMDGPU::SI_SPILL_V32_SAVE:
1309 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
1310 case AMDGPU::SI_SPILL_V32_RESTORE:
1311 case AMDGPU::SI_SPILL_A32_SAVE:
1312 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
1313 case AMDGPU::SI_SPILL_A32_RESTORE:
1314 case AMDGPU::SI_SPILL_AV32_SAVE:
1315 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
1316 case AMDGPU::SI_SPILL_AV32_RESTORE:
1317 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1318 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1319 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1320 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1321 case AMDGPU::SI_SPILL_V16_SAVE:
1322 case AMDGPU::SI_SPILL_V16_RESTORE:
1323 return 1;
1324 default: llvm_unreachable("Invalid spill opcode");
1325 }
1326}
1327
1328static int getOffsetMUBUFStore(unsigned Opc) {
1329 switch (Opc) {
1330 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1331 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1332 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1333 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1334 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1335 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1336 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1337 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1338 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1339 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1340 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1341 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1342 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1343 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1344 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1345 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1346 default:
1347 return -1;
1348 }
1349}
1350
1351static int getOffsetMUBUFLoad(unsigned Opc) {
1352 switch (Opc) {
1353 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1354 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1355 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1356 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1357 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1358 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1359 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1360 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1361 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1362 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1363 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1364 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1365 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1366 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1367 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1368 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1369 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1370 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1371 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1372 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1373 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1374 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1375 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1376 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1377 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1378 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1379 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1380 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1381 default:
1382 return -1;
1383 }
1384}
1385
1386static int getOffenMUBUFStore(unsigned Opc) {
1387 switch (Opc) {
1388 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1389 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1390 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1391 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1392 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1393 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1394 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1395 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1396 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1397 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1398 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1399 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1400 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1401 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1402 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1403 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1404 default:
1405 return -1;
1406 }
1407}
1408
1409static int getOffenMUBUFLoad(unsigned Opc) {
1410 switch (Opc) {
1411 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1412 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1413 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1414 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1415 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1416 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1417 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1418 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1419 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1420 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1421 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1422 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1423 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1424 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1425 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1426 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1427 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1428 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1429 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1430 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1431 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1432 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1433 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1434 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1435 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1436 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1437 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1438 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1439 default:
1440 return -1;
1441 }
1442}
1443
1446 MachineBasicBlock::iterator MI, int Index, unsigned Lane,
1447 unsigned ValueReg, bool IsKill, bool NeedsCFI) {
// Tries to satisfy the spill or reload at MI with a register-to-register move
// instead of a memory access. Returns the builder of the emitted move, or an
// empty MachineInstrBuilder when no register is recorded for (Index, Lane).
// NOTE(review): the first lines of this signature and the declaration of MFI
// are not present in this listing.
1448 MachineFunction *MF = MBB.getParent();
1450 const SIInstrInfo *TII = ST.getInstrInfo();
1451 const SIFrameLowering *TFL = ST.getFrameLowering();
1452
// Look up the register recorded for this frame index / lane pair.
1453 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1454
1455 if (Reg == AMDGPU::NoRegister)
1456 return MachineInstrBuilder();
1457
1458 bool IsStore = MI->mayStore();
1459 MachineRegisterInfo &MRI = MF->getRegInfo();
1460 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1461
// A store moves ValueReg into the recorded register; a reload moves the
// recorded register back into ValueReg.
1462 unsigned Dst = IsStore ? Reg : ValueReg;
1463 unsigned Src = IsStore ? ValueReg : Reg;
1464 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1465 const DebugLoc &DL = MI->getDebugLoc();
1466 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1467 // Spiller during regalloc may restore a spilled register to its superclass.
1468 // It could result in AGPR spills restored to VGPRs or the other way around,
1469 // making the src and dst with identical regclasses at this point. It just
1470 // needs a copy in such cases.
1471 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1472 .addReg(Src, getKillRegState(IsKill));
1474 if (NeedsCFI)
1475 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1476 return CopyMIB;
1477 }
// Exactly one of the two registers is a VGPR here. The XOR selects the
// accvgpr write when Dst is the non-VGPR register and the accvgpr read when
// Dst is the VGPR.
1478 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1479 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1480
1481 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1482 .addReg(Src, getKillRegState(IsKill));
1484 if (NeedsCFI)
1485 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1486 return MIB;
1487}
1488
1489// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1490// need to handle the case where an SGPR may need to be spilled while spilling.
// Returns false when no replacement opcode exists for MI's opcode; returns
// true after the access has been lowered, either to a register move through
// spillVGPRtoAGPR or to a newly built memory instruction using the immediate
// Offset.
// NOTE(review): the lines carrying the function name, two of its parameters
// and the expression that selects LoadStoreOp are not present in this
// listing.
1492 MachineFrameInfo &MFI,
1494 int Index,
1495 int64_t Offset) {
1496 const SIInstrInfo *TII = ST.getInstrInfo();
1497 MachineBasicBlock *MBB = MI->getParent();
1498 const DebugLoc &DL = MI->getDebugLoc();
1499 bool IsStore = MI->mayStore();
1500
1501 unsigned Opc = MI->getOpcode();
1502 int LoadStoreOp = IsStore ?
1504 if (LoadStoreOp == -1)
1505 return false;
1506
// Prefer a register-to-register move for lane 0 of this frame index when one
// is available; in that case no memory instruction is built.
1507 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1508 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false, false)
1509 .getInstr())
1510 return true;
1511
// Build the replacement in front of MI: reuse its vdata, srsrc and soffset
// operands, supply Offset as the immediate, and carry over the memory
// operands.
1512 MachineInstrBuilder NewMI =
1513 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1514 .add(*Reg)
1515 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1516 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1517 .addImm(Offset)
1518 .addImm(0) // cpol
1519 .addImm(0) // swz
1520 .cloneMemRefs(*MI);
1521
// Forward the vdata_in operand as well when the original instruction has one.
1522 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1523 AMDGPU::OpName::vdata_in);
1524 if (VDataIn)
1525 NewMI.add(*VDataIn);
1526 return true;
1527}
1528
1530 unsigned LoadStoreOp,
1531 unsigned EltSize) {
// Selects the scratch load/store opcode for an element of EltSize (4, 8, 12 or
// 16), keeping the load/store direction and the addressing form (vaddr, saddr
// or neither) of the incoming LoadStoreOp. Block load/store opcodes are
// returned unchanged.
// NOTE(review): the first line of this signature is not present in this
// listing.
1532 bool IsStore = TII->get(LoadStoreOp).mayStore();
1533 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
// UseST is set when the opcode has neither a vaddr nor an saddr operand.
1534 bool UseST =
1535 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1536
1537 // Handle block load/store first.
1538 if (TII->isBlockLoadStore(LoadStoreOp))
1539 return LoadStoreOp;
1540
// Start from the SADDR form for the requested size; it is converted to the
// SV or ST form below when the incoming opcode calls for it.
1541 switch (EltSize) {
1542 case 4:
1543 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1544 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1545 break;
1546 case 8:
1547 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1548 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1549 break;
1550 case 12:
1551 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1552 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1553 break;
1554 case 16:
1555 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1556 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1557 break;
1558 default:
1559 llvm_unreachable("Unexpected spill load/store size!");
1560 }
1561
1562 if (HasVAddr)
1563 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1564 else if (UseST)
1565 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1566
1567 return LoadStoreOp;
1568}
1569
1572 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1573 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1574 RegScavenger *RS, LiveRegUnits *LiveUnits, bool NeedsCFI) const {
1575 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1576
1577 MachineFunction *MF = MBB.getParent();
1578 const SIInstrInfo *TII = ST.getInstrInfo();
1579 const MachineFrameInfo &MFI = MF->getFrameInfo();
1580 const SIFrameLowering *TFL = ST.getFrameLowering();
1581 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1582
1583 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1584 bool IsStore = Desc->mayStore();
1585 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1586 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1587
1588 bool CanClobberSCC = false;
1589 bool Scavenged = false;
1590 MCRegister SOffset = ScratchOffsetReg;
1591
1592 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1593 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1594 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1595 unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1596
1597 // On targets with register tuple alignment requirements,
1598 // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
1599 // and spill the rest as a regular aligned tuple.
1600 // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1601 // will be spilt as:
1602 // SPILL_SCRATCH_DWORD $vgpr1
1603 // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
1604 // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
1605 bool IsRegMisaligned = false;
1606 if (!IsBlock && !IsAGPR && RegWidth > 4) {
1607 unsigned SpillOpcode =
1608 getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
1609 int VDataIdx =
1610 IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
1611 : 0; // Restore Ops have data reg as the first (output) operand.
1612 const TargetRegisterClass *ExpectedRC =
1613 TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
1614 if (!ExpectedRC->contains(ValueReg)) {
1615 unsigned NumRegs = std::min(AMDGPU::getRegBitWidth(*ExpectedRC) / 4, 4u);
1616 unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
1617 const TargetRegisterClass *MatchRC =
1618 getMatchingSuperRegClass(RC, ExpectedRC, SubIdx);
1619 if (!MatchRC || !MatchRC->contains(ValueReg))
1620 IsRegMisaligned = true;
1621 }
1622 }
1623 // The first sub-register will be spilled as a 32-bit value
1624 if (IsRegMisaligned)
1625 RegWidth -= 4u;
1626 // Always use 4 byte operations for AGPRs because we need to scavenge
1627 // a temporary VGPR.
1628 // If we're using a block operation, the element should be the whole block.
1629 unsigned EltSize = IsBlock ? RegWidth
1630 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1631 : 4u;
1632 unsigned NumSubRegs = RegWidth / EltSize;
1633 unsigned Size = NumSubRegs * EltSize;
1634 unsigned RemSize = RegWidth - Size;
1635 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1636 // An additional sub-register is needed to spill the misaligned component.
1637 if (IsRegMisaligned)
1638 NumSubRegs += 1;
1639 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1640 int64_t MaterializedOffset = Offset;
1641
1642 // Maxoffset is the starting offset for the last chunk to be spilled.
1643 // In case of non-zero remainder element, max offset will be the
1644 // last address(offset + Size) after spilling all the EltSize chunks.
1645 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1646 int64_t ScratchOffsetRegDelta = 0;
1647 int64_t AdditionalCFIOffset = 0;
1648
1649 if (IsFlat && EltSize > 4) {
1650 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1651 Desc = &TII->get(LoadStoreOp);
1652 }
1653
1654 Align Alignment = MFI.getObjectAlign(Index);
1655 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1656
1657 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1658 "unexpected VGPR spill offset");
1659
1660 // Track a VGPR to use for a constant offset we need to materialize.
1661 Register TmpOffsetVGPR;
1662
1663 // Track a VGPR to use as an intermediate value.
1664 Register TmpIntermediateVGPR;
1665 bool UseVGPROffset = false;
1666
1667 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1668 // combination.
1669 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1670 int64_t VOffset) {
1671 // We are using a VGPR offset
1672 if (IsFlat && SGPRBase) {
1673 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1674 // SGPR, so perform the add as vector.
1675 // We don't need a base SGPR in the kernel.
1676
1677 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1678 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1679 .addReg(SGPRBase)
1680 .addImm(VOffset)
1681 .addImm(0); // clamp
1682 } else {
1683 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1684 .addReg(SGPRBase);
1685 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1686 .addImm(VOffset)
1687 .addReg(TmpOffsetVGPR);
1688 }
1689 } else {
1690 assert(TmpOffsetVGPR);
1691 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1692 .addImm(VOffset);
1693 }
1694 };
1695
1696 bool IsOffsetLegal =
1697 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1699 : TII->isLegalMUBUFImmOffset(MaxOffset);
1700 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1701 SOffset = MCRegister();
1702
1703 // We don't have access to the register scavenger if this function is called
1704 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1705 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1706 // entry.
1707 if (RS) {
1708 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1709
1710 // Piggy back on the liveness scan we just did to see if SCC is dead.
1711 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1712 } else if (LiveUnits) {
1713 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1714 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1715 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1716 SOffset = Reg;
1717 break;
1718 }
1719 }
1720 }
1721
1722 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1723 SOffset = Register();
1724
1725 if (!SOffset) {
1726 UseVGPROffset = true;
1727
1728 if (RS) {
1729 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1730 } else {
1731 assert(LiveUnits);
1732 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1733 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1734 TmpOffsetVGPR = Reg;
1735 break;
1736 }
1737 }
1738 }
1739
1740 assert(TmpOffsetVGPR);
1741 } else if (!SOffset && CanClobberSCC) {
1742 // There are no free SGPRs, and since we are in the process of spilling
1743 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1744 // on SI/CI and on VI it is true until we implement spilling using scalar
1745 // stores), we have no way to free up an SGPR. Our solution here is to
1746 // add the offset directly to the ScratchOffset or StackPtrOffset
1747 // register, and then subtract the offset after the spill to return the
1748 // register to its original value.
1749
1750 // TODO: If we don't have to do an emergency stack slot spill, converting
1751 // to use the VGPR offset is fewer instructions.
1752 if (!ScratchOffsetReg)
1753 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1754 SOffset = ScratchOffsetReg;
1755 ScratchOffsetRegDelta = Offset;
1756 } else {
1757 Scavenged = true;
1758 }
1759
1760 AdditionalCFIOffset = Offset;
1761 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1762 // we can simplify the adjustment of Offset here to just scale with
1763 // WavefrontSize.
1764 if (!IsFlat && !UseVGPROffset)
1765 Offset *= ST.getWavefrontSize();
1766
1767 if (!UseVGPROffset && !SOffset)
1768 report_fatal_error("could not scavenge SGPR to spill in entry function");
1769
1770 if (UseVGPROffset) {
1771 // We are using a VGPR offset
1772 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1773 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1774 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1775 } else {
1776 assert(Offset != 0);
1777 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1778 .addReg(ScratchOffsetReg)
1779 .addImm(Offset);
1780 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1781 }
1782
1783 Offset = 0;
1784 }
1785
1786 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1787 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1788 && "Unexpected vaddr for flat scratch with a FI operand");
1789
1790 if (UseVGPROffset) {
1791 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1792 } else {
1793 assert(ST.hasFlatScratchSTMode());
1794 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1795 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1796 }
1797
1798 Desc = &TII->get(LoadStoreOp);
1799 }
1800
1801 // Save a copy of the original element size before it's potentially changed for
1802 // misaligned tuples.
1803 unsigned OrigEltSize = EltSize;
1804 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1805 ++i, RegOffset += EltSize) {
1806 if (IsRegMisaligned) {
1807 if (i == 0) {
1808 // For misaligned register tuples, spill only the first sub-reg in the
1809 // first iteration.
1810 EltSize = 4u;
1811 } else {
1812 // The misaligned register was spilt. Now the rest of the tuple is
1813 // properly aligned.
1814 IsRegMisaligned = false;
1815 EltSize = OrigEltSize;
1816 }
1817 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1818 }
1819 if (i == NumSubRegs) {
1820 EltSize = RemSize;
1821 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1822 }
1823 Desc = &TII->get(LoadStoreOp);
1824
1825 if (!IsFlat && UseVGPROffset) {
1826 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1827 : getOffenMUBUFLoad(LoadStoreOp);
1828 Desc = &TII->get(NewLoadStoreOp);
1829 }
1830
1831 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1832 // If we are spilling an AGPR beyond the range of the memory instruction
1833 // offset and need to use a VGPR offset, we ideally have at least 2
1834 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1835 // recycle the VGPR used for the offset which requires resetting after
1836 // each subregister.
1837
1838 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1839 }
1840
1841 unsigned NumRegs = EltSize / 4;
1842 Register SubReg = e == 1
1843 ? ValueReg
1844 : Register(getSubReg(ValueReg,
1845 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1846
1847 RegState SOffsetRegState = {};
1848 RegState SrcDstRegState = getDefRegState(!IsStore);
1849 const bool IsLastSubReg = i + 1 == e;
1850 const bool IsFirstSubReg = i == 0;
1851 if (IsLastSubReg) {
1852 SOffsetRegState |= getKillRegState(Scavenged);
1853 // The last implicit use carries the "Kill" flag.
1854 SrcDstRegState |= getKillRegState(IsKill);
1855 }
1856
1857 // Make sure the whole register is defined if there are undef components by
1858 // adding an implicit def of the super-reg on the first instruction.
1859 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1860 bool NeedSuperRegImpOperand = e > 1;
1861
1862 // Remaining element size to spill into memory after some parts of it
1863 // spilled into either AGPRs or VGPRs.
1864 unsigned RemEltSize = EltSize;
1865
1866 // AGPRs to spill VGPRs and vice versa are allocated in reverse order,
1867 // starting from the last lane. If a register cannot be completely
1868 // spilled into another register, this ensures its alignment does not
1869 // change. For targets with a VGPR alignment requirement this is important
1870 // when flat scratch is used, as we might otherwise get a scratch_load or
1871 // scratch_store of an unaligned register.
1872 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1873 LaneE = RegOffset / 4;
1874 Lane >= LaneE; --Lane) {
1875 bool IsSubReg = e > 1 || EltSize > 4;
1876 Register Sub = IsSubReg
1877 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1878 : ValueReg;
1879 auto MIB =
1880 spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill, NeedsCFI);
1881 if (!MIB.getInstr())
1882 break;
1883 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1884 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1885 NeedSuperRegDef = false;
1886 }
1887 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1888 NeedSuperRegImpOperand = true;
1889 RegState State = SrcDstRegState;
1890 if (!IsLastSubReg || (Lane != LaneE))
1891 State &= ~RegState::Kill;
1892 if (!IsFirstSubReg || (Lane != LaneS))
1893 State &= ~RegState::Define;
1894 MIB.addReg(ValueReg, RegState::Implicit | State);
1895 }
1896 RemEltSize -= 4;
1897 }
1898
1899 if (!RemEltSize) // Fully spilled into AGPRs.
1900 continue;
1901
1902 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1903 assert(IsFlat && EltSize > 4);
1904
1905 unsigned NumRegs = RemEltSize / 4;
1906 SubReg = Register(getSubReg(ValueReg,
1907 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1908 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1909 Desc = &TII->get(Opc);
1910 }
1911
1912 unsigned FinalReg = SubReg;
1913
1914 if (IsAGPR) {
1915 assert(EltSize == 4);
1916
1917 if (!TmpIntermediateVGPR) {
1918 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1919 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1920 }
1921 if (IsStore) {
1922 auto AccRead = BuildMI(MBB, MI, DL,
1923 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1924 TmpIntermediateVGPR)
1925 .addReg(SubReg, getKillRegState(IsKill));
1926 if (NeedSuperRegDef)
1927 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1928 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1929 AccRead.addReg(ValueReg, RegState::Implicit);
1931 }
1932 SubReg = TmpIntermediateVGPR;
1933 } else if (UseVGPROffset) {
1934 if (!TmpOffsetVGPR) {
1935 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1936 MI, false, 0);
1937 RS->setRegUsed(TmpOffsetVGPR);
1938 }
1939 }
1940
1941 Register FinalValueReg = ValueReg;
1942 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR ||
1943 LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_ST) {
1944 // If we are loading a 16-bit value with SRAMECC enabled we need a temp
1945 // 32-bit VGPR to load into and then extract 16 bits into the final register.
1946 ValueReg =
1947 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1948 SubReg = ValueReg;
1949 IsKill = false;
1950 }
1951
1952 // Create the MMO, additionally setting the MOThreadPrivate flag, as scratch
1953 // memory used for spills will not be used outside the thread.
1954 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1956 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1957 commonAlignment(Alignment, RegOffset));
1958
1959 auto MIB =
1960 BuildMI(MBB, MI, DL, *Desc)
1961 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1962
1963 if (UseVGPROffset) {
1964 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1965 // intermediate accvgpr_write.
1966 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1967 }
1968
1969 if (!IsFlat)
1970 MIB.addReg(FuncInfo->getScratchRSrcReg());
1971
1972 if (SOffset == AMDGPU::NoRegister) {
1973 if (!IsFlat) {
1974 if (UseVGPROffset && ScratchOffsetReg) {
1975 MIB.addReg(ScratchOffsetReg);
1976 } else {
1977 assert(FuncInfo->isBottomOfStack());
1978 MIB.addImm(0);
1979 }
1980 }
1981 } else {
1982 MIB.addReg(SOffset, SOffsetRegState);
1983 }
1984
1985 MIB.addImm(Offset + RegOffset);
1986
1987 bool LastUse = MMO->getFlags() & MOLastUse;
1988 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1989
1990 if (!IsFlat)
1991 MIB.addImm(0); // swz
1992 MIB.addMemOperand(NewMMO);
1993
1994 if (FinalValueReg != ValueReg) {
1995 // Extract 16-bit from the loaded 32-bit value.
1996 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1997 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1998 .addReg(FinalValueReg, getDefRegState(true))
1999 .addImm(0)
2000 .addReg(ValueReg, getKillRegState(true))
2001 .addImm(0);
2002 ValueReg = FinalValueReg;
2003 }
2004
2005 if (IsStore && NeedsCFI) {
2006 if (TII->isBlockLoadStore(LoadStoreOp)) {
2007 assert(RegOffset == 0 &&
2008 "expected whole register block to be treated as single element");
2010 } else {
2012 MBB, MI, DebugLoc(), SubReg,
2013 (Offset + RegOffset) * ST.getWavefrontSize() + AdditionalCFIOffset);
2014 }
2015 }
2016
2017 if (!IsAGPR && NeedSuperRegDef)
2018 MIB.addReg(ValueReg, RegState::ImplicitDefine);
2019
2020 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
2021 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
2022 FinalReg)
2023 .addReg(TmpIntermediateVGPR, RegState::Kill);
2025 }
2026
2027 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
2028 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
2029 if (NeedSuperRegImpOperand &&
2030 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
2031 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
2032 if (PartialReloadCopy)
2033 MIB.addReg(ValueReg, RegState::Implicit);
2034 }
2035
2036 // The epilog restore of a wwm-scratch register can cause undesired
2037 // optimization during machine-cp post PrologEpilogInserter if the same
2038 // register was assigned for return value ABI lowering with a COPY
2039 // instruction. As given below, with the epilog reload, the earlier COPY
2040 // appeared to be dead during machine-cp.
2041 // ...
2042 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
2043 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
2044 // ...
2045 // Epilog block:
2046 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
2047 // ...
2048 // WWM spill restore to preserve the inactive lanes of v0.
2049 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
2050 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
2051 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
2052 // ...
2053 // SI_RETURN implicit $vgpr0
2054 // ...
2055 // To fix it, mark the same reg as a tied op for such restore instructions
2056 // so that it marks a usage for the preceding COPY.
2057 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
2058 MI->readsRegister(SubReg, this)) {
2059 MIB.addReg(SubReg, RegState::Implicit);
2060 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2061 }
2062
2063 // If we're building a block load, we should add artificial uses for the
2064 // CSR VGPRs that are *not* being transferred. This is because liveness
2065 // analysis is not aware of the mask, so we need to somehow inform it that
2066 // those registers are not available before the load and they should not be
2067 // scavenged.
2068 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
2069 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
2070 }
2071
2072 if (ScratchOffsetRegDelta != 0) {
2073 // Subtract the offset we added to the ScratchOffset register.
2074 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
2075 .addReg(SOffset)
2076 .addImm(-ScratchOffsetRegDelta);
2077 }
2078}
2079
2081 Register BlockReg) const {
// Adds an implicit use to MIB for each callee-saved VGPR at offsets 1..31
// from BlockReg's sub0 whose bit is clear in the mask returned by
// getMaskForVGPRBlockOps(BlockReg).
2082 const MachineFunction *MF = MIB->getMF();
2083 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2084 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2085 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// The scan starts at offset 1, so sub0 itself never gets an implicit use here.
2086 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
2087 if (!(Mask & (1 << RegOffset)) &&
2088 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
2089 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
2090}
2091
2094 Register BlockReg,
2095 int64_t Offset) const {
// Emits CFI for the 32 consecutive VGPRs starting at BlockReg's sub0.
// A register whose bit is set in the block-op mask gets a VGPR-to-VMEM spill
// entry at (Offset + RegOffset) * wavefront size. Any other callee-saved
// register in the range gets a same-value entry.
2096 const MachineFunction *MF = MBB.getParent();
2097 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2098 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2099 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
2100 for (unsigned RegOffset = 0; RegOffset < 32; ++RegOffset) {
2101 Register VGPR = BaseVGPR + RegOffset;
2102 if (Mask & (1 << RegOffset)) {
// Registers selected by the mask are asserted to be callee-saved.
2103 assert(isCalleeSavedPhysReg(VGPR, *MF));
2104 ST.getFrameLowering()->buildCFIForVGPRToVMEMSpill(
2105 MBB, MBBI, DebugLoc(), VGPR,
2106 (Offset + RegOffset) * ST.getWavefrontSize());
2107 } else if (isCalleeSavedPhysReg(VGPR, *MF)) {
2108 // FIXME: This is a workaround for the fact that FrameLowering's
2109 // emitPrologueEntryCFI considers the block load to clobber all registers
2110 // in the block.
// BaseVGPR + RegOffset below is the same register as VGPR.
2111 ST.getFrameLowering()->buildCFIForSameValue(MBB, MBBI, DebugLoc(),
2112 BaseVGPR + RegOffset);
2113 }
2114 }
2115}
2116
2118 int Offset, bool IsLoad,
2119 bool IsKill) const {
2120 // Load/store VGPR
// Emits a single dword load (IsLoad) or store of SB.TmpVGPR for frame index
// Index through buildSpillLoadStore. The slot must not be an SGPRSpill stack
// object. Offset is scaled by SB.EltSize before it is passed on.
2121 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
2122 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
2123
// Fixed objects are addressed from the base register when the function has a
// base pointer; everything else uses the frame register.
2124 Register FrameReg =
2125 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
2126 ? getBaseRegister()
2127 : getFrameRegister(SB.MF);
2128
2129 Align Alignment = FrameInfo.getObjectAlign(Index);
2133 SB.EltSize, Alignment);
2134
2135 if (IsLoad) {
// Scratch (flat) opcode when flat scratch is enabled, MUBUF otherwise.
// IsKill is not forwarded for loads; false is passed instead.
2136 unsigned Opc = ST.hasFlatScratchEnabled()
2137 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2138 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2139 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
2140 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2141 } else {
2142 unsigned Opc = ST.hasFlatScratchEnabled()
2143 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2144 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2145 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2146 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2147 // This only ever adds one VGPR spill
2148 SB.MFI.addToSpilledVGPRs(1);
2149 }
2150}
2151
2153 RegScavenger *RS, SlotIndexes *Indexes,
2154 LiveIntervals *LIS, bool OnlyToVGPR,
2155 bool SpillToPhysVGPRLane, bool NeedsCFI) const {
// Lowers the SGPR spill at MI for frame index Index.
//
// If VGPR lanes are assigned to the slot, each 32-bit sub-register is written
// to its lane with SI_SPILL_S32_TO_VGPR. Otherwise the sub-registers are
// packed into SB.TmpVGPR and written out through SB.readWriteTmpVGPR.
//
// Returns false, leaving MI in place, when OnlyToVGPR is set and no lanes are
// assigned. Otherwise MI is erased and true is returned. When Indexes is
// non-null, the slot index maps are updated for the new instructions.
2156 assert(!MI->getOperand(0).isUndef() &&
2157 "undef spill should have been deleted earlier");
2158
2159 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2160
2161 ArrayRef<SpilledReg> VGPRSpills =
2162 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2164 bool SpillToVGPR = !VGPRSpills.empty();
2165 if (OnlyToVGPR && !SpillToVGPR)
2166 return false;
2167
2168 const SIFrameLowering *TFL = ST.getFrameLowering();
2169
// The stack pointer offset and frame offset registers may only be spilled to
// VGPR lanes, never through memory.
2170 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2171 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2172
2173 if (SpillToVGPR) {
2174
2175 // Since stack slot coloring pass is trying to optimize SGPR spills,
2176 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
2177 // spills of different sizes. This accounts for number of VGPR lanes allotted
2178 // equal to the largest SGPR being spilled in them.
2179 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2180 "Num of SGPRs spilled should be less than or equal to num of "
2181 "the VGPR lanes.");
2182
2183 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2184 Register SubReg =
2185 SB.NumSubRegs == 1
2186 ? SB.SuperReg
2187 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2188 SpilledReg Spill = VGPRSpills[i];
2189
2190 bool IsFirstSubreg = i == 0;
2191 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the write of the last sub-register may carry the kill flag.
2192 bool UseKill = SB.IsKill && IsLastSubreg;
2193
2194
2195 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2196 // spill to this specific vgpr in the first basic block.
2197 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2198 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2199 .addReg(SubReg, getKillRegState(UseKill))
2200 .addImm(Spill.Lane)
2201 .addReg(Spill.VGPR);
2202
// For the return address register a single CFI entry for PC_REG is emitted
// after the last sub-register; any other register gets one entry per
// sub-register.
2203 MachineInstr *CFI = nullptr;
2204 if (NeedsCFI) {
2205 if (SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2206 if (i == e - 1)
2207 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(),
2208 AMDGPU::PC_REG, VGPRSpills);
2209 } else {
2210 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(), SubReg,
2211 Spill.VGPR, Spill.Lane);
2212 }
2213 }
2214
// The first lane write takes over MI's slot index; later instructions get
// fresh entries.
2215 if (Indexes) {
2216 if (IsFirstSubreg)
2217 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2218 else
2219 Indexes->insertMachineInstrInMaps(*MIB);
2220
2221 if (CFI)
2222 Indexes->insertMachineInstrInMaps(*CFI);
2223 }
2224
2225 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2226 // We may be spilling a super-register which is only partially defined,
2227 // and need to ensure later spills think the value is defined.
2228 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2229 }
2230
// Only the first and last lane writes carry an implicit use of the
// super-register.
2231 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2232 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2233
2234 // FIXME: Since this spills to another register instead of an actual
2235 // frame index, we should delete the frame index when all references to
2236 // it are fixed.
2237 }
2238 } else {
2239 SB.prepare();
2240
2241 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2242 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2243
2244 // Per VGPR helper data
2245 auto PVD = SB.getPerVGPRData();
2246
2247 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each group reads SB.TmpVGPR as undef; the flag is
// cleared after that first write.
2248 RegState TmpVGPRFlags = RegState::Undef;
2249
2250 // Write sub registers into the VGPR
2251 for (unsigned i = Offset * PVD.PerVGPR,
2252 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2253 i < e; ++i) {
2254 Register SubReg =
2255 SB.NumSubRegs == 1
2256 ? SB.SuperReg
2257 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2258
2259 MachineInstrBuilder WriteLane =
2260 BuildMI(*SB.MBB, MI, SB.DL,
2261 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2262 .addReg(SubReg, SubKillState)
2263 .addImm(i % PVD.PerVGPR)
2264 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2265 TmpVGPRFlags = {};
2266
2267 if (Indexes) {
2268 if (i == 0)
2269 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2270 else
2271 Indexes->insertMachineInstrInMaps(*WriteLane);
2272 }
2273
2274 // There could be undef components of a spilled super register.
2275 // TODO: Can we detect this and skip the spill?
2276 if (SB.NumSubRegs > 1) {
2277 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2278 RegState SuperKillState = {};
2279 if (i + 1 == SB.NumSubRegs)
2280 SuperKillState |= getKillRegState(SB.IsKill);
2281 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2282 }
2283 }
2284
2285 // Write out VGPR
2286 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2287
2288 // TODO: Implement CFI for SpillToVMEM for all scenarios.
// In this path CFI is emitted only when the spilled register is the return
// address register.
2289 MachineInstr *CFI = nullptr;
2290 if (NeedsCFI && SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2291 int64_t CFIOffset = (Offset * SB.EltSize +
2292 SB.MF.getFrameInfo().getObjectOffset(Index)) *
2293 ST.getWavefrontSize();
2294 CFI = TFL->buildCFIForSGPRToVMEMSpill(*SB.MBB, MI, DebugLoc(),
2295 AMDGPU::PC_REG, CFIOffset);
2296 }
2297 if (Indexes && CFI)
2298 Indexes->insertMachineInstrInMaps(*CFI);
2299 }
2300
2301 SB.restore();
2302 }
2303
2304 MI->eraseFromParent();
2306
2307 if (LIS)
2309
2310 return true;
2311}
2312
2314 RegScavenger *RS, SlotIndexes *Indexes,
2315 LiveIntervals *LIS, bool OnlyToVGPR,
2316 bool SpillToPhysVGPRLane) const {
// Lowers the SGPR restore at MI for frame index Index.
//
// If VGPR lanes are assigned to the slot, each 32-bit sub-register is read
// back from its lane with SI_RESTORE_S32_FROM_VGPR. Otherwise SB.TmpVGPR is
// loaded through SB.readWriteTmpVGPR and the lanes are unpacked from it.
//
// Returns false, leaving MI in place, when OnlyToVGPR is set and no lanes are
// assigned. Otherwise MI is erased and true is returned.
2317 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2318
2319 ArrayRef<SpilledReg> VGPRSpills =
2320 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2322 bool SpillToVGPR = !VGPRSpills.empty();
2323 if (OnlyToVGPR && !SpillToVGPR)
2324 return false;
2325
2326 if (SpillToVGPR) {
2327 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2328 Register SubReg =
2329 SB.NumSubRegs == 1
2330 ? SB.SuperReg
2331 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2332
2333 SpilledReg Spill = VGPRSpills[i];
2334 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2335 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2336 .addReg(Spill.VGPR)
2337 .addImm(Spill.Lane);
2338 if (SB.NumSubRegs > 1 && i == 0)
2340 if (Indexes) {
// The last lane read of the loop takes over MI's slot index; earlier reads
// get fresh entries.
2341 if (i == e - 1)
2342 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2343 else
2344 Indexes->insertMachineInstrInMaps(*MIB);
2345 }
2346 }
2347 } else {
2348 SB.prepare();
2349
2350 // Per VGPR helper data
2351 auto PVD = SB.getPerVGPRData();
2352
2353 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2354 // Load in VGPR data
2355 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2356
2357 // Unpack lanes
2358 for (unsigned i = Offset * PVD.PerVGPR,
2359 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2360 i < e; ++i) {
2361 Register SubReg =
2362 SB.NumSubRegs == 1
2363 ? SB.SuperReg
2364 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2365
// SB.TmpVGPR is killed by the last lane read of the current group.
// NOTE(review): the lane immediate here is i, while the spill side uses
// i % PVD.PerVGPR; these agree only while NumSubRegs <= PerVGPR - confirm.
2366 bool LastSubReg = (i + 1 == e);
2367 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2368 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2369 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2370 .addImm(i);
2371 if (SB.NumSubRegs > 1 && i == 0)
2373 if (Indexes) {
2374 if (i == e - 1)
2375 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2376 else
2377 Indexes->insertMachineInstrInMaps(*MIB);
2378 }
2379 }
2380 }
2381
2382 SB.restore();
2383 }
2384
2385 MI->eraseFromParent();
2386
2387 if (LIS)
2389
2390 return true;
2391}
2392
2394 MachineBasicBlock &RestoreMBB,
2395 Register SGPR, RegScavenger *RS) const {
// Saves SGPR into lanes of SB.TmpVGPR with V_WRITELANE_B32 before MI, then
// reads it back with V_READLANE_B32 at the end of RestoreMBB. The VGPR is
// never written to or loaded from memory here. The visible return value is
// always false.
2396 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2397 RS);
2398 SB.prepare();
2399 // Generate the spill of SGPR to SB.TmpVGPR.
2400 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2401 auto PVD = SB.getPerVGPRData();
2402 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each group reads SB.TmpVGPR as undef.
2403 RegState TmpVGPRFlags = RegState::Undef;
2404 // Write sub registers into the VGPR
2405 for (unsigned i = Offset * PVD.PerVGPR,
2406 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2407 i < e; ++i) {
2408 Register SubReg =
2409 SB.NumSubRegs == 1
2410 ? SB.SuperReg
2411 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2412
2413 MachineInstrBuilder WriteLane =
2414 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2415 SB.TmpVGPR)
2416 .addReg(SubReg, SubKillState)
2417 .addImm(i % PVD.PerVGPR)
2418 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2419 TmpVGPRFlags = {};
2420 // There could be undef components of a spilled super register.
2421 // TODO: Can we detect this and skip the spill?
2422 if (SB.NumSubRegs > 1) {
2423 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2424 RegState SuperKillState = {};
2425 if (i + 1 == SB.NumSubRegs)
2426 SuperKillState |= getKillRegState(SB.IsKill);
2427 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2428 }
2429 }
2430 // Don't need to write VGPR out.
2431 }
2432
2433 // Restore clobbered registers in the specified restore block.
// From here on, new instructions are inserted at the end of RestoreMBB.
2434 MI = RestoreMBB.end();
2435 SB.setMI(&RestoreMBB, MI);
2436 // Generate the restore of SGPR from SB.TmpVGPR.
2437 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2438 // Don't need to load VGPR in.
2439 // Unpack lanes
2440 for (unsigned i = Offset * PVD.PerVGPR,
2441 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2442 i < e; ++i) {
2443 Register SubReg =
2444 SB.NumSubRegs == 1
2445 ? SB.SuperReg
2446 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2447
2448 assert(SubReg.isPhysical());
// SB.TmpVGPR is killed by the last lane read of the current group.
2449 bool LastSubReg = (i + 1 == e);
2450 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2451 SubReg)
2452 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2453 .addImm(i);
2454 if (SB.NumSubRegs > 1 && i == 0)
2456 }
2457 }
2458 SB.restore();
2459
2461 return false;
2462}
2463
2464 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2465 /// a VGPR and the stack slot can be safely eliminated when all other users are
2466 /// handled.
2469 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
// Dispatches on MI's opcode: SGPR save pseudos go to spillSGPR and restore
// pseudos go to restoreSGPR. Both are called with OnlyToVGPR = true, so they
// return false when no VGPR lanes are assigned to the slot. Any other opcode
// hits llvm_unreachable.
2470 bool NeedsCFI = false;
2471 switch (MI->getOpcode()) {
// The CFI_SAVE variants only set NeedsCFI and then share the plain SAVE
// handling below.
2472 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2473 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2474 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2475 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2476 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2477 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2478 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2479 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2480 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2481 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
2482 NeedsCFI = true;
2483 [[fallthrough]];
2484 case AMDGPU::SI_SPILL_S1024_SAVE:
2485 case AMDGPU::SI_SPILL_S512_SAVE:
2486 case AMDGPU::SI_SPILL_S384_SAVE:
2487 case AMDGPU::SI_SPILL_S352_SAVE:
2488 case AMDGPU::SI_SPILL_S320_SAVE:
2489 case AMDGPU::SI_SPILL_S288_SAVE:
2490 case AMDGPU::SI_SPILL_S256_SAVE:
2491 case AMDGPU::SI_SPILL_S224_SAVE:
2492 case AMDGPU::SI_SPILL_S192_SAVE:
2493 case AMDGPU::SI_SPILL_S160_SAVE:
2494 case AMDGPU::SI_SPILL_S128_SAVE:
2495 case AMDGPU::SI_SPILL_S96_SAVE:
2496 case AMDGPU::SI_SPILL_S64_SAVE:
2497 case AMDGPU::SI_SPILL_S32_SAVE:
2498 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane,
2499 NeedsCFI);
2500 case AMDGPU::SI_SPILL_S1024_RESTORE:
2501 case AMDGPU::SI_SPILL_S512_RESTORE:
2502 case AMDGPU::SI_SPILL_S384_RESTORE:
2503 case AMDGPU::SI_SPILL_S352_RESTORE:
2504 case AMDGPU::SI_SPILL_S320_RESTORE:
2505 case AMDGPU::SI_SPILL_S288_RESTORE:
2506 case AMDGPU::SI_SPILL_S256_RESTORE:
2507 case AMDGPU::SI_SPILL_S224_RESTORE:
2508 case AMDGPU::SI_SPILL_S192_RESTORE:
2509 case AMDGPU::SI_SPILL_S160_RESTORE:
2510 case AMDGPU::SI_SPILL_S128_RESTORE:
2511 case AMDGPU::SI_SPILL_S96_RESTORE:
2512 case AMDGPU::SI_SPILL_S64_RESTORE:
2513 case AMDGPU::SI_SPILL_S32_RESTORE:
2514 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2515 default:
2516 llvm_unreachable("not an SGPR spill instruction");
2517 }
2518}
2519
2521 int SPAdj, unsigned FIOperandNum,
2522 RegScavenger *RS) const {
2523 MachineFunction *MF = MI->getMF();
2524 MachineBasicBlock *MBB = MI->getParent();
2526 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2527 const SIInstrInfo *TII = ST.getInstrInfo();
2528 const DebugLoc &DL = MI->getDebugLoc();
2529
2530 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2531
2533 "unreserved scratch RSRC register");
2534
2535 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2536 int Index = MI->getOperand(FIOperandNum).getIndex();
2537
2538 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2539 ? getBaseRegister()
2540 : getFrameRegister(*MF);
2541
2542 bool NeedsCFI = false;
2543
2544 switch (MI->getOpcode()) {
2545 // SGPR register spill
2546 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2547 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2548 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2549 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2550 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2551 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2552 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2553 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2554 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2555 case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
2556 NeedsCFI = true;
2557 [[fallthrough]];
2558 }
2559 case AMDGPU::SI_SPILL_S1024_SAVE:
2560 case AMDGPU::SI_SPILL_S512_SAVE:
2561 case AMDGPU::SI_SPILL_S384_SAVE:
2562 case AMDGPU::SI_SPILL_S352_SAVE:
2563 case AMDGPU::SI_SPILL_S320_SAVE:
2564 case AMDGPU::SI_SPILL_S288_SAVE:
2565 case AMDGPU::SI_SPILL_S256_SAVE:
2566 case AMDGPU::SI_SPILL_S224_SAVE:
2567 case AMDGPU::SI_SPILL_S192_SAVE:
2568 case AMDGPU::SI_SPILL_S160_SAVE:
2569 case AMDGPU::SI_SPILL_S128_SAVE:
2570 case AMDGPU::SI_SPILL_S96_SAVE:
2571 case AMDGPU::SI_SPILL_S64_SAVE:
2572 case AMDGPU::SI_SPILL_S32_SAVE: {
2573 return spillSGPR(MI, Index, RS, nullptr, nullptr,
2574 FrameInfo.getStackID(Index) == TargetStackID::SGPRSpill,
2575 false, NeedsCFI);
2576 }
2577
2578 // SGPR register restore
2579 case AMDGPU::SI_SPILL_S1024_RESTORE:
2580 case AMDGPU::SI_SPILL_S512_RESTORE:
2581 case AMDGPU::SI_SPILL_S384_RESTORE:
2582 case AMDGPU::SI_SPILL_S352_RESTORE:
2583 case AMDGPU::SI_SPILL_S320_RESTORE:
2584 case AMDGPU::SI_SPILL_S288_RESTORE:
2585 case AMDGPU::SI_SPILL_S256_RESTORE:
2586 case AMDGPU::SI_SPILL_S224_RESTORE:
2587 case AMDGPU::SI_SPILL_S192_RESTORE:
2588 case AMDGPU::SI_SPILL_S160_RESTORE:
2589 case AMDGPU::SI_SPILL_S128_RESTORE:
2590 case AMDGPU::SI_SPILL_S96_RESTORE:
2591 case AMDGPU::SI_SPILL_S64_RESTORE:
2592 case AMDGPU::SI_SPILL_S32_RESTORE: {
2593 return restoreSGPR(MI, Index, RS, nullptr, nullptr,
2594 FrameInfo.getStackID(Index) ==
2596 }
2597
2598 // VGPR register spill
2599 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
2600 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
2601 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
2602 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
2603 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
2604 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
2605 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
2606 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
2607 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
2608 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
2609 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
2610 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
2611 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
2612 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
2613 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
2614 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
2615 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
2616 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
2617 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
2618 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
2619 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
2620 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
2621 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
2622 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
2623 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
2624 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
2625 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
2626 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
2627 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
2628 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
2629 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
2630 NeedsCFI = true;
2631 [[fallthrough]];
2632 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
2633 case AMDGPU::SI_SPILL_V1024_SAVE:
2634 case AMDGPU::SI_SPILL_V512_SAVE:
2635 case AMDGPU::SI_SPILL_V384_SAVE:
2636 case AMDGPU::SI_SPILL_V352_SAVE:
2637 case AMDGPU::SI_SPILL_V320_SAVE:
2638 case AMDGPU::SI_SPILL_V288_SAVE:
2639 case AMDGPU::SI_SPILL_V256_SAVE:
2640 case AMDGPU::SI_SPILL_V224_SAVE:
2641 case AMDGPU::SI_SPILL_V192_SAVE:
2642 case AMDGPU::SI_SPILL_V160_SAVE:
2643 case AMDGPU::SI_SPILL_V128_SAVE:
2644 case AMDGPU::SI_SPILL_V96_SAVE:
2645 case AMDGPU::SI_SPILL_V64_SAVE:
2646 case AMDGPU::SI_SPILL_V32_SAVE:
2647 case AMDGPU::SI_SPILL_V16_SAVE:
2648 case AMDGPU::SI_SPILL_A1024_SAVE:
2649 case AMDGPU::SI_SPILL_A512_SAVE:
2650 case AMDGPU::SI_SPILL_A384_SAVE:
2651 case AMDGPU::SI_SPILL_A352_SAVE:
2652 case AMDGPU::SI_SPILL_A320_SAVE:
2653 case AMDGPU::SI_SPILL_A288_SAVE:
2654 case AMDGPU::SI_SPILL_A256_SAVE:
2655 case AMDGPU::SI_SPILL_A224_SAVE:
2656 case AMDGPU::SI_SPILL_A192_SAVE:
2657 case AMDGPU::SI_SPILL_A160_SAVE:
2658 case AMDGPU::SI_SPILL_A128_SAVE:
2659 case AMDGPU::SI_SPILL_A96_SAVE:
2660 case AMDGPU::SI_SPILL_A64_SAVE:
2661 case AMDGPU::SI_SPILL_A32_SAVE:
2662 case AMDGPU::SI_SPILL_AV1024_SAVE:
2663 case AMDGPU::SI_SPILL_AV512_SAVE:
2664 case AMDGPU::SI_SPILL_AV384_SAVE:
2665 case AMDGPU::SI_SPILL_AV352_SAVE:
2666 case AMDGPU::SI_SPILL_AV320_SAVE:
2667 case AMDGPU::SI_SPILL_AV288_SAVE:
2668 case AMDGPU::SI_SPILL_AV256_SAVE:
2669 case AMDGPU::SI_SPILL_AV224_SAVE:
2670 case AMDGPU::SI_SPILL_AV192_SAVE:
2671 case AMDGPU::SI_SPILL_AV160_SAVE:
2672 case AMDGPU::SI_SPILL_AV128_SAVE:
2673 case AMDGPU::SI_SPILL_AV96_SAVE:
2674 case AMDGPU::SI_SPILL_AV64_SAVE:
2675 case AMDGPU::SI_SPILL_AV32_SAVE:
2676 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2677 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2678 assert(
2679 MI->getOpcode() != AMDGPU::SI_BLOCK_SPILL_V1024_SAVE &&
2680 "block spill does not currenty support spilling non-CSR registers");
2681
2682 if (MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE)
2683 // Put mask into M0.
2684 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2685 AMDGPU::M0)
2686 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2687
2688 const MachineOperand *VData = TII->getNamedOperand(*MI,
2689 AMDGPU::OpName::vdata);
2690 if (VData->isUndef()) {
2691 MI->eraseFromParent();
2692 return true;
2693 }
2694
2695 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2696 MFI->getStackPtrOffsetReg());
2697
2698 unsigned Opc;
2699 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2700 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2701 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2702 } else {
2703 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE
2704 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2705 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2706 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2707 }
2708
2709 auto *MBB = MI->getParent();
2710 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2711 if (IsWWMRegSpill) {
2712 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2713 RS->isRegUsed(AMDGPU::SCC));
2714 }
2716 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2717 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2718 *MI->memoperands_begin(), RS, nullptr, NeedsCFI);
2720 if (IsWWMRegSpill)
2721 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2722
2723 MI->eraseFromParent();
2724 return true;
2725 }
2726 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2727 // Put mask into M0.
2728 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2729 AMDGPU::M0)
2730 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2731 [[fallthrough]];
2732 }
2733 case AMDGPU::SI_SPILL_V16_RESTORE:
2734 case AMDGPU::SI_SPILL_V32_RESTORE:
2735 case AMDGPU::SI_SPILL_V64_RESTORE:
2736 case AMDGPU::SI_SPILL_V96_RESTORE:
2737 case AMDGPU::SI_SPILL_V128_RESTORE:
2738 case AMDGPU::SI_SPILL_V160_RESTORE:
2739 case AMDGPU::SI_SPILL_V192_RESTORE:
2740 case AMDGPU::SI_SPILL_V224_RESTORE:
2741 case AMDGPU::SI_SPILL_V256_RESTORE:
2742 case AMDGPU::SI_SPILL_V288_RESTORE:
2743 case AMDGPU::SI_SPILL_V320_RESTORE:
2744 case AMDGPU::SI_SPILL_V352_RESTORE:
2745 case AMDGPU::SI_SPILL_V384_RESTORE:
2746 case AMDGPU::SI_SPILL_V512_RESTORE:
2747 case AMDGPU::SI_SPILL_V1024_RESTORE:
2748 case AMDGPU::SI_SPILL_A32_RESTORE:
2749 case AMDGPU::SI_SPILL_A64_RESTORE:
2750 case AMDGPU::SI_SPILL_A96_RESTORE:
2751 case AMDGPU::SI_SPILL_A128_RESTORE:
2752 case AMDGPU::SI_SPILL_A160_RESTORE:
2753 case AMDGPU::SI_SPILL_A192_RESTORE:
2754 case AMDGPU::SI_SPILL_A224_RESTORE:
2755 case AMDGPU::SI_SPILL_A256_RESTORE:
2756 case AMDGPU::SI_SPILL_A288_RESTORE:
2757 case AMDGPU::SI_SPILL_A320_RESTORE:
2758 case AMDGPU::SI_SPILL_A352_RESTORE:
2759 case AMDGPU::SI_SPILL_A384_RESTORE:
2760 case AMDGPU::SI_SPILL_A512_RESTORE:
2761 case AMDGPU::SI_SPILL_A1024_RESTORE:
2762 case AMDGPU::SI_SPILL_AV32_RESTORE:
2763 case AMDGPU::SI_SPILL_AV64_RESTORE:
2764 case AMDGPU::SI_SPILL_AV96_RESTORE:
2765 case AMDGPU::SI_SPILL_AV128_RESTORE:
2766 case AMDGPU::SI_SPILL_AV160_RESTORE:
2767 case AMDGPU::SI_SPILL_AV192_RESTORE:
2768 case AMDGPU::SI_SPILL_AV224_RESTORE:
2769 case AMDGPU::SI_SPILL_AV256_RESTORE:
2770 case AMDGPU::SI_SPILL_AV288_RESTORE:
2771 case AMDGPU::SI_SPILL_AV320_RESTORE:
2772 case AMDGPU::SI_SPILL_AV352_RESTORE:
2773 case AMDGPU::SI_SPILL_AV384_RESTORE:
2774 case AMDGPU::SI_SPILL_AV512_RESTORE:
2775 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2776 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2777 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2778 const MachineOperand *VData = TII->getNamedOperand(*MI,
2779 AMDGPU::OpName::vdata);
2780 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2781 MFI->getStackPtrOffsetReg());
2782
2783 unsigned Opc;
2784 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2785 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2786 Opc = ST.d16PreservesUnusedBits()
2787 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2788 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2789 } else {
2790 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2791 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2792 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2793 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2794 }
2795
2796 auto *MBB = MI->getParent();
2797 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2798 if (IsWWMRegSpill) {
2799 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2800 RS->isRegUsed(AMDGPU::SCC));
2801 }
2802
2804 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2805 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2806 *MI->memoperands_begin(), RS);
2807
2808 if (IsWWMRegSpill)
2809 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2810
2811 MI->eraseFromParent();
2812 return true;
2813 }
2814 case AMDGPU::V_ADD_U32_e32:
2815 case AMDGPU::V_ADD_U32_e64:
2816 case AMDGPU::V_ADD_CO_U32_e32:
2817 case AMDGPU::V_ADD_CO_U32_e64: {
2818 // TODO: Handle sub, and, or.
2819 unsigned NumDefs = MI->getNumExplicitDefs();
2820 unsigned Src0Idx = NumDefs;
2821
2822 bool HasClamp = false;
2823 MachineOperand *VCCOp = nullptr;
2824
2825 switch (MI->getOpcode()) {
2826 case AMDGPU::V_ADD_U32_e32:
2827 break;
2828 case AMDGPU::V_ADD_U32_e64:
2829 HasClamp = MI->getOperand(3).getImm();
2830 break;
2831 case AMDGPU::V_ADD_CO_U32_e32:
2832 VCCOp = &MI->getOperand(3);
2833 break;
2834 case AMDGPU::V_ADD_CO_U32_e64:
2835 VCCOp = &MI->getOperand(1);
2836 HasClamp = MI->getOperand(4).getImm();
2837 break;
2838 default:
2839 break;
2840 }
2841 bool DeadVCC = !VCCOp || VCCOp->isDead();
2842 MachineOperand &DstOp = MI->getOperand(0);
2843 Register DstReg = DstOp.getReg();
2844
2845 unsigned OtherOpIdx =
2846 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2847 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2848
2849 unsigned Src1Idx = Src0Idx + 1;
2850 Register MaterializedReg = FrameReg;
2851 Register ScavengedVGPR;
2852
2853 int64_t Offset = FrameInfo.getObjectOffset(Index);
2854 // For the non-immediate case, we could fall through to the default
2855 // handling, but we do an in-place update of the result register here to
2856 // avoid scavenging another register.
2857 if (OtherOp->isImm()) {
2858 int64_t TotalOffset = OtherOp->getImm() + Offset;
2859
2860 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2861 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2862 // If we can't support a VOP3 literal in the VALU instruction, we
2863 // can't specially fold into the add.
2864 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2865 break;
2866 }
2867
2868 OtherOp->setImm(TotalOffset);
2869 Offset = 0;
2870 }
2871
2872 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2873 // We should just do an in-place update of the result register. However,
2874 // the value there may also be used by the add, in which case we need a
2875 // temporary register.
2876 //
2877 // FIXME: The scavenger is not finding the result register in the
2878 // common case where the add does not read the register.
2879
2880 ScavengedVGPR = RS->scavengeRegisterBackwards(
2881 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2882
2883 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2884 // shift.
2885 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2886 .addDef(ScavengedVGPR, RegState::Renamable)
2887 .addImm(ST.getWavefrontSizeLog2())
2888 .addReg(FrameReg);
2889 MaterializedReg = ScavengedVGPR;
2890 }
2891
2892 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2893 if (ST.hasFlatScratchEnabled() &&
2894 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2895 // We didn't need the shift above, so we have an SGPR for the frame
2896 // register, but may have a VGPR only operand.
2897 //
2898 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2899 // and use the higher constant bus restriction to avoid this copy.
2900
2901 if (!ScavengedVGPR) {
2902 ScavengedVGPR = RS->scavengeRegisterBackwards(
2903 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2904 /*SPAdj=*/0);
2905 }
2906
2907 assert(ScavengedVGPR != DstReg);
2908
2909 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2910 .addReg(MaterializedReg,
2911 getKillRegState(MaterializedReg != FrameReg));
2912 MaterializedReg = ScavengedVGPR;
2913 }
2914
2915 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2916 // is not live, we could use a scalar add + vector add instead of 2
2917 // vector adds.
2918 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2919 .addDef(DstReg, RegState::Renamable);
2920 if (NumDefs == 2)
2921 AddI32.add(MI->getOperand(1));
2922
2923 RegState MaterializedRegFlags =
2924 getKillRegState(MaterializedReg != FrameReg);
2925
2926 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2927 // If we know we have a VGPR already, it's more likely the other
2928 // operand is a legal vsrc0.
2929 AddI32
2930 .add(*OtherOp)
2931 .addReg(MaterializedReg, MaterializedRegFlags);
2932 } else {
2933 // Commute operands to avoid violating VOP2 restrictions. This will
2934 // typically happen when using scratch.
2935 AddI32
2936 .addReg(MaterializedReg, MaterializedRegFlags)
2937 .add(*OtherOp);
2938 }
2939
2940 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2941 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2942 AddI32.addImm(0); // clamp
2943
2944 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2945 AddI32.setOperandDead(3); // Dead vcc
2946
2947 MaterializedReg = DstReg;
2948
2949 OtherOp->ChangeToRegister(MaterializedReg, false);
2950 OtherOp->setIsKill(true);
2952 Offset = 0;
2953 } else if (Offset != 0) {
2954 assert(!MaterializedReg);
2956 Offset = 0;
2957 } else {
2958 if (DeadVCC && !HasClamp) {
2959 assert(Offset == 0);
2960
2961 // TODO: Losing kills and implicit operands. Just mutate to copy and
2962 // let lowerCopy deal with it?
2963 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2964 // Folded to an identity copy.
2965 MI->eraseFromParent();
2966 return true;
2967 }
2968
2969 // The immediate value should be in OtherOp
2970 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2971 MI->removeOperand(FIOperandNum);
2972
2973 unsigned NumOps = MI->getNumOperands();
2974 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2975 MI->removeOperand(I);
2976
2977 if (NumDefs == 2)
2978 MI->removeOperand(1);
2979
2980 // The code below can't deal with a mov.
2981 return true;
2982 }
2983
2984 // This folded to a constant, but we have to keep the add around for
2985 // pointless implicit defs or clamp modifier.
2986 FIOp->ChangeToImmediate(0);
2987 }
2988
2989 // Try to improve legality by commuting.
2990 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2991 std::swap(FIOp, OtherOp);
2992 std::swap(FIOperandNum, OtherOpIdx);
2993 }
2994
2995 // We need at most one mov to satisfy the operand constraints. Prefer to
2996 // move the FI operand first, as it may be a literal in a VOP3
2997 // instruction.
2998 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2999 if (!TII->isOperandLegal(*MI, SrcIdx)) {
3000 // If commuting didn't make the operands legal, we need to materialize
3001 // in a register.
3002 // TODO: Can use SGPR on gfx10+ in some cases.
3003 if (!ScavengedVGPR) {
3004 ScavengedVGPR = RS->scavengeRegisterBackwards(
3005 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
3006 /*SPAdj=*/0);
3007 }
3008
3009 assert(ScavengedVGPR != DstReg);
3010
3011 MachineOperand &Src = MI->getOperand(SrcIdx);
3012 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
3013 .add(Src);
3014
3015 Src.ChangeToRegister(ScavengedVGPR, false);
3016 Src.setIsKill(true);
3017 break;
3018 }
3019 }
3020
3021 // Fold out add of 0 case that can appear in kernels.
3022 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
3023 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
3024 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
3025 }
3026
3027 MI->eraseFromParent();
3028 }
3029
3030 return true;
3031 }
3032 case AMDGPU::S_ADD_I32:
3033 case AMDGPU::S_ADD_U32: {
3034 // TODO: Handle s_or_b32, s_and_b32.
3035 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
3036 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
3037
3038 assert(FrameReg || MFI->isBottomOfStack());
3039
3040 MachineOperand &DstOp = MI->getOperand(0);
3041 const DebugLoc &DL = MI->getDebugLoc();
3042 Register MaterializedReg = FrameReg;
3043
3044 // Defend against live scc, which should never happen in practice.
3045 bool DeadSCC = MI->getOperand(3).isDead();
3046
3047 Register TmpReg;
3048
3049 // FIXME: Scavenger should figure out that the result register is
3050 // available. Also should do this for the v_add case.
3051 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
3052 TmpReg = DstOp.getReg();
3053
3054 if (FrameReg && !ST.hasFlatScratchEnabled()) {
3055 // FIXME: In the common case where the add does not also read its result
3056 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
3057 // available.
3058 if (!TmpReg)
3059 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3060 MI, /*RestoreAfter=*/false, 0,
3061 /*AllowSpill=*/false);
3062 if (TmpReg) {
3063 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
3064 .addDef(TmpReg, RegState::Renamable)
3065 .addReg(FrameReg)
3066 .addImm(ST.getWavefrontSizeLog2())
3067 .setOperandDead(3); // Set SCC dead
3068 }
3069 MaterializedReg = TmpReg;
3070 }
3071
3072 int64_t Offset = FrameInfo.getObjectOffset(Index);
3073
3074 // For the non-immediate case, we could fall through to the default
3075 // handling, but we do an in-place update of the result register here to
3076 // avoid scavenging another register.
3077 if (OtherOp.isImm()) {
3078 OtherOp.setImm(OtherOp.getImm() + Offset);
3079 Offset = 0;
3080
3081 if (MaterializedReg)
3082 FIOp->ChangeToRegister(MaterializedReg, false);
3083 else
3084 FIOp->ChangeToImmediate(0);
3085 } else if (MaterializedReg) {
3086 // If we can't fold the other operand, do another increment.
3087 Register DstReg = DstOp.getReg();
3088
3089 if (!TmpReg && MaterializedReg == FrameReg) {
3090 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3091 MI, /*RestoreAfter=*/false, 0,
3092 /*AllowSpill=*/false);
3093 DstReg = TmpReg;
3094 }
3095
3096 if (TmpReg) {
3097 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
3098 .addDef(DstReg, RegState::Renamable)
3099 .addReg(MaterializedReg, RegState::Kill)
3100 .add(OtherOp);
3101 if (DeadSCC)
3102 AddI32.setOperandDead(3);
3103
3104 MaterializedReg = DstReg;
3105
3106 OtherOp.ChangeToRegister(MaterializedReg, false);
3107 OtherOp.setIsKill(true);
3108 OtherOp.setIsRenamable(true);
3109 }
3111 } else {
3112 // If we don't have any other offset to apply, we can just directly
3113 // interpret the frame index as the offset.
3115 }
3116
3117 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
3118 assert(Offset == 0);
3119 MI->removeOperand(3);
3120 MI->removeOperand(OtherOpIdx);
3121 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3122 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
3123 assert(Offset == 0);
3124 MI->removeOperand(3);
3125 MI->removeOperand(FIOperandNum);
3126 MI->setDesc(
3127 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3128 }
3129
3130 assert(!FIOp->isFI());
3131 return true;
3132 }
3133 default: {
3134 break;
3135 }
3136 }
3137
3138 int64_t Offset = FrameInfo.getObjectOffset(Index);
3139 if (ST.hasFlatScratchEnabled()) {
3140 if (TII->isFLATScratch(*MI)) {
3141 assert(
3142 (int16_t)FIOperandNum ==
3143 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
3144
3145 // The offset is always swizzled, just replace it
3146 if (FrameReg)
3147 FIOp->ChangeToRegister(FrameReg, false);
3148
3150 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
3151 int64_t NewOffset = Offset + OffsetOp->getImm();
3152 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
3154 OffsetOp->setImm(NewOffset);
3155 if (FrameReg)
3156 return false;
3157 Offset = 0;
3158 }
3159
3160 if (!Offset) {
3161 unsigned Opc = MI->getOpcode();
3162 int NewOpc = -1;
3163 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
3165 } else if (ST.hasFlatScratchSTMode()) {
3166 // On GFX10 we have ST mode to use no registers for an address.
3167 // Otherwise we need to materialize 0 into an SGPR.
3169 }
3170
3171 if (NewOpc != -1) {
3172 // removeOperand doesn't fixup tied operand indexes as it goes, so
3173 // it asserts. Untie vdst_in for now and retie them afterwards.
3174 int VDstIn =
3175 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
3176 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
3177 MI->getOperand(VDstIn).isTied();
3178 if (TiedVDst)
3179 MI->untieRegOperand(VDstIn);
3180
3181 MI->removeOperand(
3182 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
3183
3184 if (TiedVDst) {
3185 int NewVDst =
3186 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
3187 int NewVDstIn =
3188 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
3189 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
3190 MI->tieOperands(NewVDst, NewVDstIn);
3191 }
3192 MI->setDesc(TII->get(NewOpc));
3193 return false;
3194 }
3195 }
3196 }
3197
3198 if (!FrameReg) {
3200 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
3201 return false;
3202 }
3203
3204 // We need to use register here. Check if we can use an SGPR or need
3205 // a VGPR.
3206 FIOp->ChangeToRegister(AMDGPU::M0, false);
3207 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
3208
3209 if (!Offset && FrameReg && UseSGPR) {
3210 FIOp->setReg(FrameReg);
3211 return false;
3212 }
3213
3214 const TargetRegisterClass *RC =
3215 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
3216
3217 Register TmpReg =
3218 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
3219 FIOp->setReg(TmpReg);
3220 FIOp->setIsKill();
3221
3222 if ((!FrameReg || !Offset) && TmpReg) {
3223 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3224 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3225 if (FrameReg)
3226 MIB.addReg(FrameReg);
3227 else
3228 MIB.addImm(Offset);
3229
3230 return false;
3231 }
3232
3233 bool NeedSaveSCC = (RS->isRegUsed(AMDGPU::SCC) &&
3234 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr)) ||
3235 MI->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3236
3237 Register TmpSReg =
3238 UseSGPR ? TmpReg
3239 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3240 MI, false, 0, !UseSGPR);
3241
3242 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
3243 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
3244 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
3245 Register TmpVGPR = RS->scavengeRegisterBackwards(
3246 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3247
3248 // Materialize the frame register.
3249 auto MIB =
3250 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3251 if (FrameReg)
3252 MIB.addReg(FrameReg);
3253 else
3254 MIB.addImm(Offset);
3255
3256 // Add the offset to the frame register.
3257 if (FrameReg && Offset)
3258 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3259 .addReg(FrameReg, RegState::Kill)
3260 .addImm(Offset);
3261
3262 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3263 .add(MI->getOperand(0)) // $vdata
3264 .addReg(TmpVGPR) // $vaddr
3265 .addImm(0) // Offset
3266 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3267 MI->eraseFromParent();
3268 return true;
3269 }
3270 report_fatal_error("Cannot scavenge register in FI elimination!");
3271 }
3272
3273 if (!TmpSReg) {
3274 // Use frame register and restore it after.
3275 TmpSReg = FrameReg;
3276 FIOp->setReg(FrameReg);
3277 FIOp->setIsKill(false);
3278 }
3279
3280 if (NeedSaveSCC) {
3281 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3282 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3283 .addReg(FrameReg)
3284 .addImm(Offset);
3285 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3286 .addReg(TmpSReg)
3287 .addImm(0);
3288 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3289 .addImm(0)
3290 .addReg(TmpSReg);
3291 } else {
3292 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3293 .addReg(FrameReg)
3294 .addImm(Offset);
3295 }
3296
3297 if (!UseSGPR)
3298 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3299 .addReg(TmpSReg, RegState::Kill);
3300
3301 if (TmpSReg == FrameReg) {
3302 // Undo frame register modification.
3303 if (NeedSaveSCC &&
3304 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3306 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3307 TmpSReg)
3308 .addReg(FrameReg)
3309 .addImm(-Offset);
3310 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3311 .addReg(TmpSReg)
3312 .addImm(0);
3313 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3314 TmpSReg)
3315 .addImm(0)
3316 .addReg(TmpSReg);
3317 } else {
3318 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3319 FrameReg)
3320 .addReg(FrameReg)
3321 .addImm(-Offset);
3322 }
3323 }
3324
3325 return false;
3326 }
3327
3328 bool IsMUBUF = TII->isMUBUF(*MI);
3329
3330 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3331 // Convert to a swizzled stack address by scaling by the wave size.
3332 // In an entry function/kernel the offset is already swizzled.
3333 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3334 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3335 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3336 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3337 ? &AMDGPU::SReg_32RegClass
3338 : &AMDGPU::VGPR_32RegClass;
3339 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3340 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3341 MI->getOpcode() == AMDGPU::S_MOV_B32;
3342 Register ResultReg =
3343 IsCopy ? MI->getOperand(0).getReg()
3344 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3345
3346 int64_t Offset = FrameInfo.getObjectOffset(Index);
3347 if (Offset == 0) {
3348 unsigned OpCode =
3349 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3350 Register TmpResultReg = ResultReg;
3351 if (IsSALU && LiveSCC) {
3352 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3353 MI, false, 0);
3354 }
3355
3356 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3357 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3358 // For V_LSHRREV, the operands are reversed (the shift count goes
3359 // first).
3360 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3361 else
3362 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3363 if (IsSALU && !LiveSCC)
3364 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3365 if (IsSALU && LiveSCC) {
3366 Register NewDest;
3367 if (IsCopy) {
3368 assert(ResultReg.isPhysical());
3369 NewDest = ResultReg;
3370 } else {
3371 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3372 Shift, false, 0);
3373 }
3374 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3375 .addReg(TmpResultReg);
3376 ResultReg = NewDest;
3377 }
3378 } else {
3380 if (!IsSALU) {
3381 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3382 nullptr) {
3383 // Reuse ResultReg in intermediate step.
3384 Register ScaledReg = ResultReg;
3385
3386 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3387 ScaledReg)
3388 .addImm(ST.getWavefrontSizeLog2())
3389 .addReg(FrameReg);
3390
3391 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3392
3393 // TODO: Fold if use instruction is another add of a constant.
3394 if (IsVOP2 ||
3395 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3396 // FIXME: This can fail
3397 MIB.addImm(Offset);
3398 MIB.addReg(ScaledReg, RegState::Kill);
3399 if (!IsVOP2)
3400 MIB.addImm(0); // clamp bit
3401 } else {
3402 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3403 "Need to reuse carry out register");
3404
3405 // Use scavenged unused carry out as offset register.
3406 Register ConstOffsetReg;
3407 if (!isWave32)
3408 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3409 else
3410 ConstOffsetReg = MIB.getReg(1);
3411
3412 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3413 ConstOffsetReg)
3414 .addImm(Offset);
3415 MIB.addReg(ConstOffsetReg, RegState::Kill);
3416 MIB.addReg(ScaledReg, RegState::Kill);
3417 MIB.addImm(0); // clamp bit
3418 }
3419 }
3420 }
3421 if (!MIB || IsSALU) {
3422 // We have to produce a carry out, and there isn't a free SGPR pair
3423 // for it. We can keep the whole computation on the SALU to avoid
3424 // clobbering an additional register at the cost of an extra mov.
3425
3426 // We may have 1 free scratch SGPR even though a carry out is
3427 // unavailable. Only one additional mov is needed.
3428 Register TmpScaledReg = IsCopy && IsSALU
3429 ? ResultReg
3430 : RS->scavengeRegisterBackwards(
3431 AMDGPU::SReg_32_XM0RegClass, MI,
3432 false, 0, /*AllowSpill=*/false);
3433 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3434 Register TmpResultReg = ScaledReg;
3435
3436 if (!LiveSCC) {
3437 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3438 .addReg(FrameReg)
3439 .addImm(ST.getWavefrontSizeLog2());
3440 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3441 .addReg(TmpResultReg, RegState::Kill)
3442 .addImm(Offset);
3443 } else {
3444 TmpResultReg = RS->scavengeRegisterBackwards(
3445 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3446
3448 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3449 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3450 TmpResultReg)
3451 .addImm(ST.getWavefrontSizeLog2())
3452 .addReg(FrameReg);
3453 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3454 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3455 .addImm(Offset);
3456 Add.addReg(ResultReg, RegState::Kill)
3457 .addReg(TmpResultReg, RegState::Kill)
3458 .addImm(0);
3459 } else
3460 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3461 } else {
3462 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3463 "offset is unsafe for v_mad_u32_u24");
3464
3465 // We start with a frame pointer with a wave space value, and
3466 // an offset in lane-space. We are materializing a lane space
3467 // value. We can either do a right shift of the frame pointer
3468 // to get to lane space, or a left shift of the offset to get
3469 // to wavespace. We can right shift after the computation to
3470 // get back to the desired per-lane value. We are using the
3471 // mad_u32_u24 primarily as an add with no carry out clobber.
3472 bool IsInlinableLiteral =
3473 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3474 if (!IsInlinableLiteral) {
3475 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3476 TmpResultReg)
3477 .addImm(Offset);
3478 }
3479
3480 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3481 TmpResultReg);
3482
3483 if (!IsInlinableLiteral) {
3484 Add.addReg(TmpResultReg, RegState::Kill);
3485 } else {
3486 // We fold the offset into mad itself if its inlinable.
3487 Add.addImm(Offset);
3488 }
3489 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3490 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3491 TmpResultReg)
3492 .addImm(ST.getWavefrontSizeLog2())
3493 .addReg(TmpResultReg);
3494 }
3495
3496 Register NewDest;
3497 if (IsCopy) {
3498 NewDest = ResultReg;
3499 } else {
3500 NewDest = RS->scavengeRegisterBackwards(
3501 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3502 /*AllowSpill=*/true);
3503 }
3504
3505 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3506 NewDest)
3507 .addReg(TmpResultReg);
3508 ResultReg = NewDest;
3509 }
3510 if (!IsSALU)
3511 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3512 .addReg(TmpResultReg, RegState::Kill);
3513 // If there were truly no free SGPRs, we need to undo everything.
3514 if (!TmpScaledReg.isValid()) {
3515 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3516 .addReg(ScaledReg, RegState::Kill)
3517 .addImm(-Offset);
3518 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3519 .addReg(FrameReg)
3520 .addImm(ST.getWavefrontSizeLog2());
3521 }
3522 }
3523 }
3524
3525 // Don't introduce an extra copy if we're just materializing in a mov.
3526 if (IsCopy) {
3527 MI->eraseFromParent();
3528 return true;
3529 }
3530 FIOp->ChangeToRegister(ResultReg, false, false, true);
3531 return false;
3532 }
3533
3534 if (IsMUBUF) {
3535 // Disable offen so we don't need a 0 vgpr base.
3536 assert(
3537 static_cast<int>(FIOperandNum) ==
3538 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3539
3540 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3541 assert((SOffset.isImm() && SOffset.getImm() == 0));
3542
3543 if (FrameReg != AMDGPU::NoRegister)
3544 SOffset.ChangeToRegister(FrameReg, false);
3545
3546 int64_t Offset = FrameInfo.getObjectOffset(Index);
3547 int64_t OldImm =
3548 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3549 int64_t NewOffset = OldImm + Offset;
3550
3551 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3552 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3553 MI->eraseFromParent();
3554 return true;
3555 }
3556 }
3557
3558 // If the offset is simply too big, don't convert to a scratch wave offset
3559 // relative index.
3560
3562 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3563 Register TmpReg =
3564 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3565 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3566 .addImm(Offset);
3567 FIOp->ChangeToRegister(TmpReg, false, false, true);
3568 }
3569
3570 return false;
3571}
3572
3576
// Returns getEncodingValue(Reg) with only the bits selected by
// AMDGPU::HWEncoding::REG_IDX_MASK kept.
// NOTE(review): the declarator line (listing line 3577) is missing from this
// listing, so the function name, its return type and the declaration of Reg
// are not visible here -- presumably SIRegisterInfo::getHWRegIndex; confirm
// against the header before relying on this.
3578 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3579}
3580
// Thin forwarder: looks up the ID of register class RC and passes it to
// getRegBitWidth, returning that result unchanged.
// NOTE(review): the declarator line (listing line 3581) is missing from this
// listing, so the function name and the declared type of RC are not visible
// here -- presumably a getRegBitWidth overload taking the class itself;
// confirm against the header.
3582 return getRegBitWidth(RC.getID());
3583}
3584
// File-local lookup from an exact register width in bits to the VReg_<N>
// register class of that width (the variants without an _Align2 suffix).
// Handled widths: 64 through 384 in steps of 32, plus 512 and 1024.
// Returns nullptr for every other width, including 32 and below.
3585static const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3586) is missing from this
// listing; presumably getAnyVGPRClassForBitWidth(unsigned BitWidth) -- confirm
// against upstream.
3587 if (BitWidth == 64)
3588 return &AMDGPU::VReg_64RegClass;
3589 if (BitWidth == 96)
3590 return &AMDGPU::VReg_96RegClass;
3591 if (BitWidth == 128)
3592 return &AMDGPU::VReg_128RegClass;
3593 if (BitWidth == 160)
3594 return &AMDGPU::VReg_160RegClass;
3595 if (BitWidth == 192)
3596 return &AMDGPU::VReg_192RegClass;
3597 if (BitWidth == 224)
3598 return &AMDGPU::VReg_224RegClass;
3599 if (BitWidth == 256)
3600 return &AMDGPU::VReg_256RegClass;
3601 if (BitWidth == 288)
3602 return &AMDGPU::VReg_288RegClass;
3603 if (BitWidth == 320)
3604 return &AMDGPU::VReg_320RegClass;
3605 if (BitWidth == 352)
3606 return &AMDGPU::VReg_352RegClass;
3607 if (BitWidth == 384)
3608 return &AMDGPU::VReg_384RegClass;
3609 if (BitWidth == 512)
3610 return &AMDGPU::VReg_512RegClass;
3611 if (BitWidth == 1024)
3612 return &AMDGPU::VReg_1024RegClass;
3613
// No class of exactly this width is listed above.
3614 return nullptr;
3615}
3616
// File-local lookup from an exact register width in bits to the
// VReg_<N>_Align2 register class of that width.
// Handled widths: 64 through 384 in steps of 32, plus 512 and 1024.
// Returns nullptr for every other width, including 32 and below.
3617static const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3618) is missing from this
// listing; a later call site in this file names
// getAlignedVGPRClassForBitWidth(BitWidth), which presumably refers to this
// function -- confirm against upstream.
3619 if (BitWidth == 64)
3620 return &AMDGPU::VReg_64_Align2RegClass;
3621 if (BitWidth == 96)
3622 return &AMDGPU::VReg_96_Align2RegClass;
3623 if (BitWidth == 128)
3624 return &AMDGPU::VReg_128_Align2RegClass;
3625 if (BitWidth == 160)
3626 return &AMDGPU::VReg_160_Align2RegClass;
3627 if (BitWidth == 192)
3628 return &AMDGPU::VReg_192_Align2RegClass;
3629 if (BitWidth == 224)
3630 return &AMDGPU::VReg_224_Align2RegClass;
3631 if (BitWidth == 256)
3632 return &AMDGPU::VReg_256_Align2RegClass;
3633 if (BitWidth == 288)
3634 return &AMDGPU::VReg_288_Align2RegClass;
3635 if (BitWidth == 320)
3636 return &AMDGPU::VReg_320_Align2RegClass;
3637 if (BitWidth == 352)
3638 return &AMDGPU::VReg_352_Align2RegClass;
3639 if (BitWidth == 384)
3640 return &AMDGPU::VReg_384_Align2RegClass;
3641 if (BitWidth == 512)
3642 return &AMDGPU::VReg_512_Align2RegClass;
3643 if (BitWidth == 1024)
3644 return &AMDGPU::VReg_1024_Align2RegClass;
3645
// No class of exactly this width is listed above.
3646 return nullptr;
3647}
3648
// Selects a VGPR register class for a width given in bits.
// Widths 1, 16 and 32 map directly to VReg_1, VGPR_16 and VGPR_32.
// Any other width is resolved through a conditional on
// ST.needsAlignedVGPRs(): when that is true the result comes from
// getAlignedVGPRClassForBitWidth(BitWidth).
3649const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3650) is missing from this
// listing; presumably SIRegisterInfo::getVGPRClassForBitWidth(unsigned
// BitWidth) const -- confirm against the header.
3651 if (BitWidth == 1)
3652 return &AMDGPU::VReg_1RegClass;
3653 if (BitWidth == 16)
3654 return &AMDGPU::VGPR_16RegClass;
3655 if (BitWidth == 32)
3656 return &AMDGPU::VGPR_32RegClass;
3657 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
// NOTE(review): the false arm of the conditional above (listing line 3658) is
// missing from this listing; presumably it calls the non-aligned counterpart
// lookup -- confirm against upstream.
3659}
3660
// Lookup from a register width in bits to a *_Lo256 VGPR register class.
// Unlike the exact-match lookups elsewhere in this file, every test here is
// `<=`, so the first (smallest) listed class whose width is at least BitWidth
// is returned: VGPR_32_Lo256 for widths up to 32, then the
// VReg_<N>_Lo256_Align2 classes for 64 through 384 in steps of 32, 512 and
// 1024. Returns nullptr only when BitWidth exceeds 1024.
3661const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3662) is missing from this
// listing; presumably SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(
// unsigned BitWidth) const -- confirm against the header.
3663 if (BitWidth <= 32)
3664 return &AMDGPU::VGPR_32_Lo256RegClass;
3665 if (BitWidth <= 64)
3666 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3667 if (BitWidth <= 96)
3668 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3669 if (BitWidth <= 128)
3670 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3671 if (BitWidth <= 160)
3672 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3673 if (BitWidth <= 192)
3674 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3675 if (BitWidth <= 224)
3676 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3677 if (BitWidth <= 256)
3678 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3679 if (BitWidth <= 288)
3680 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3681 if (BitWidth <= 320)
3682 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3683 if (BitWidth <= 352)
3684 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3685 if (BitWidth <= 384)
3686 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3687 if (BitWidth <= 512)
3688 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3689 if (BitWidth <= 1024)
3690 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3691
// BitWidth is larger than the widest class listed above.
3692 return nullptr;
3693}
3694
// File-local lookup from an exact register width in bits to the AReg_<N>
// register class of that width (the variants without an _Align2 suffix).
// Handled widths: 64 through 384 in steps of 32, plus 512 and 1024.
// Returns nullptr for every other width, including 32 and below.
3695static const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3696) is missing from this
// listing; presumably getAnyAGPRClassForBitWidth(unsigned BitWidth) -- confirm
// against upstream.
3697 if (BitWidth == 64)
3698 return &AMDGPU::AReg_64RegClass;
3699 if (BitWidth == 96)
3700 return &AMDGPU::AReg_96RegClass;
3701 if (BitWidth == 128)
3702 return &AMDGPU::AReg_128RegClass;
3703 if (BitWidth == 160)
3704 return &AMDGPU::AReg_160RegClass;
3705 if (BitWidth == 192)
3706 return &AMDGPU::AReg_192RegClass;
3707 if (BitWidth == 224)
3708 return &AMDGPU::AReg_224RegClass;
3709 if (BitWidth == 256)
3710 return &AMDGPU::AReg_256RegClass;
3711 if (BitWidth == 288)
3712 return &AMDGPU::AReg_288RegClass;
3713 if (BitWidth == 320)
3714 return &AMDGPU::AReg_320RegClass;
3715 if (BitWidth == 352)
3716 return &AMDGPU::AReg_352RegClass;
3717 if (BitWidth == 384)
3718 return &AMDGPU::AReg_384RegClass;
3719 if (BitWidth == 512)
3720 return &AMDGPU::AReg_512RegClass;
3721 if (BitWidth == 1024)
3722 return &AMDGPU::AReg_1024RegClass;
3723
// No class of exactly this width is listed above.
3724 return nullptr;
3725}
3726
// File-local lookup from an exact register width in bits to the
// AReg_<N>_Align2 register class of that width.
// Handled widths: 64 through 384 in steps of 32, plus 512 and 1024.
// Returns nullptr for every other width, including 32 and below.
3727static const TargetRegisterClass *
// NOTE(review): the declarator line (listing line 3728) is missing from this
// listing; a later call site in this file names
// getAlignedAGPRClassForBitWidth(BitWidth), which presumably refers to this
// function -- confirm against upstream.
3729 if (BitWidth == 64)
3730 return &AMDGPU::AReg_64_Align2RegClass;
3731 if (BitWidth == 96)
3732 return &AMDGPU::AReg_96_Align2RegClass;
3733 if (BitWidth == 128)
3734 return &AMDGPU::AReg_128_Align2RegClass;
3735 if (BitWidth == 160)
3736 return &AMDGPU::AReg_160_Align2RegClass;
3737 if (BitWidth == 192)
3738 return &AMDGPU::AReg_192_Align2RegClass;
3739 if (BitWidth == 224)
3740 return &AMDGPU::AReg_224_Align2RegClass;
3741 if (BitWidth == 256)
3742 return &AMDGPU::AReg_256_Align2RegClass;
3743 if (BitWidth == 288)
3744 return &AMDGPU::AReg_288_Align2RegClass;
3745 if (BitWidth == 320)
3746 return &AMDGPU::AReg_320_Align2RegClass;
3747 if (BitWidth == 352)
3748 return &AMDGPU::AReg_352_Align2RegClass;
3749 if (BitWidth == 384)
3750 return &AMDGPU::AReg_384_Align2RegClass;
3751 if (BitWidth == 512)
3752 return &AMDGPU::AReg_512_Align2RegClass;
3753 if (BitWidth == 1024)
3754 return &AMDGPU::AReg_1024_Align2RegClass;
3755
// No class of exactly this width is listed above.
3756 return nullptr;
3757}
3758
3759const TargetRegisterClass *
3761 if (BitWidth == 16)
3762 return &AMDGPU::AGPR_LO16RegClass;
3763 if (BitWidth == 32)
3764 return &AMDGPU::AGPR_32RegClass;
3765 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3767}
3768
3769static const TargetRegisterClass *
3771 if (BitWidth == 64)
3772 return &AMDGPU::AV_64RegClass;
3773 if (BitWidth == 96)
3774 return &AMDGPU::AV_96RegClass;
3775 if (BitWidth == 128)
3776 return &AMDGPU::AV_128RegClass;
3777 if (BitWidth == 160)
3778 return &AMDGPU::AV_160RegClass;
3779 if (BitWidth == 192)
3780 return &AMDGPU::AV_192RegClass;
3781 if (BitWidth == 224)
3782 return &AMDGPU::AV_224RegClass;
3783 if (BitWidth == 256)
3784 return &AMDGPU::AV_256RegClass;
3785 if (BitWidth == 288)
3786 return &AMDGPU::AV_288RegClass;
3787 if (BitWidth == 320)
3788 return &AMDGPU::AV_320RegClass;
3789 if (BitWidth == 352)
3790 return &AMDGPU::AV_352RegClass;
3791 if (BitWidth == 384)
3792 return &AMDGPU::AV_384RegClass;
3793 if (BitWidth == 512)
3794 return &AMDGPU::AV_512RegClass;
3795 if (BitWidth == 1024)
3796 return &AMDGPU::AV_1024RegClass;
3797
3798 return nullptr;
3799}
3800
3801static const TargetRegisterClass *
3803 if (BitWidth == 64)
3804 return &AMDGPU::AV_64_Align2RegClass;
3805 if (BitWidth == 96)
3806 return &AMDGPU::AV_96_Align2RegClass;
3807 if (BitWidth == 128)
3808 return &AMDGPU::AV_128_Align2RegClass;
3809 if (BitWidth == 160)
3810 return &AMDGPU::AV_160_Align2RegClass;
3811 if (BitWidth == 192)
3812 return &AMDGPU::AV_192_Align2RegClass;
3813 if (BitWidth == 224)
3814 return &AMDGPU::AV_224_Align2RegClass;
3815 if (BitWidth == 256)
3816 return &AMDGPU::AV_256_Align2RegClass;
3817 if (BitWidth == 288)
3818 return &AMDGPU::AV_288_Align2RegClass;
3819 if (BitWidth == 320)
3820 return &AMDGPU::AV_320_Align2RegClass;
3821 if (BitWidth == 352)
3822 return &AMDGPU::AV_352_Align2RegClass;
3823 if (BitWidth == 384)
3824 return &AMDGPU::AV_384_Align2RegClass;
3825 if (BitWidth == 512)
3826 return &AMDGPU::AV_512_Align2RegClass;
3827 if (BitWidth == 1024)
3828 return &AMDGPU::AV_1024_Align2RegClass;
3829
3830 return nullptr;
3831}
3832
3833const TargetRegisterClass *
3835 if (BitWidth == 32)
3836 return &AMDGPU::AV_32RegClass;
3837 return ST.needsAlignedVGPRs()
3840}
3841
3842const TargetRegisterClass *
3844 // TODO: In principle this should use AV classes for gfx908 too. This is
3845 // limited to 90a+ to avoid regressing special case copy optimizations which
3846 // need new handling. The core issue is that it's not possible to directly
3847 // copy between AGPRs on gfx908, and the current optimizations around that
3848 // expect to see copies to VGPR.
3849 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3851}
3852
3853const TargetRegisterClass *
3855 if (BitWidth == 16 || BitWidth == 32)
3856 return &AMDGPU::SReg_32RegClass;
3857 if (BitWidth == 64)
3858 return &AMDGPU::SReg_64RegClass;
3859 if (BitWidth == 96)
3860 return &AMDGPU::SGPR_96RegClass;
3861 if (BitWidth == 128)
3862 return &AMDGPU::SGPR_128RegClass;
3863 if (BitWidth == 160)
3864 return &AMDGPU::SGPR_160RegClass;
3865 if (BitWidth == 192)
3866 return &AMDGPU::SGPR_192RegClass;
3867 if (BitWidth == 224)
3868 return &AMDGPU::SGPR_224RegClass;
3869 if (BitWidth == 256)
3870 return &AMDGPU::SGPR_256RegClass;
3871 if (BitWidth == 288)
3872 return &AMDGPU::SGPR_288RegClass;
3873 if (BitWidth == 320)
3874 return &AMDGPU::SGPR_320RegClass;
3875 if (BitWidth == 352)
3876 return &AMDGPU::SGPR_352RegClass;
3877 if (BitWidth == 384)
3878 return &AMDGPU::SGPR_384RegClass;
3879 if (BitWidth == 512)
3880 return &AMDGPU::SGPR_512RegClass;
3881 if (BitWidth == 1024)
3882 return &AMDGPU::SGPR_1024RegClass;
3883
3884 return nullptr;
3885}
3886
3888 Register Reg) const {
3889 const TargetRegisterClass *RC;
3890 if (Reg.isVirtual())
3891 RC = MRI.getRegClass(Reg);
3892 else
3893 RC = getPhysRegBaseClass(Reg);
3894 return RC && isSGPRClass(RC);
3895}
3896
3897const TargetRegisterClass *
3899 unsigned Size = getRegSizeInBits(*SRC);
3900
3901 switch (SRC->getID()) {
3902 default:
3903 break;
3904 case AMDGPU::VS_32_Lo256RegClassID:
3905 case AMDGPU::VS_64_Lo256RegClassID:
3906 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3907 }
3908
3909 const TargetRegisterClass *VRC =
3910 getAllocatableClass(getVGPRClassForBitWidth(Size));
3911 assert(VRC && "Invalid register class size");
3912 return VRC;
3913}
3914
3915const TargetRegisterClass *
3917 unsigned Size = getRegSizeInBits(*SRC);
3919 assert(ARC && "Invalid register class size");
3920 return ARC;
3921}
3922
3923const TargetRegisterClass *
3925 unsigned Size = getRegSizeInBits(*SRC);
3927 assert(ARC && "Invalid register class size");
3928 return ARC;
3929}
3930
3931const TargetRegisterClass *
3933 unsigned Size = getRegSizeInBits(*VRC);
3934 if (Size == 32)
3935 return &AMDGPU::SGPR_32RegClass;
3937 assert(SRC && "Invalid register class size");
3938 return SRC;
3939}
3940
3941const TargetRegisterClass *
3943 const TargetRegisterClass *SubRC,
3944 unsigned SubIdx) const {
3945 // Ensure this subregister index is aligned in the super register.
3946 const TargetRegisterClass *MatchRC =
3947 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3948 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3949}
3950
3951bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3954 return !ST.hasMFMAInlineLiteralBug();
3955
3956 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3957 OpType <= AMDGPU::OPERAND_SRC_LAST;
3958}
3959
3960bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3961 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3962 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3964}
3965
3966/// Returns a lowest register that is not used at any point in the function.
3967/// If all registers are used, then this function will return
3968/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3969/// highest unused register.
3971 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3972 const MachineFunction &MF, bool ReserveHighestRegister) const {
3973 // Never offer VCC as an unused register.
3974 auto isVCC = [](MCRegister Reg) {
3975 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3976 };
3977
3978 if (ReserveHighestRegister) {
3979 for (MCRegister Reg : reverse(*RC))
3980 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3981 return Reg;
3982 } else {
3983 for (MCRegister Reg : *RC)
3984 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3985 return Reg;
3986 }
3987 return MCRegister();
3988}
3989
3991 const RegisterBankInfo &RBI,
3992 Register Reg) const {
3993 auto *RB = RBI.getRegBank(Reg, MRI, *this);
3994 if (!RB)
3995 return false;
3996
3997 return !RBI.isDivergentRegBank(RB);
3998}
3999
4001 unsigned EltSize) const {
4002 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
4003 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
4004
4005 const unsigned RegHalves = RegBitWidth / 16;
4006 const unsigned EltHalves = EltSize / 2;
4007 assert(RegSplitParts.size() + 1 >= EltHalves);
4008
4009 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
4010 const unsigned NumParts = RegHalves / EltHalves;
4011
4012 return ArrayRef(Parts.data(), NumParts);
4013}
4014
4017 Register Reg) const {
4018 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
4019}
4020
4021const TargetRegisterClass *
4023 const MachineOperand &MO) const {
4024 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
4025 return getSubRegisterClass(SrcRC, MO.getSubReg());
4026}
4027
4029 Register Reg) const {
4030 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4031 // Registers without classes are unaddressable, SGPR-like registers.
4032 return RC && isVGPRClass(RC);
4033}
4034
4036 Register Reg) const {
4037 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4038
4039 // Registers without classes are unaddressable, SGPR-like registers.
4040 return RC && isAGPRClass(RC);
4041}
4042
4044 MachineFunction &MF) const {
4045 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
4046 switch (RC->getID()) {
4047 default:
4048 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
4049 case AMDGPU::VGPR_32RegClassID:
4050 return std::min(
4051 ST.getMaxNumVGPRs(
4052 MinOcc,
4054 ST.getMaxNumVGPRs(MF));
4055 case AMDGPU::SGPR_32RegClassID:
4056 case AMDGPU::SGPR_LO16RegClassID:
4057 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
4058 }
4059}
4060
4062 unsigned Idx) const {
4063 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
4064 case AMDGPU::RegisterPressureSets::VGPR_32:
4065 case AMDGPU::RegisterPressureSets::AGPR_32:
4066 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
4067 const_cast<MachineFunction &>(MF));
4068 case AMDGPU::RegisterPressureSets::SReg_32:
4069 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
4070 const_cast<MachineFunction &>(MF));
4071 }
4072
4073 llvm_unreachable("Unexpected register pressure set!");
4074}
4075
4076const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
4077 static const int Empty[] = { -1 };
4078
4079 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
4080 return Empty;
4081
4082 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
4083}
4084
4086 ArrayRef<MCPhysReg> Order,
4088 const MachineFunction &MF,
4089 const VirtRegMap *VRM,
4090 const LiveRegMatrix *Matrix) const {
4091
4092 const MachineRegisterInfo &MRI = MF.getRegInfo();
4093 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4094
4095 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
4096
4097 switch (Hint.first) {
4098 case AMDGPURI::Size32: {
4099 Register Paired = Hint.second;
4100 assert(Paired);
4101 Register PairedPhys;
4102 if (Paired.isPhysical()) {
4103 PairedPhys =
4104 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
4105 } else if (VRM && VRM->hasPhys(Paired)) {
4106 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
4107 &AMDGPU::VGPR_32RegClass);
4108 }
4109
4110 // Prefer the paired physreg.
4111 if (PairedPhys)
4112 // isLo(Paired) is implicitly true here from the API of
4113 // getMatchingSuperReg.
4114 Hints.push_back(PairedPhys);
4115 return false;
4116 }
4117 case AMDGPURI::Size16: {
4118 Register Paired = Hint.second;
4119 assert(Paired);
4120 Register PairedPhys;
4121 if (Paired.isPhysical()) {
4122 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
4123 } else if (VRM && VRM->hasPhys(Paired)) {
4124 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
4125 }
4126
4127 // First prefer the paired physreg.
4128 if (PairedPhys)
4129 Hints.push_back(PairedPhys);
4130 else {
4131 // Add all the lo16 physregs.
4132 // When the Paired operand has not yet been assigned a physreg it is
4133 // better to try putting VirtReg in a lo16 register, because possibly
4134 // later Paired can be assigned to the overlapping register and the COPY
4135 // can be eliminated.
4136 for (MCPhysReg PhysReg : Order) {
4137 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
4138 continue;
4139 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
4140 !MRI.isReserved(PhysReg))
4141 Hints.push_back(PhysReg);
4142 }
4143 }
4144 return false;
4145 }
4146 default:
4147 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
4148 VRM);
4149 }
4150}
4151
4153 // Not a callee saved register.
4154 return AMDGPU::SGPR30_SGPR31;
4155}
4156
4157const TargetRegisterClass *
4159 const RegisterBank &RB) const {
4160 switch (RB.getID()) {
4161 case AMDGPU::VGPRRegBankID:
4163 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
4164 case AMDGPU::VCCRegBankID:
4165 assert(Size == 1);
4166 return getWaveMaskRegClass();
4167 case AMDGPU::SGPRRegBankID:
4168 return getSGPRClassForBitWidth(std::max(32u, Size));
4169 case AMDGPU::AGPRRegBankID:
4170 return getAGPRClassForBitWidth(std::max(32u, Size));
4171 default:
4172 llvm_unreachable("unknown register bank");
4173 }
4174}
4175
4176const TargetRegisterClass *
4178 const MachineRegisterInfo &MRI) const {
4179 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
4180 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
4181 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
4182
4183 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
4184 return getAllocatableClass(RC);
4185
4186 return nullptr;
4187}
4188
4190 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
4191}
4192
4194 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4195}
4196
4198 // VGPR tuples have an alignment requirement on gfx90a variants.
4199 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
4200 : &AMDGPU::VReg_64RegClass;
4201}
4202
4203// Find reaching register definition
4207 LiveIntervals *LIS) const {
4208 auto &MDT = LIS->getDomTree();
4209 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
4210 SlotIndex DefIdx;
4211
4212 if (Reg.isVirtual()) {
4213 if (!LIS->hasInterval(Reg))
4214 return nullptr;
4215 LiveInterval &LI = LIS->getInterval(Reg);
4216 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
4217 : MRI.getMaxLaneMaskForVReg(Reg);
4218 VNInfo *V = nullptr;
4219 if (LI.hasSubRanges()) {
4220 for (auto &S : LI.subranges()) {
4221 if ((S.LaneMask & SubLanes) == SubLanes) {
4222 V = S.getVNInfoAt(UseIdx);
4223 break;
4224 }
4225 }
4226 } else {
4227 V = LI.getVNInfoAt(UseIdx);
4228 }
4229 if (!V)
4230 return nullptr;
4231 DefIdx = V->def;
4232 } else {
4233 // Find last def.
4234 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
4235 LiveRange &LR = LIS->getRegUnit(Unit);
4236 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
4237 if (!DefIdx.isValid() ||
4238 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
4239 LIS->getInstructionFromIndex(V->def)))
4240 DefIdx = V->def;
4241 } else {
4242 return nullptr;
4243 }
4244 }
4245 }
4246
4247 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
4248
4249 if (!Def || !MDT.dominates(Def, &Use))
4250 return nullptr;
4251
4252 assert(Def->modifiesRegister(Reg, this));
4253
4254 return Def;
4255}
4256
4258 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4259
4260 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4261 AMDGPU::SReg_32RegClass,
4262 AMDGPU::AGPR_32RegClass } ) {
4263 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4264 return Super;
4265 }
4266 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4267 &AMDGPU::VGPR_32RegClass)) {
4268 return Super;
4269 }
4270
4271 return AMDGPU::NoRegister;
4272}
4273
4275 if (!ST.needsAlignedVGPRs())
4276 return true;
4277
4278 if (isVGPRClass(&RC))
4279 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4280 if (isAGPRClass(&RC))
4281 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4282 if (isVectorSuperClass(&RC))
4283 return RC.hasSuperClassEq(
4284 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4285
4286 assert(&RC != &AMDGPU::VS_64RegClass);
4287
4288 return true;
4289}
4290
4293 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4294}
4295
4298 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4299}
4300
4303 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4304}
4305
4306unsigned
4308 unsigned SubReg) const {
4309 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4310 case SIRCFlags::HasSGPR:
4311 return std::min(128u, getSubRegIdxSize(SubReg));
4312 case SIRCFlags::HasAGPR:
4313 case SIRCFlags::HasVGPR:
4315 return std::min(32u, getSubRegIdxSize(SubReg));
4316 default:
4317 break;
4318 }
4319 return 0;
4320}
4321
4323 const TargetRegisterClass &RC,
4324 bool IncludeCalls) const {
4325 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
4327 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4328 ? RC.getRegisters().take_front(NumArchVGPRs)
4329 : RC.getRegisters();
4330 for (MCPhysReg Reg : reverse(Registers)) {
4331 if (Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
4332 MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4333 return getHWRegIndex(Reg) + 1;
4334 }
4335 return 0;
4336}
4337
4340 const MachineFunction &MF) const {
4342 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4343 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4344 RegFlags.push_back("WWM_REG");
4345 return RegFlags;
4346}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill, bool NeedsCFI)
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static cl::opt< bool > EnableSpillCFISavedRegs("amdgpu-spill-cfi-saved-regs", cl::desc("Enable spilling the registers required for CFI emission"), cl::ReallyHidden, cl::init(false), cl::ZeroOrMore)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
bool test(unsigned Idx) const
Returns true if bit Idx is set.
Definition BitVector.h:482
bool empty() const
Returns whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(AsmPrinterFlagTy Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
std::pair< unsigned, Register > getRegAllocationHint(Register VReg) const
getRegAllocationHint - Return the register allocation hint for the specified virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
MachineInstr * buildCFIForSGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const
Create a CFI index describing a spill of a SGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForVRegToVRegSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const
Create a CFI index describing a spill of the VGPR/AGPR Reg to another VGPR/AGPR RegCopy and build a M...
MachineInstr * buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const
Create a CFI index describing a spill of a VGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR, const int Lane) const
Create a CFI index describing a spill of an SGPR to a single lane of a VGPR and build a MachineInstr ...
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns the lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr, bool NeedsCFI=false) const
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
void buildCFIForBlockCSRStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register BlockReg, int64_t Offset) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false, bool NeedsCFI=false) const
If OnlyToVGPR is true, this will only succeed if it manages to find a free VGPR lane to spill to.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
bool isCFISavedRegsSpillEnabled() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same...
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal cal...
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physica...
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
Returns the physical register mapped to the specified virtual register.
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
Returns true if the specified virtual register is mapped to a physical register.
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:257
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:263
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:264
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:258
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:27
@ HasVGPR
Definition SIDefines.h:25
@ RegKindMask
Definition SIDefines.h:30
@ HasAGPR
Definition SIDefines.h:26
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67