LLVM 23.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
// Command-line switch controlling whether SGPR spills go to VGPR lanes
// (default) instead of scratch memory; read into SpillSGPRToVGPR in the
// SIRegisterInfo constructor below.
// NOTE(review): the declaration line (source line 32, presumably
// "static cl::opt<bool> EnableSpillSGPRToVGPR(") was lost in this capture —
// restore from the upstream file before editing.
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
// Out-of-class definitions for SIRegisterInfo's static lookup tables. Both
// are filled exactly once, by the llvm::call_once lambdas in the
// SIRegisterInfo constructor below.
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
// Maps a width in DWORDs to (row index + 1) of SubRegFromChannelTable; a zero
// entry marks an unsupported width. Widths 1..8 select rows 0..7 and width 16
// selects row 8, matching the shifted-by-one convention documented above.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, // widths 0-8 (0 DWORDs unsupported)
    0, 0, 0, 0, 0, 0, 0,       // widths 9-15 unsupported
    9};                        // width 16 -> row 8
47
// Report a codegen-unsupported condition for function Fn, anchored at MI's
// debug location, with message ErrMsg (used by the SGPR-to-memory spill paths
// below when they hit a case they cannot handle, e.g. a live SCC).
// NOTE(review): source line 50 — presumably the "Fn.getContext().diagnose("
// call that this argument list belongs to — was lost in this capture; confirm
// against the upstream file before editing.
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
// NOTE(review): the struct's own declaration line (source line 78, presumably
// "struct SGPRSpillBuilder {") and several member declarations (source lines
// 86-88 and 107-114 — presumably SuperReg/MI/SplitParts and the
// RS/MBB/MF/MFI/TII/TRI/IsWave32/ExecReg members referenced by the methods
// below) were lost in this capture; restore from upstream before editing.
// Per-VGPR layout of a spill: how many SGPR lanes fit in one VGPR, how many
// VGPRs are needed in total, and the lanemask covering the used lanes.
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
// Wave-size dependent opcodes, chosen in the constructor body below.
115 unsigned MovOpc;
116 unsigned NotOpc;
117
// Delegating constructor: derives the spilled register and its kill flag
// from operand 0 of the spill pseudo MI.
// NOTE(review): the signature lines (source lines 118-120) are missing here.
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
// Main constructor. NOTE(review): its signature lines (source lines 124-125)
// and one init-list line (source line 130, presumably the IsWave32/ExecReg
// initializers) are missing from this capture.
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
// Compute the per-VGPR split of this spill from NumSubRegs and the wave size.
// NOTE(review): the method signature and local "Data" declaration (source
// lines 150-151) are missing from this capture.
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
// Set up for an SGPR<->memory spill: scavenge a temporary VGPR (falling back
// to v0), reserve an emergency stack slot for it, try to scavenge an SGPR to
// save exec, and save the lanes of TmpVGPR that the spill will clobber.
// NOTE(review): three code lines were lost in this capture — source line 216
// (presumably the BuildMI that copies ExecReg into SavedExecReg) and source
// lines 220 and 238 (the statements guarded by the two "if (!TmpVGPRLive)"
// below, presumably marking TmpVGPR implicit-def on I). As captured, both
// ifs visually attach to the WRONG following statement — do not edit this
// control flow without restoring the upstream text.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register as actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Save exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
// Undo prepare(): reload the saved lanes of TmpVGPR, restore exec (either
// from SavedExecReg or by flipping it back with s_not), and release the
// emergency scavenging slot.
// NOTE(review): three code lines were lost in this capture — source line 261
// (presumably ".addReg(SavedExecReg, ...)" completing the exec-restore
// BuildMI), line 265 (the body of the "if (!TmpVGPRLive) {" block, presumably
// the implicit-use of the reload), and line 273 (the statement guarded by the
// second "if (!TmpVGPRLive)"). Restore the upstream text before editing.
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
// Retarget the builder's insertion point within the same function.
// NOTE(review): the method signature (source line 319, presumably
// "void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)")
// was lost in this capture — restore from upstream before editing.
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
// SIRegisterInfo constructor: wires up the tablegen'd base class, validates
// the generated subregister lane masks, marks M0/hi16 register units as
// pressure-ignored, and lazily (once per process) fills the static
// RegSplitParts and SubRegFromChannelTable lookup tables.
// NOTE(review): the constructor's signature line (source line 328) and one
// line inside the second lambda (source line 380, presumably a bounds
// check/assert on Width before indexing SubRegFromChannelTableWidthMap) were
// lost in this capture — restore from upstream before editing.
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(
344 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
345 for (auto Reg : AMDGPU::VGPR_16RegClass) {
346 if (AMDGPU::isHi16Reg(Reg, *this))
347 RegPressureIgnoredUnits.set(
348 static_cast<unsigned>(*regunits(Reg).begin()));
349 }
350
351 // HACK: Until this is fully tablegen'd.
352 static llvm::once_flag InitializeRegSplitPartsFlag;
353
// Fills RegSplitParts: for each 16-bit-multiple subreg index, record it at
// [size/16 - 1][offset/size] so getRegSplitParts can slice a register class.
354 static auto InitializeRegSplitPartsOnce = [this]() {
355 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
356 unsigned Size = getSubRegIdxSize(Idx);
357 if (Size & 15)
358 continue;
359 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
360 unsigned Pos = getSubRegIdxOffset(Idx);
361 if (Pos % Size)
362 continue;
363 Pos /= Size;
364 if (Vec.empty()) {
365 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
366 Vec.resize(MaxNumParts);
367 }
368 Vec[Pos] = Idx;
369 }
370 };
371
372 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
373
// Fills SubRegFromChannelTable: for each subreg index, record it by
// (width row, DWORD offset) so getSubRegFromChannel can look it up.
374 static auto InitializeSubRegFromChannelTableOnce = [this]() {
375 for (auto &Row : SubRegFromChannelTable)
376 Row.fill(AMDGPU::NoSubRegister);
377 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
378 unsigned Width = getSubRegIdxSize(Idx) / 32;
379 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
381 Width = SubRegFromChannelTableWidthMap[Width];
382 if (Width == 0)
383 continue;
384 unsigned TableIdx = Width - 1;
385 assert(TableIdx < SubRegFromChannelTable.size());
386 assert(Offset < SubRegFromChannelTable[TableIdx].size());
387 SubRegFromChannelTable[TableIdx][Offset] = Idx;
388 }
389 };
390
391 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
392 llvm::call_once(InitializeSubRegFromChannelTableFlag,
393 InitializeSubRegFromChannelTableOnce);
394}
395
396void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
397 MCRegister Reg) const {
398 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
399 Reserved.set(*R);
400}
401
// Return the callee-saved register list for the given calling convention;
// unknown conventions get a dummy empty list so RegisterClassInfo stays happy.
// NOTE(review): this capture lost the function signature (source line 403)
// and the case labels at source lines 405, 408-409, 412-413 and 416
// (presumably the CallingConv::Fast/Cold, AMDGPU_Gfx* and CS_ChainPreserve
// cases that select the returns below) — restore from upstream before
// editing; the returns below are unreachable as captured.
402// Forced to be here by one .inc
404 const MachineFunction *MF) const {
406 switch (CC) {
407 case CallingConv::C:
410 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
411 : CSR_AMDGPU_SaveList;
414 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
415 : CSR_AMDGPU_SI_Gfx_SaveList;
417 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
418 default: {
419 // Dummy to not crash RegisterClassInfo.
420 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
421 return &NoCalleeSavedReg;
422 }
423 }
424}
425
// No registers are saved via copy on AMDGPU.
// NOTE(review): the function name line (source line 427) was lost in this
// capture — restore from upstream before editing.
426const MCPhysReg *
428 return nullptr;
429}
430
// Return the preserved-register mask for calls with the given convention;
// nullptr means nothing is preserved.
// NOTE(review): this capture lost the function's first signature line (source
// line 431) and the case labels at source lines 435-436, 439-440 and 443-444
// (presumably Fast/Cold, AMDGPU_Gfx* and the no-return convention cases) —
// restore from upstream before editing; several returns below are unreachable
// as captured.
432 CallingConv::ID CC) const {
433 switch (CC) {
434 case CallingConv::C:
437 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
438 : CSR_AMDGPU_RegMask;
441 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
442 : CSR_AMDGPU_SI_Gfx_RegMask;
445 // Calls to these functions never return, so we can pretend everything is
446 // preserved.
447 return AMDGPU_AllVGPRs_RegMask;
448 default:
449 return nullptr;
450 }
451}
452
// Mask preserving no registers at all.
// NOTE(review): the signature (source line 453, presumably
// "const uint32_t *SIRegisterInfo::getNoPreservedMask() const {") was lost in
// this capture — restore from upstream before editing.
454 return CSR_AMDGPU_NoRegs_RegMask;
455}
456
// True for v0..v7, the VGPRs chain functions may use as scratch.
// NOTE(review): the signature (source line 457) was lost in this capture —
// restore from upstream before editing.
458 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
459}
460
// On subtargets with MAI instructions, widen a pure VGPR or AGPR class to the
// combined AV class of the same size (and alignment variant), giving the
// allocator the largest legal superclass.
// NOTE(review): this capture lost the signature lines (source lines 461-462)
// and the final fallthrough return (source line 513, presumably delegating to
// the base TargetRegisterInfo implementation or returning RC) — restore from
// upstream before editing.
463 const MachineFunction &MF) const {
464 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
465 // equivalent AV class. If used one, the verifier will crash after
466 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
467 // until Instruction selection.
468 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
469 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
470 return &AMDGPU::AV_32RegClass;
471 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
472 return &AMDGPU::AV_64RegClass;
473 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
474 RC == &AMDGPU::AReg_64_Align2RegClass)
475 return &AMDGPU::AV_64_Align2RegClass;
476 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
477 return &AMDGPU::AV_96RegClass;
478 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
479 RC == &AMDGPU::AReg_96_Align2RegClass)
480 return &AMDGPU::AV_96_Align2RegClass;
481 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
482 return &AMDGPU::AV_128RegClass;
483 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
484 RC == &AMDGPU::AReg_128_Align2RegClass)
485 return &AMDGPU::AV_128_Align2RegClass;
486 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
487 return &AMDGPU::AV_160RegClass;
488 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
489 RC == &AMDGPU::AReg_160_Align2RegClass)
490 return &AMDGPU::AV_160_Align2RegClass;
491 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
492 return &AMDGPU::AV_192RegClass;
493 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
494 RC == &AMDGPU::AReg_192_Align2RegClass)
495 return &AMDGPU::AV_192_Align2RegClass;
496 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
497 return &AMDGPU::AV_256RegClass;
498 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
499 RC == &AMDGPU::AReg_256_Align2RegClass)
500 return &AMDGPU::AV_256_Align2RegClass;
501 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
502 return &AMDGPU::AV_512RegClass;
503 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
504 RC == &AMDGPU::AReg_512_Align2RegClass)
505 return &AMDGPU::AV_512_Align2RegClass;
506 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
507 return &AMDGPU::AV_1024RegClass;
508 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
509 RC == &AMDGPU::AReg_1024_Align2RegClass)
510 return &AMDGPU::AV_1024_Align2RegClass;
511 }
512
514}
515
// Pick the register used to address the frame: FP when frame lowering says
// one is needed, otherwise SP — and no register at all (immediate 0) for
// bottom-of-stack (entry/chain) functions without an FP.
// NOTE(review): this capture lost the signature (source line 516) and source
// line 518 (presumably the SIMachineFunctionInfo *FuncInfo query used below)
// — restore from upstream before editing.
517 const SIFrameLowering *TFI = ST.getFrameLowering();
519
520 // During ISel lowering we always reserve the stack pointer in entry and chain
521 // functions, but never actually want to reference it when accessing our own
522 // frame. If we need a frame pointer we use it, but otherwise we can just use
523 // an immediate "0" which we represent by returning NoRegister.
524 if (FuncInfo->isBottomOfStack()) {
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
526 }
527 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
528 : FuncInfo->getStackPtrOffsetReg();
529}
530
// A base pointer is needed exactly when the stack must be realigned.
// NOTE(review): the signature (source line 531) was lost in this capture —
// restore from upstream before editing.
532 // When we need stack realignment, we can't reference off of the
533 // stack pointer, so we reserve a base pointer.
534 return shouldRealignStack(MF);
535}
536
537Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
538
// Trivial accessors for the precomputed "all registers of a kind" masks.
// NOTE(review): all four signatures (source lines 539, 543, 547 and 551 —
// presumably the getAllVGPRRegMask / getAllAGPRRegMask /
// getAllVectorRegMask / getAllAllocatableSRegMask definitions) were lost in
// this capture — restore from upstream before editing.
540 return AMDGPU_AllVGPRs_RegMask;
541}
542
544 return AMDGPU_AllAGPRs_RegMask;
545}
546
548 return AMDGPU_AllVectorRegs_RegMask;
549}
550
552 return AMDGPU_AllAllocatableSRegs_RegMask;
553}
554
555unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
556 unsigned NumRegs) {
557 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
558 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
559 assert(NumRegIndex && "Not implemented");
560 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
561 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
562}
563
// Return the highest-numbered SGPR tuple of class RC whose base index is
// aligned to Align and still within the function's SGPR budget; used below to
// reserve a private segment buffer (s[budget-4 .. budget-1], align 4).
// NOTE(review): this capture lost the signatures at source lines 564-565
// (getAlignedHighSGPRForRC) and 573 (reservedPrivateSegmentBufferReg) —
// restore from upstream before editing.
566 const unsigned Align,
567 const TargetRegisterClass *RC) const {
568 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
569 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
570 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
571}
572
574 const MachineFunction &MF) const {
575 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
576}
577
// Build the full reserved-register set for MF: special/HW registers, all
// SGPR/VGPR/AGPR tuples that exceed the subtarget's per-function limits, and
// registers claimed by frame lowering, WWM allocation and spilling.
// NOTE(review): this capture lost the function signature (source line 578)
// and source line 582 (presumably the SIMachineFunctionInfo *MFI query that
// the MFI-> uses below rely on) — restore from upstream before editing.
579 BitVector Reserved(getNumRegs());
580 Reserved.set(AMDGPU::MODE);
581
583
584 // Reserve special purpose registers.
585 //
586 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
587 // this seems likely to result in bugs, so I'm marking them as reserved.
588 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
589 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
590
591 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
592 reserveRegisterTuples(Reserved, AMDGPU::M0);
593
594 // Reserve src_vccz, src_execz, src_scc.
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
596 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
597 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
598
599 // Reserve the memory aperture registers
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
606
607 // Reserve async counters pseudo registers
608 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
609 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
610
611 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
613
614 // Reserve xnack_mask registers - support is not implemented in Codegen.
615 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
616
617 // Reserve lds_direct register - support is not implemented in Codegen.
618 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
619
620 // Reserve Trap Handler registers - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::TBA);
622 reserveRegisterTuples(Reserved, AMDGPU::TMA);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
629 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
630 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
631
632 // Reserve null register - it shall never be allocated
633 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
634
635 // Reserve SGPRs.
636 //
637 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
638 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
// Reserve every SGPR tuple that would reach past the per-function SGPR budget.
639 for (const TargetRegisterClass *RC : regclasses()) {
640 if (RC->isBaseClass() && isSGPRClass(RC)) {
641 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
642 for (MCPhysReg Reg : *RC) {
643 unsigned Index = getHWRegIndex(Reg);
644 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
645 Reserved.set(Reg);
646 }
647 }
648 }
649
650 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
651 if (ScratchRSrcReg != AMDGPU::NoRegister) {
652 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
653 // need to spill.
654 // TODO: May need to reserve a VGPR if doing LDS spilling.
655 reserveRegisterTuples(Reserved, ScratchRSrcReg);
656 }
657
658 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
659 if (LongBranchReservedReg)
660 reserveRegisterTuples(Reserved, LongBranchReservedReg);
661
662 // We have to assume the SP is needed in case there are calls in the function,
663 // which is detected after the function is lowered. If we aren't really going
664 // to need SP, don't bother reserving it.
665 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
666 if (StackPtrReg) {
667 reserveRegisterTuples(Reserved, StackPtrReg);
668 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
669 }
670
671 MCRegister FrameReg = MFI->getFrameOffsetReg();
672 if (FrameReg) {
673 reserveRegisterTuples(Reserved, FrameReg);
674 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
675 }
676
677 if (hasBasePointer(MF)) {
678 MCRegister BasePtrReg = getBaseRegister();
679 reserveRegisterTuples(Reserved, BasePtrReg);
680 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
681 }
682
683 // FIXME: Use same reserved register introduced in D149775
684 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
685 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
686 if (ExecCopyReg)
687 reserveRegisterTuples(Reserved, ExecCopyReg);
688
689 // Reserve VGPRs/AGPRs.
690 //
691 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
692
693 for (const TargetRegisterClass *RC : regclasses()) {
694 if (RC->isBaseClass() && isVGPRClass(RC)) {
695 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
696 for (MCPhysReg Reg : *RC) {
697 unsigned Index = getHWRegIndex(Reg);
698 if (Index + NumRegs > MaxNumVGPRs)
699 Reserved.set(Reg);
700 }
701 }
702 }
703
704 // Reserve all the AGPRs if there are no instructions to use it.
705 if (!ST.hasMAIInsts())
706 MaxNumAGPRs = 0;
707 for (const TargetRegisterClass *RC : regclasses()) {
708 if (RC->isBaseClass() && isAGPRClass(RC)) {
709 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
710 for (MCPhysReg Reg : *RC) {
711 unsigned Index = getHWRegIndex(Reg);
712 if (Index + NumRegs > MaxNumAGPRs)
713 Reserved.set(Reg);
714 }
715 }
716 }
717
718 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
719 // VGPR available at all times.
720 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
721 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
722 }
723
724 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
725 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
726 // wwm-regalloc and it would be empty otherwise.
727 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
728 if (!NonWWMRegMask.empty()) {
729 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
730 RegI < RegE; ++RegI) {
731 if (NonWWMRegMask.test(RegI))
732 reserveRegisterTuples(Reserved, RegI);
733 }
734 }
735
736 for (Register Reg : MFI->getWWMReservedRegs())
737 reserveRegisterTuples(Reserved, Reg);
738
739 // FIXME: Stop using reserved registers for this.
740 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
741 reserveRegisterTuples(Reserved, Reg);
742
743 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
744 reserveRegisterTuples(Reserved, Reg);
745
746 return Reserved;
747}
748
// A physical register may be clobbered by inline asm iff it is not reserved.
// NOTE(review): the first signature line (source line 749) was lost in this
// capture — restore from upstream before editing.
750 MCRegister PhysReg) const {
751 return !MF.getRegInfo().isReserved(PhysReg);
752}
753
// Entry/chain ("bottom of stack") functions never need realignment; others
// defer to the generic implementation.
// NOTE(review): this capture lost the signature lines (source lines 754-755,
// including the Info query used below) and the final return (source line 764,
// presumably delegating to TargetRegisterInfo::shouldRealignStack(MF)) —
// restore from upstream before editing.
756 // On entry or in chain functions, the base address is 0, so it can't possibly
757 // need any more alignment.
758
759 // FIXME: Should be able to specify the entry frame alignment per calling
760 // convention instead.
761 if (Info->isBottomOfStack())
762 return false;
763
765}
766
// Entry functions need the scavenger only when they touch the stack or make
// calls; all other functions may need it for callee-saved register handling.
// NOTE(review): the signature lines (source lines 767-768, including the Info
// query used below) were lost in this capture — restore from upstream before
// editing.
769 if (Info->isEntryFunction()) {
770 const MachineFrameInfo &MFI = Fn.getFrameInfo();
771 return MFI.hasStackObjects() || MFI.hasCalls();
772 }
773
774 // May need scavenger for dealing with callee saved registers.
775 return true;
776}
777
// Three frame-index policy hooks. NOTE(review): each function's opening
// signature line (source lines 778, 787 and 793 — presumably
// requiresFrameIndexScavenging, requiresFrameIndexReplacementScavenging and
// requiresVirtualBaseRegisters) was lost in this capture — restore from
// upstream before editing.
779 const MachineFunction &MF) const {
780 // Do not use frame virtual registers. They used to be used for SGPRs, but
781 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
782 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
783 // spill.
784 return false;
785}
786
// Replacement scavenging is only needed when the function has stack objects.
788 const MachineFunction &MF) const {
789 const MachineFrameInfo &MFI = MF.getFrameInfo();
790 return MFI.hasStackObjects();
791}
792
794 const MachineFunction &) const {
795 // There are no special dedicated stack or frame pointers.
796 return true;
797}
798
// Read the immediate 'offset' operand of a scratch (MUBUF/FLAT-scratch)
// instruction.
// NOTE(review): the signature lines (source lines 799-800) were lost in this
// capture — restore from upstream before editing.
801
802 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
803 AMDGPU::OpName::offset);
804 return MI->getOperand(OffIdx).getImm();
805}
806
// Return the constant offset folded next to the frame-index operand Idx:
// for the v_add forms it is the other source operand's immediate (0 if that
// operand is not an immediate); for scratch memory instructions it is the
// named 'offset' operand.
// NOTE(review): this capture lost the first signature line (source line 807)
// and the guards/return at source lines 826 and 835 (presumably the early
// "return 0" guard for non-MUBUF/non-FLAT-scratch instructions and the final
// "return getScratchInstrOffset(MI);") — as captured, "return 0;" below is
// unconditional and the tail is unreachable; restore from upstream before
// editing.
808 int Idx) const {
809 switch (MI->getOpcode()) {
810 case AMDGPU::V_ADD_U32_e32:
811 case AMDGPU::V_ADD_U32_e64:
812 case AMDGPU::V_ADD_CO_U32_e32: {
813 int OtherIdx = Idx == 1 ? 2 : 1;
814 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
815 return OtherOp.isImm() ? OtherOp.getImm() : 0;
816 }
817 case AMDGPU::V_ADD_CO_U32_e64: {
818 int OtherIdx = Idx == 2 ? 3 : 2;
819 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
820 return OtherOp.isImm() ? OtherOp.getImm() : 0;
821 }
822 default:
823 break;
824 }
825
827 return 0;
828
829 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::vaddr) ||
831 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
832 AMDGPU::OpName::saddr))) &&
833 "Should never see frame index on non-address operand");
834
836}
837
// True if the add MI computes FrameIndex + (immediate or VGPR) — i.e. the
// non-FI source is either an immediate or a VGPR, in either operand order.
// NOTE(review): the first signature line (source line 838, presumably
// "static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,") was lost in
// this capture — restore from upstream before editing.
839 const MachineInstr &MI) {
840 assert(MI.getDesc().isAdd());
841 const MachineOperand &Src0 = MI.getOperand(1);
842 const MachineOperand &Src1 = MI.getOperand(2);
843
844 if (Src0.isFI()) {
845 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
846 Src1.getReg()));
847 }
848
849 if (Src1.isFI()) {
850 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
851 Src0.getReg()));
852 }
853
854 return false;
855}
856
// Decide whether resolving MI's frame index at offset Offset requires
// materializing a frame base register: handled v_add forms answer directly;
// scratch memory instructions need one when the combined offset is not
// encodable.
// NOTE(review): this capture lost the signature (source line 857) and the
// guard lines at source lines 893, 899 and 903 (presumably the
// non-MUBUF/non-FLAT-scratch early-out guarding "return false;", the
// isMUBUF guard before the MUBUF-offset return, and the flags argument
// completing the isLegalFLATOffset call) — as captured, "return false;"
// below appears unconditional; restore from upstream before editing.
858 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
859 switch (MI->getOpcode()) {
860 case AMDGPU::V_ADD_U32_e32: {
861 // TODO: We could handle this but it requires work to avoid violating
862 // operand restrictions.
863 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
864 !isFIPlusImmOrVGPR(*this, *MI))
865 return false;
866 [[fallthrough]];
867 }
868 case AMDGPU::V_ADD_U32_e64:
869 // FIXME: This optimization is barely profitable hasFlatScratchEnabled
870 // as-is.
871 //
872 // Much of the benefit with the MUBUF handling is we avoid duplicating the
873 // shift of the frame register, which isn't needed with scratch.
874 //
875 // materializeFrameBaseRegister doesn't know the register classes of the
876 // uses, and unconditionally uses an s_add_i32, which will end up using a
877 // copy for the vector uses.
878 return !ST.hasFlatScratchEnabled();
879 case AMDGPU::V_ADD_CO_U32_e32:
880 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
881 !isFIPlusImmOrVGPR(*this, *MI))
882 return false;
883 // We can't deal with the case where the carry out has a use (though this
884 // should never happen)
885 return MI->getOperand(3).isDead();
886 case AMDGPU::V_ADD_CO_U32_e64:
887 // TODO: Should we check use_empty instead?
888 return MI->getOperand(1).isDead();
889 default:
890 break;
891 }
892
894 return false;
895
896 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
897
898 const SIInstrInfo *TII = ST.getInstrInfo();
900 return !TII->isLegalMUBUFImmOffset(FullOffset);
901
902 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
904}
905
// Materialize FrameIdx + Offset into a fresh virtual base register at the top
// of MBB: a plain mov when Offset is 0, otherwise an s_add_i32 (flat scratch,
// SGPR result) or a VALU add-no-carry (MUBUF, VGPR result).
// NOTE(review): this capture lost the signature (source line 906) and source
// line 917 (presumably the "MachineRegisterInfo &MRI = MF->getRegInfo();"
// that the MRI uses below rely on) — restore from upstream before editing.
907 int FrameIdx,
908 int64_t Offset) const {
909 MachineBasicBlock::iterator Ins = MBB->begin();
910 DebugLoc DL; // Defaults to "unknown"
911
912 if (Ins != MBB->end())
913 DL = Ins->getDebugLoc();
914
915 MachineFunction *MF = MBB->getParent();
916 const SIInstrInfo *TII = ST.getInstrInfo();
918 unsigned MovOpc =
919 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
920
921 Register BaseReg = MRI.createVirtualRegister(
922 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
923 : &AMDGPU::VGPR_32RegClass);
924
925 if (Offset == 0) {
926 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
927 .addFrameIndex(FrameIdx);
928 return BaseReg;
929 }
930
931 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
932
933 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
934 ? &AMDGPU::SReg_32_XM0RegClass
935 : &AMDGPU::VGPR_32RegClass);
936
937 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
938 .addImm(Offset);
939 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
940 .addFrameIndex(FrameIdx);
941
942 if (ST.hasFlatScratchEnabled()) {
943 // FIXME: Make sure scc isn't live in.
944 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
945 .addReg(OffsetReg, RegState::Kill)
946 .addReg(FIReg)
947 .setOperandDead(3); // scc
948 return BaseReg;
949 }
950
951 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
952 .addReg(OffsetReg, RegState::Kill)
953 .addReg(FIReg)
954 .addImm(0); // clamp bit
955
956 return BaseReg;
957}
958
// Rewrite the frame-index operand of MI to use BaseReg, folding Offset into
// the instruction's immediate operand where one exists. Three instruction
// shapes are handled: VOP2 adds (V_ADD_*_e32), VOP3 adds (V_ADD_*_e64), and
// direct MUBUF / flat-scratch memory accesses.
// NOTE(review): a few identifier-bearing lines are elided in this listing —
// the method's own signature line (presumably
// SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, ...))
// and the MachineRegisterInfo declaration after line 991 — confirm against the
// full source. All surviving tokens are kept byte-identical below.
960 int64_t Offset) const {
961 const SIInstrInfo *TII = ST.getInstrInfo();
962
963 switch (MI.getOpcode()) {
// VOP2 add: src1 may be the FI and src0 the immediate, or vice versa.
964 case AMDGPU::V_ADD_U32_e32:
965 case AMDGPU::V_ADD_CO_U32_e32: {
966 MachineOperand *FIOp = &MI.getOperand(2);
967 MachineOperand *ImmOp = &MI.getOperand(1);
968 if (!FIOp->isFI())
969 std::swap(FIOp, ImmOp);
970
// No immediate to fold into: just substitute the base register for the FI
// and relegalize the VOP2 operand constraints.
971 if (!ImmOp->isImm()) {
972 assert(Offset == 0);
973 FIOp->ChangeToRegister(BaseReg, false);
974 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
975 return;
976 }
977
978 int64_t TotalOffset = ImmOp->getImm() + Offset;
// The folded offset cancels out entirely: degrade the add to a plain COPY
// of BaseReg, dropping all operands past the destination.
979 if (TotalOffset == 0) {
980 MI.setDesc(TII->get(AMDGPU::COPY));
981 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
982 MI.removeOperand(I);
983
984 MI.getOperand(1).ChangeToRegister(BaseReg, false);
985 return;
986 }
987
988 ImmOp->setImm(TotalOffset);
989
990 MachineBasicBlock *MBB = MI.getParent();
991 MachineFunction *MF = MBB->getParent();
993
994 // FIXME: materializeFrameBaseRegister does not know the register class of
995 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
996 // Emit a copy so we have a legal operand and hope the register coalescer
997 // can clean it up.
998 if (isSGPRReg(MRI, BaseReg)) {
999 Register BaseRegVGPR =
1000 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1001 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1002 .addReg(BaseReg);
1003 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1004 } else {
1005 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1006 }
1007 return;
1008 }
// VOP3 add: operand positions are relative to the number of explicit defs
// (V_ADD_CO_U32_e64 also defines a carry-out).
1009 case AMDGPU::V_ADD_U32_e64:
1010 case AMDGPU::V_ADD_CO_U32_e64: {
1011 int Src0Idx = MI.getNumExplicitDefs();
1012 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1013 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1014 if (!FIOp->isFI())
1015 std::swap(FIOp, ImmOp);
1016
1017 if (!ImmOp->isImm()) {
1018 FIOp->ChangeToRegister(BaseReg, false);
1019 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1020 return;
1021 }
1022
1023 int64_t TotalOffset = ImmOp->getImm() + Offset;
// Same zero-offset degradation to COPY as in the VOP2 case above.
1024 if (TotalOffset == 0) {
1025 MI.setDesc(TII->get(AMDGPU::COPY));
1026
1027 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1028 MI.removeOperand(I);
1029
1030 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1031 } else {
1032 FIOp->ChangeToRegister(BaseReg, false);
1033 ImmOp->setImm(TotalOffset);
1034 }
1035
1036 return;
1037 }
1038 default:
1039 break;
1040 }
1041
// Not an add pseudo: patch a MUBUF / flat-scratch memory access in place.
1042 bool IsFlat = TII->isFLATScratch(MI);
1043
1044#ifndef NDEBUG
1045 // FIXME: Is it possible to be storing a frame index to itself?
1046 bool SeenFI = false;
1047 for (const MachineOperand &MO: MI.operands()) {
1048 if (MO.isFI()) {
1049 if (SeenFI)
1050 llvm_unreachable("should not see multiple frame indices");
1051
1052 SeenFI = true;
1053 }
1054 }
1055#endif
1056
// Flat-scratch addresses the frame through the scalar saddr operand; MUBUF
// addresses it through the vector vaddr operand.
1057 MachineOperand *FIOp =
1058 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1059 : AMDGPU::OpName::vaddr);
1060
1061 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1062 int64_t NewOffset = OffsetOp->getImm() + Offset;
1063
1064 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1065 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1066
1067 if (IsFlat) {
// NOTE(review): the flags argument of isLegalFLATOffset (original line 1069)
// is elided in this listing — confirm against the full source.
1068 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1070 "offset should be legal");
1071 FIOp->ChangeToRegister(BaseReg, false);
1072 OffsetOp->setImm(NewOffset);
1073 return;
1074 }
1075
1076#ifndef NDEBUG
1077 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1078 assert(SOffset->isImm() && SOffset->getImm() == 0);
1079#endif
1080
1081 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1082
1083 FIOp->ChangeToRegister(BaseReg, false);
1084 OffsetOp->setImm(NewOffset);
1085}
1086
// Conservatively decide whether MI can address BaseReg + Offset directly,
// i.e. whether resolveFrameIndex could rewrite it without further fixups.
// NOTE(review): several lines are elided in this listing — the method's
// signature line (1087), the guard(s) at lines 1102/1108 (presumably the
// mayLoadOrStore / isMUBUF dispatch), and the trailing flags argument of
// isLegalFLATOffset (1112). Confirm against the full source.
1088 Register BaseReg,
1089 int64_t Offset) const {
1090
1091 switch (MI->getOpcode()) {
// A VOP2 add can take any 32-bit literal, so any offset is representable.
1092 case AMDGPU::V_ADD_U32_e32:
1093 case AMDGPU::V_ADD_CO_U32_e32:
1094 return true;
// A VOP3 add needs either VOP3-literal support or an inlinable immediate.
1095 case AMDGPU::V_ADD_U32_e64:
1096 case AMDGPU::V_ADD_CO_U32_e64:
1097 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1098 default:
1099 break;
1100 }
1101
1103 return false;
1104
// For memory accesses, check the combined immediate against the
// instruction family's encodable offset range.
1105 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1106
1107 const SIInstrInfo *TII = ST.getInstrInfo();
1109 return TII->isLegalMUBUFImmOffset(NewOffset);
1110
1111 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1113}
1114
// Register class used for pointer values.
// NOTE(review): the signature line (1116, presumably
// SIRegisterInfo::getPointerRegClass(...)) is elided in this listing.
1115 const TargetRegisterClass *
1117 // This is inaccurate. It depends on the instruction and address space. The
1118 // only place where we should hit this is for dealing with frame indexes /
1119 // private accesses, so this is correct in that case.
1120 return &AMDGPU::VGPR_32RegClass;
1121}
1122
// Class to use when a value must be copied out of RC by the register
// scavenger/allocator: SCC cannot be copied directly, so route it through a
// 32-bit scalar register; every other class copies as itself.
// NOTE(review): the signature line (1124, presumably
// SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC)) is
// elided in this listing.
1123 const TargetRegisterClass *
1125 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1126}
1127
// Return how many 32-bit components the given spill pseudo moves.
// Regular SI_SPILL_* pseudos encode the register size in the opcode name
// (bits / 32); block spills derive the count from their lane mask operand.
// NOTE(review): the signature line (1128, presumably
// getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII))
// is elided in this listing.
1129 const SIInstrInfo *TII) {
1130
1131 unsigned Op = MI.getOpcode();
1132 switch (Op) {
1133 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1134 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1135 // FIXME: This assumes the mask is statically known and not computed at
1136 // runtime. However, some ABIs may want to compute the mask dynamically and
1137 // this will need to be updated.
1138 return llvm::popcount(
1139 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1140 case AMDGPU::SI_SPILL_S1024_SAVE:
1141 case AMDGPU::SI_SPILL_S1024_RESTORE:
1142 case AMDGPU::SI_SPILL_V1024_SAVE:
1143 case AMDGPU::SI_SPILL_V1024_RESTORE:
1144 case AMDGPU::SI_SPILL_A1024_SAVE:
1145 case AMDGPU::SI_SPILL_A1024_RESTORE:
1146 case AMDGPU::SI_SPILL_AV1024_SAVE:
1147 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1148 return 32;
1149 case AMDGPU::SI_SPILL_S512_SAVE:
1150 case AMDGPU::SI_SPILL_S512_RESTORE:
1151 case AMDGPU::SI_SPILL_V512_SAVE:
1152 case AMDGPU::SI_SPILL_V512_RESTORE:
1153 case AMDGPU::SI_SPILL_A512_SAVE:
1154 case AMDGPU::SI_SPILL_A512_RESTORE:
1155 case AMDGPU::SI_SPILL_AV512_SAVE:
1156 case AMDGPU::SI_SPILL_AV512_RESTORE:
1157 return 16;
1158 case AMDGPU::SI_SPILL_S384_SAVE:
1159 case AMDGPU::SI_SPILL_S384_RESTORE:
1160 case AMDGPU::SI_SPILL_V384_SAVE:
1161 case AMDGPU::SI_SPILL_V384_RESTORE:
1162 case AMDGPU::SI_SPILL_A384_SAVE:
1163 case AMDGPU::SI_SPILL_A384_RESTORE:
1164 case AMDGPU::SI_SPILL_AV384_SAVE:
1165 case AMDGPU::SI_SPILL_AV384_RESTORE:
1166 return 12;
1167 case AMDGPU::SI_SPILL_S352_SAVE:
1168 case AMDGPU::SI_SPILL_S352_RESTORE:
1169 case AMDGPU::SI_SPILL_V352_SAVE:
1170 case AMDGPU::SI_SPILL_V352_RESTORE:
1171 case AMDGPU::SI_SPILL_A352_SAVE:
1172 case AMDGPU::SI_SPILL_A352_RESTORE:
1173 case AMDGPU::SI_SPILL_AV352_SAVE:
1174 case AMDGPU::SI_SPILL_AV352_RESTORE:
1175 return 11;
1176 case AMDGPU::SI_SPILL_S320_SAVE:
1177 case AMDGPU::SI_SPILL_S320_RESTORE:
1178 case AMDGPU::SI_SPILL_V320_SAVE:
1179 case AMDGPU::SI_SPILL_V320_RESTORE:
1180 case AMDGPU::SI_SPILL_A320_SAVE:
1181 case AMDGPU::SI_SPILL_A320_RESTORE:
1182 case AMDGPU::SI_SPILL_AV320_SAVE:
1183 case AMDGPU::SI_SPILL_AV320_RESTORE:
1184 return 10;
1185 case AMDGPU::SI_SPILL_S288_SAVE:
1186 case AMDGPU::SI_SPILL_S288_RESTORE:
1187 case AMDGPU::SI_SPILL_V288_SAVE:
1188 case AMDGPU::SI_SPILL_V288_RESTORE:
1189 case AMDGPU::SI_SPILL_A288_SAVE:
1190 case AMDGPU::SI_SPILL_A288_RESTORE:
1191 case AMDGPU::SI_SPILL_AV288_SAVE:
1192 case AMDGPU::SI_SPILL_AV288_RESTORE:
1193 return 9;
1194 case AMDGPU::SI_SPILL_S256_SAVE:
1195 case AMDGPU::SI_SPILL_S256_RESTORE:
1196 case AMDGPU::SI_SPILL_V256_SAVE:
1197 case AMDGPU::SI_SPILL_V256_RESTORE:
1198 case AMDGPU::SI_SPILL_A256_SAVE:
1199 case AMDGPU::SI_SPILL_A256_RESTORE:
1200 case AMDGPU::SI_SPILL_AV256_SAVE:
1201 case AMDGPU::SI_SPILL_AV256_RESTORE:
1202 return 8;
1203 case AMDGPU::SI_SPILL_S224_SAVE:
1204 case AMDGPU::SI_SPILL_S224_RESTORE:
1205 case AMDGPU::SI_SPILL_V224_SAVE:
1206 case AMDGPU::SI_SPILL_V224_RESTORE:
1207 case AMDGPU::SI_SPILL_A224_SAVE:
1208 case AMDGPU::SI_SPILL_A224_RESTORE:
1209 case AMDGPU::SI_SPILL_AV224_SAVE:
1210 case AMDGPU::SI_SPILL_AV224_RESTORE:
1211 return 7;
1212 case AMDGPU::SI_SPILL_S192_SAVE:
1213 case AMDGPU::SI_SPILL_S192_RESTORE:
1214 case AMDGPU::SI_SPILL_V192_SAVE:
1215 case AMDGPU::SI_SPILL_V192_RESTORE:
1216 case AMDGPU::SI_SPILL_A192_SAVE:
1217 case AMDGPU::SI_SPILL_A192_RESTORE:
1218 case AMDGPU::SI_SPILL_AV192_SAVE:
1219 case AMDGPU::SI_SPILL_AV192_RESTORE:
1220 return 6;
1221 case AMDGPU::SI_SPILL_S160_SAVE:
1222 case AMDGPU::SI_SPILL_S160_RESTORE:
1223 case AMDGPU::SI_SPILL_V160_SAVE:
1224 case AMDGPU::SI_SPILL_V160_RESTORE:
1225 case AMDGPU::SI_SPILL_A160_SAVE:
1226 case AMDGPU::SI_SPILL_A160_RESTORE:
1227 case AMDGPU::SI_SPILL_AV160_SAVE:
1228 case AMDGPU::SI_SPILL_AV160_RESTORE:
1229 return 5;
1230 case AMDGPU::SI_SPILL_S128_SAVE:
1231 case AMDGPU::SI_SPILL_S128_RESTORE:
1232 case AMDGPU::SI_SPILL_V128_SAVE:
1233 case AMDGPU::SI_SPILL_V128_RESTORE:
1234 case AMDGPU::SI_SPILL_A128_SAVE:
1235 case AMDGPU::SI_SPILL_A128_RESTORE:
1236 case AMDGPU::SI_SPILL_AV128_SAVE:
1237 case AMDGPU::SI_SPILL_AV128_RESTORE:
1238 return 4;
1239 case AMDGPU::SI_SPILL_S96_SAVE:
1240 case AMDGPU::SI_SPILL_S96_RESTORE:
1241 case AMDGPU::SI_SPILL_V96_SAVE:
1242 case AMDGPU::SI_SPILL_V96_RESTORE:
1243 case AMDGPU::SI_SPILL_A96_SAVE:
1244 case AMDGPU::SI_SPILL_A96_RESTORE:
1245 case AMDGPU::SI_SPILL_AV96_SAVE:
1246 case AMDGPU::SI_SPILL_AV96_RESTORE:
1247 return 3;
1248 case AMDGPU::SI_SPILL_S64_SAVE:
1249 case AMDGPU::SI_SPILL_S64_RESTORE:
1250 case AMDGPU::SI_SPILL_V64_SAVE:
1251 case AMDGPU::SI_SPILL_V64_RESTORE:
1252 case AMDGPU::SI_SPILL_A64_SAVE:
1253 case AMDGPU::SI_SPILL_A64_RESTORE:
1254 case AMDGPU::SI_SPILL_AV64_SAVE:
1255 case AMDGPU::SI_SPILL_AV64_RESTORE:
1256 return 2;
// 32-bit (and 16-bit) spills all occupy a single DWORD slot.
1257 case AMDGPU::SI_SPILL_S32_SAVE:
1258 case AMDGPU::SI_SPILL_S32_RESTORE:
1259 case AMDGPU::SI_SPILL_V32_SAVE:
1260 case AMDGPU::SI_SPILL_V32_RESTORE:
1261 case AMDGPU::SI_SPILL_A32_SAVE:
1262 case AMDGPU::SI_SPILL_A32_RESTORE:
1263 case AMDGPU::SI_SPILL_AV32_SAVE:
1264 case AMDGPU::SI_SPILL_AV32_RESTORE:
1265 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1266 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1267 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1268 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1269 case AMDGPU::SI_SPILL_V16_SAVE:
1270 case AMDGPU::SI_SPILL_V16_RESTORE:
1271 return 1;
1272 default: llvm_unreachable("Invalid spill opcode");
1273 }
1274}
1275
1276static int getOffsetMUBUFStore(unsigned Opc) {
1277 switch (Opc) {
1278 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1279 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1280 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1281 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1282 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1283 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1284 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1285 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1286 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1287 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1288 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1289 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1290 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1291 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1292 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1293 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1294 default:
1295 return -1;
1296 }
1297}
1298
1299static int getOffsetMUBUFLoad(unsigned Opc) {
1300 switch (Opc) {
1301 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1302 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1303 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1304 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1305 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1306 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1307 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1308 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1309 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1310 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1311 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1312 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1313 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1314 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1315 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1316 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1317 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1318 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1319 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1320 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1321 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1322 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1323 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1324 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1325 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1326 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1327 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1328 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1329 default:
1330 return -1;
1331 }
1332}
1333
1334static int getOffenMUBUFStore(unsigned Opc) {
1335 switch (Opc) {
1336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1337 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1338 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1339 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1340 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1341 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1342 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1343 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1344 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1345 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1346 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1347 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1348 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1349 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1350 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1351 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1352 default:
1353 return -1;
1354 }
1355}
1356
1357static int getOffenMUBUFLoad(unsigned Opc) {
1358 switch (Opc) {
1359 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1360 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1361 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1362 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1363 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1364 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1365 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1366 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1367 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1368 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1369 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1370 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1371 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1372 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1373 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1374 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1375 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1376 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1377 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1378 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1379 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1380 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1381 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1382 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1383 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1384 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1385 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1386 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1387 default:
1388 return -1;
1389 }
1390}
1391
// Try to satisfy a VGPR spill/reload for (frame Index, Lane) using an AGPR
// that was pre-assigned by SIMachineFunctionInfo, avoiding a memory access.
// Returns a null MachineInstrBuilder (no instruction) if no AGPR was
// reserved for this slot, in which case the caller falls back to memory.
// NOTE(review): several lines are elided in this listing — the function's
// signature lines (1392-1394), the MFI/MRI declarations (1398/1407), and
// lines 1421/1429 following each BuildMI. Confirm against the full source.
1395 int Index, unsigned Lane,
1396 unsigned ValueReg, bool IsKill) {
1397 MachineFunction *MF = MBB.getParent();
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400
1401 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1402
// No AGPR assigned to this (frame index, lane): signal "not handled".
1403 if (Reg == AMDGPU::NoRegister)
1404 return MachineInstrBuilder();
1405
1406 bool IsStore = MI->mayStore();
1408 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1409
// For a store the assigned register is the destination; for a reload the
// value register is.
1410 unsigned Dst = IsStore ? Reg : ValueReg;
1411 unsigned Src = IsStore ? ValueReg : Reg;
1412 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1413 const DebugLoc &DL = MI->getDebugLoc();
1414 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1415 // Spiller during regalloc may restore a spilled register to its superclass.
1416 // It could result in AGPR spills restored to VGPRs or the other way around,
1417 // making the src and dst with identical regclasses at this point. It just
1418 // needs a copy in such cases.
1419 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1420 .addReg(Src, getKillRegState(IsKill));
1422 return CopyMIB;
1423 }
// Otherwise cross the VGPR/AGPR boundary: WRITE moves VGPR->AGPR,
// READ moves AGPR->VGPR; the XOR picks the direction for store vs. reload.
1424 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1425 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1426
1427 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1428 .addReg(Src, getKillRegState(IsKill));
1430 return MIB;
1431}
1432
1433// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1434// need to handle the case where an SGPR may need to be spilled while spilling.
// Rewrite an _OFFEN MUBUF spill access into its immediate-offset (_OFFSET)
// form, or satisfy it via an AGPR copy when one is reserved for the slot.
// Returns false when the opcode has no _OFFSET counterpart.
// NOTE(review): elided lines in this listing — the signature lines
// (1435/1437, presumably buildMUBUFOffsetLoadStore(const GCNSubtarget &, ...,
// MachineBasicBlock::iterator MI, ...)) and line 1447, which presumably
// selects getOffsetMUBUFStore(Opc) / getOffsetMUBUFLoad(Opc). Confirm
// against the full source.
1436 MachineFrameInfo &MFI,
1438 int Index,
1439 int64_t Offset) {
1440 const SIInstrInfo *TII = ST.getInstrInfo();
1441 MachineBasicBlock *MBB = MI->getParent();
1442 const DebugLoc &DL = MI->getDebugLoc();
1443 bool IsStore = MI->mayStore();
1444
1445 unsigned Opc = MI->getOpcode();
1446 int LoadStoreOp = IsStore ?
1448 if (LoadStoreOp == -1)
1449 return false;
1450
// Prefer an AGPR copy over a memory access when one is available.
1451 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1452 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1453 return true;
1454
1455 MachineInstrBuilder NewMI =
1456 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1457 .add(*Reg)
1458 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1459 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1460 .addImm(Offset)
1461 .addImm(0) // cpol
1462 .addImm(0) // swz
1463 .cloneMemRefs(*MI)[
1464
// Atomic-style accesses carry a data-in operand which must be preserved.
1465 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1466 AMDGPU::OpName::vdata_in);
1467 if (VDataIn)
1468 NewMI.add(*VDataIn);
1469 return true;
1470}
1471
// Pick the flat-scratch load/store opcode for an EltSize-byte spill chunk.
// Starts from the SADDR (scalar-addressed) variant for the element size,
// then converts to the SV (vector-addressed) or ST (no-address) variant to
// match the operands of the original opcode. Block load/stores are returned
// unchanged. NOTE(review): the signature line (1472, presumably
// getFlatScratchSpillOpcode(const SIInstrInfo *TII, ...)) is elided in this
// listing.
1473 unsigned LoadStoreOp,
1474 unsigned EltSize) {
1475 bool IsStore = TII->get(LoadStoreOp).mayStore();
1476 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
// ST form applies only when the instruction has neither a vector nor a
// scalar address operand.
1477 bool UseST =
1478 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1479
1480 // Handle block load/store first.
1481 if (TII->isBlockLoadStore(LoadStoreOp))
1482 return LoadStoreOp;
1483
1484 switch (EltSize) {
1485 case 4:
1486 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1487 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1488 break;
1489 case 8:
1490 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1491 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1492 break;
1493 case 12:
1494 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1495 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1496 break;
1497 case 16:
1498 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1499 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1500 break;
1501 default:
1502 llvm_unreachable("Unexpected spill load/store size!");
1503 }
1504
1505 if (HasVAddr)
1506 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1507 else if (UseST)
1508 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1509
1510 return LoadStoreOp;
1511}
1512
1515 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1516 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1517 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1518 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1519
1520 MachineFunction *MF = MBB.getParent();
1521 const SIInstrInfo *TII = ST.getInstrInfo();
1522 const MachineFrameInfo &MFI = MF->getFrameInfo();
1523 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1524
1525 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1526 bool IsStore = Desc->mayStore();
1527 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1528 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1529
1530 bool CanClobberSCC = false;
1531 bool Scavenged = false;
1532 MCRegister SOffset = ScratchOffsetReg;
1533
1534 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1535 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1536 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1537 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1538
1539 // Always use 4 byte operations for AGPRs because we need to scavenge
1540 // a temporary VGPR.
1541 // If we're using a block operation, the element should be the whole block.
1542 unsigned EltSize = IsBlock ? RegWidth
1543 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1544 : 4u;
1545 unsigned NumSubRegs = RegWidth / EltSize;
1546 unsigned Size = NumSubRegs * EltSize;
1547 unsigned RemSize = RegWidth - Size;
1548 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1549 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1550 int64_t MaterializedOffset = Offset;
1551
1552 // Maxoffset is the starting offset for the last chunk to be spilled.
1553 // In case of non-zero remainder element, max offset will be the
1554 // last address(offset + Size) after spilling all the EltSize chunks.
1555 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1556 int64_t ScratchOffsetRegDelta = 0;
1557
1558 if (IsFlat && EltSize > 4) {
1559 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1560 Desc = &TII->get(LoadStoreOp);
1561 }
1562
1563 Align Alignment = MFI.getObjectAlign(Index);
1564 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1565
1566 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1567 "unexpected VGPR spill offset");
1568
1569 // Track a VGPR to use for a constant offset we need to materialize.
1570 Register TmpOffsetVGPR;
1571
1572 // Track a VGPR to use as an intermediate value.
1573 Register TmpIntermediateVGPR;
1574 bool UseVGPROffset = false;
1575
1576 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1577 // combination.
1578 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1579 int64_t VOffset) {
1580 // We are using a VGPR offset
1581 if (IsFlat && SGPRBase) {
1582 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1583 // SGPR, so perform the add as vector.
1584 // We don't need a base SGPR in the kernel.
1585
1586 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1587 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1588 .addReg(SGPRBase)
1589 .addImm(VOffset)
1590 .addImm(0); // clamp
1591 } else {
1592 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1593 .addReg(SGPRBase);
1594 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1595 .addImm(VOffset)
1596 .addReg(TmpOffsetVGPR);
1597 }
1598 } else {
1599 assert(TmpOffsetVGPR);
1600 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1601 .addImm(VOffset);
1602 }
1603 };
1604
1605 bool IsOffsetLegal =
1606 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1608 : TII->isLegalMUBUFImmOffset(MaxOffset);
1609 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1610 SOffset = MCRegister();
1611
1612 // We don't have access to the register scavenger if this function is called
1613 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1614 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1615 // entry.
1616 if (RS) {
1617 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1618
1619 // Piggy back on the liveness scan we just did see if SCC is dead.
1620 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1621 } else if (LiveUnits) {
1622 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1623 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1624 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1625 SOffset = Reg;
1626 break;
1627 }
1628 }
1629 }
1630
1631 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1632 SOffset = Register();
1633
1634 if (!SOffset) {
1635 UseVGPROffset = true;
1636
1637 if (RS) {
1638 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1639 } else {
1640 assert(LiveUnits);
1641 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1642 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1643 TmpOffsetVGPR = Reg;
1644 break;
1645 }
1646 }
1647 }
1648
1649 assert(TmpOffsetVGPR);
1650 } else if (!SOffset && CanClobberSCC) {
1651 // There are no free SGPRs, and since we are in the process of spilling
1652 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1653 // on SI/CI and on VI it is true until we implement spilling using scalar
1654 // stores), we have no way to free up an SGPR. Our solution here is to
1655 // add the offset directly to the ScratchOffset or StackPtrOffset
1656 // register, and then subtract the offset after the spill to return the
1657 // register to it's original value.
1658
1659 // TODO: If we don't have to do an emergency stack slot spill, converting
1660 // to use the VGPR offset is fewer instructions.
1661 if (!ScratchOffsetReg)
1662 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1663 SOffset = ScratchOffsetReg;
1664 ScratchOffsetRegDelta = Offset;
1665 } else {
1666 Scavenged = true;
1667 }
1668
1669 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1670 // we can simplify the adjustment of Offset here to just scale with
1671 // WavefrontSize.
1672 if (!IsFlat && !UseVGPROffset)
1673 Offset *= ST.getWavefrontSize();
1674
1675 if (!UseVGPROffset && !SOffset)
1676 report_fatal_error("could not scavenge SGPR to spill in entry function");
1677
1678 if (UseVGPROffset) {
1679 // We are using a VGPR offset
1680 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1681 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1682 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1683 } else {
1684 assert(Offset != 0);
1685 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1686 .addReg(ScratchOffsetReg)
1687 .addImm(Offset);
1688 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1689 }
1690
1691 Offset = 0;
1692 }
1693
1694 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1695 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1696 && "Unexpected vaddr for flat scratch with a FI operand");
1697
1698 if (UseVGPROffset) {
1699 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1700 } else {
1701 assert(ST.hasFlatScratchSTMode());
1702 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1703 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1704 }
1705
1706 Desc = &TII->get(LoadStoreOp);
1707 }
1708
1709 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1710 ++i, RegOffset += EltSize) {
1711 if (i == NumSubRegs) {
1712 EltSize = RemSize;
1713 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1714 }
1715 Desc = &TII->get(LoadStoreOp);
1716
1717 if (!IsFlat && UseVGPROffset) {
1718 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1719 : getOffenMUBUFLoad(LoadStoreOp);
1720 Desc = &TII->get(NewLoadStoreOp);
1721 }
1722
1723 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1724 // If we are spilling an AGPR beyond the range of the memory instruction
1725 // offset and need to use a VGPR offset, we ideally have at least 2
1726 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1727 // recycle the VGPR used for the offset which requires resetting after
1728 // each subregister.
1729
1730 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1731 }
1732
1733 unsigned NumRegs = EltSize / 4;
1734 Register SubReg = e == 1
1735 ? ValueReg
1736 : Register(getSubReg(ValueReg,
1737 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1738
1739 RegState SOffsetRegState = {};
1740 RegState SrcDstRegState = getDefRegState(!IsStore);
1741 const bool IsLastSubReg = i + 1 == e;
1742 const bool IsFirstSubReg = i == 0;
1743 if (IsLastSubReg) {
1744 SOffsetRegState |= getKillRegState(Scavenged);
1745 // The last implicit use carries the "Kill" flag.
1746 SrcDstRegState |= getKillRegState(IsKill);
1747 }
1748
1749 // Make sure the whole register is defined if there are undef components by
1750 // adding an implicit def of the super-reg on the first instruction.
1751 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1752 bool NeedSuperRegImpOperand = e > 1;
1753
1754 // Remaining element size to spill into memory after some parts of it
1755 // spilled into either AGPRs or VGPRs.
1756 unsigned RemEltSize = EltSize;
1757
1758 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1759 // starting from the last lane. In case if a register cannot be completely
1760 // spilled into another register that will ensure its alignment does not
1761 // change. For targets with VGPR alignment requirement this is important
1762 // in case of flat scratch usage as we might get a scratch_load or
1763 // scratch_store of an unaligned register otherwise.
1764 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1765 LaneE = RegOffset / 4;
1766 Lane >= LaneE; --Lane) {
1767 bool IsSubReg = e > 1 || EltSize > 4;
1768 Register Sub = IsSubReg
1769 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1770 : ValueReg;
1771 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1772 if (!MIB.getInstr())
1773 break;
1774 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1775 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1776 NeedSuperRegDef = false;
1777 }
1778 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1779 NeedSuperRegImpOperand = true;
1780 RegState State = SrcDstRegState;
1781 if (!IsLastSubReg || (Lane != LaneE))
1782 State &= ~RegState::Kill;
1783 if (!IsFirstSubReg || (Lane != LaneS))
1784 State &= ~RegState::Define;
1785 MIB.addReg(ValueReg, RegState::Implicit | State);
1786 }
1787 RemEltSize -= 4;
1788 }
1789
1790 if (!RemEltSize) // Fully spilled into AGPRs.
1791 continue;
1792
1793 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1794 assert(IsFlat && EltSize > 4);
1795
1796 unsigned NumRegs = RemEltSize / 4;
1797 SubReg = Register(getSubReg(ValueReg,
1798 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1799 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1800 Desc = &TII->get(Opc);
1801 }
1802
1803 unsigned FinalReg = SubReg;
1804
1805 if (IsAGPR) {
1806 assert(EltSize == 4);
1807
1808 if (!TmpIntermediateVGPR) {
1809 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1810 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1811 }
1812 if (IsStore) {
1813 auto AccRead = BuildMI(MBB, MI, DL,
1814 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1815 TmpIntermediateVGPR)
1816 .addReg(SubReg, getKillRegState(IsKill));
1817 if (NeedSuperRegDef)
1818 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1819 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1820 AccRead.addReg(ValueReg, RegState::Implicit);
1822 }
1823 SubReg = TmpIntermediateVGPR;
1824 } else if (UseVGPROffset) {
1825 if (!TmpOffsetVGPR) {
1826 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1827 MI, false, 0);
1828 RS->setRegUsed(TmpOffsetVGPR);
1829 }
1830 }
1831
1832 Register FinalValueReg = ValueReg;
1833 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1834 // If we are loading 16-bit value with SRAMECC endabled we need a temp
1835 // 32-bit VGPR to load and extract 16-bits into the final register.
1836 ValueReg =
1837 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1838 SubReg = ValueReg;
1839 IsKill = false;
1840 }
1841
1842 // Create the MMO, additional set the NonVolatile flag as scratch memory
1843 // used for spills will not be used outside the thread.
1844 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1846 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1847 commonAlignment(Alignment, RegOffset));
1848
1849 auto MIB =
1850 BuildMI(MBB, MI, DL, *Desc)
1851 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1852
1853 if (UseVGPROffset) {
1854 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1855 // intermediate accvgpr_write.
1856 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1857 }
1858
1859 if (!IsFlat)
1860 MIB.addReg(FuncInfo->getScratchRSrcReg());
1861
1862 if (SOffset == AMDGPU::NoRegister) {
1863 if (!IsFlat) {
1864 if (UseVGPROffset && ScratchOffsetReg) {
1865 MIB.addReg(ScratchOffsetReg);
1866 } else {
1867 assert(FuncInfo->isBottomOfStack());
1868 MIB.addImm(0);
1869 }
1870 }
1871 } else {
1872 MIB.addReg(SOffset, SOffsetRegState);
1873 }
1874
1875 MIB.addImm(Offset + RegOffset);
1876
1877 bool LastUse = MMO->getFlags() & MOLastUse;
1878 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1879
1880 if (!IsFlat)
1881 MIB.addImm(0); // swz
1882 MIB.addMemOperand(NewMMO);
1883
1884 if (FinalValueReg != ValueReg) {
1885 // Extract 16-bit from the loaded 32-bit value.
1886 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1887 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1888 .addReg(FinalValueReg, getDefRegState(true))
1889 .addImm(0)
1890 .addReg(ValueReg, getKillRegState(true))
1891 .addImm(0);
1892 ValueReg = FinalValueReg;
1893 }
1894
1895 if (!IsAGPR && NeedSuperRegDef)
1896 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1897
1898 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1899 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1900 FinalReg)
1901 .addReg(TmpIntermediateVGPR, RegState::Kill);
1903 }
1904
1905 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
1906 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1907 if (NeedSuperRegImpOperand &&
1908 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1909 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1910 if (PartialReloadCopy)
1911 MIB.addReg(ValueReg, RegState::Implicit);
1912 }
1913
1914 // The epilog restore of a wwm-scratch register can cause undesired
1915 // optimization during machine-cp post PrologEpilogInserter if the same
1916 // register was assigned for return value ABI lowering with a COPY
1917 // instruction. As given below, with the epilog reload, the earlier COPY
1918 // appeared to be dead during machine-cp.
1919 // ...
1920 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1921 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1922 // ...
1923 // Epilog block:
1924 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1925 // ...
1926 // WWM spill restore to preserve the inactive lanes of v0.
1927 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1928 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1929 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1930 // ...
1931 // SI_RETURN implicit $vgpr0
1932 // ...
1933 // To fix it, mark the same reg as a tied op for such restore instructions
1934 // so that it marks a usage for the preceding COPY.
1935 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1936 MI->readsRegister(SubReg, this)) {
1937 MIB.addReg(SubReg, RegState::Implicit);
1938 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1939 }
1940
1941 // If we're building a block load, we should add artificial uses for the
1942 // CSR VGPRs that are *not* being transferred. This is because liveness
1943 // analysis is not aware of the mask, so we need to somehow inform it that
1944 // those registers are not available before the load and they should not be
1945 // scavenged.
1946 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1947 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1948 }
1949
1950 if (ScratchOffsetRegDelta != 0) {
1951 // Subtract the offset we added to the ScratchOffset register.
1952 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1953 .addReg(SOffset)
1954 .addImm(-ScratchOffsetRegDelta);
1955 }
1956}
1957
// Continuation of SIRegisterInfo::addImplicitUsesForBlockCSRLoad; the
// signature's first line (function name and the MachineInstrBuilder &MIB
// parameter) was elided by the doxygen extraction of this listing.
// Adds artificial implicit uses to a block load for every callee-saved VGPR
// of the 32-register block that the transfer mask excludes, so liveness
// analysis keeps them live (see the comment at the call site in
// buildSpillLoadStore: the analysis is not aware of the mask).
1959 Register BlockReg) const {
1960 const MachineFunction *MF = MIB->getMF();
1961 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
// A set bit N means VGPR (BaseVGPR + N) is actually transferred by the block
// op; clear bits are registers the block op leaves untouched.
1962 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1963 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// Offset 0 (BaseVGPR itself) is skipped: iteration starts at 1.
1964 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1965 if (!(Mask & (1 << RegOffset)) &&
1966 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1967 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1968 }
1969
// Continuation of a VGPR spill helper (the signature's first line, with the
// SGPRSpillBuilder &SB and frame-index parameters, was elided by the doxygen
// extraction). Emits one scratch load or store of SB.TmpVGPR at lane group
// `Offset` for an SGPR-to-memory spill.
1971 int Offset, bool IsLoad,
1972 bool IsKill) const {
1973 // Load/store VGPR
1974 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// This path must only see real stack slots; SGPR->VGPR-lane spill slots
// never touch memory.
1975 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1976
// Fixed objects are addressed off the base pointer when one exists;
// everything else is addressed off the frame register.
1977 Register FrameReg =
1978 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1979 ? getBaseRegister()
1980 : getFrameRegister(SB.MF);
1981
1982 Align Alignment = FrameInfo.getObjectAlign(Index);
// NOTE(review): lines 1983-1985 were elided by the extraction; they
// presumably create the MachineMemOperand `MMO` used below — confirm
// against the upstream source.
1986 SB.EltSize, Alignment);
1987
// Pick flat-scratch vs. MUBUF opcodes per subtarget; `Offset` is scaled by
// the element size into a byte offset for buildSpillLoadStore.
1988 if (IsLoad) {
1989 unsigned Opc = ST.hasFlatScratchEnabled()
1990 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1991 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1992 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1993 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1994 } else {
1995 unsigned Opc = ST.hasFlatScratchEnabled()
1996 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1997 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1998 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1999 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2000 // This only ever adds one VGPR spill
2001 SB.MFI.addToSpilledVGPRs(1);
2002 }
2003 }
2004
// Continuation of SIRegisterInfo::spillSGPR (the signature's first line,
// with the MachineBasicBlock::iterator MI and int Index parameters, was
// elided by the doxygen extraction). Lowers an SI_SPILL_S*_SAVE pseudo:
// either writes each 32-bit SGPR subregister into a reserved VGPR lane
// (SI_SPILL_S32_TO_VGPR), or — when no lanes were allocated for this slot —
// packs lanes into a temporary VGPR and stores it to scratch memory through
// the SGPRSpillBuilder. Returns false only when OnlyToVGPR is set but no
// VGPR lanes exist for the slot.
2006 RegScavenger *RS, SlotIndexes *Indexes,
2007 LiveIntervals *LIS, bool OnlyToVGPR,
2008 bool SpillToPhysVGPRLane) const {
2009 assert(!MI->getOperand(0).isUndef() &&
2010 "undef spill should have been deleted earlier");
2011
2012 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2013
// Choose the physical- or virtual-VGPR lane map for this spill slot.
// NOTE(review): line 2016 was elided by the extraction — presumably the
// ':' alternative selecting the virtual-VGPR lane map; confirm upstream.
2014 ArrayRef<SpilledReg> VGPRSpills =
2015 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2017 bool SpillToVGPR = !VGPRSpills.empty();
2018 if (OnlyToVGPR && !SpillToVGPR)
2019 return false;
2020
// Spilling SP/FP through memory would itself need the stack pointers.
2021 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2022 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2023
2024 if (SpillToVGPR) {
2025
2026 // Since stack slot coloring pass is trying to optimize SGPR spills,
2027 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
2028 // spills of different sizes. This accounts for number of VGPR lanes alloted
2029 // equal to the largest SGPR being spilled in them.
2030 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2031 "Num of SGPRs spilled should be less than or equal to num of "
2032 "the VGPR lanes.");
2033
2034 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2036 SB.NumSubRegs == 1
2037 ? SB.SuperReg
2038 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2039 SpilledReg Spill = VGPRSpills[i];
2040
2041 bool IsFirstSubreg = i == 0;
2042 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the last subregister write may kill the source super-register.
2043 bool UseKill = SB.IsKill && IsLastSubreg;
2044
2045
2046 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2047 // spill to this specific vgpr in the first basic block.
2048 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2049 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2050 .addReg(SubReg, getKillRegState(UseKill))
2051 .addImm(Spill.Lane)
2052 .addReg(Spill.VGPR);
// Keep SlotIndexes valid: the first emitted instruction takes over the
// index of the pseudo being replaced; later ones get fresh indexes.
2053 if (Indexes) {
2054 if (IsFirstSubreg)
2055 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2056 else
2057 Indexes->insertMachineInstrInMaps(*MIB);
2058 }
2059
2060 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2061 // We may be spilling a super-register which is only partially defined,
2062 // and need to ensure later spills think the value is defined.
2063 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2064 }
2065
// NOTE(review): the statement guarded by this if (line 2067) was elided by
// the extraction — presumably an implicit super-register operand add.
2066 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2068
2069 // FIXME: Since this spills to another register instead of an actual
2070 // frame index, we should delete the frame index when all references to
2071 // it are fixed.
2072 }
2073 } else {
// Memory path: reserve/prepare the temporary VGPR, pack PerVGPR lanes at a
// time, and store each packed VGPR to the stack slot.
2074 SB.prepare();
2075
2076 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2077 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2078
2079 // Per VGPR helper data
2080 auto PVD = SB.getPerVGPRData();
2081
2082 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first writelane into each temp VGPR treats the old value as
// undef; later writes must preserve already-written lanes.
2083 RegState TmpVGPRFlags = RegState::Undef;
2084
2085 // Write sub registers into the VGPR
2086 for (unsigned i = Offset * PVD.PerVGPR,
2087 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2088 i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2090 SB.NumSubRegs == 1
2091 ? SB.SuperReg
2092 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2093
2094 MachineInstrBuilder WriteLane =
2095 BuildMI(*SB.MBB, MI, SB.DL,
2096 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2097 .addReg(SubReg, SubKillState)
2098 .addImm(i % PVD.PerVGPR)
2099 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2100 TmpVGPRFlags = {};
2101
2102 if (Indexes) {
2103 if (i == 0)
2104 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2105 else
2106 Indexes->insertMachineInstrInMaps(*WriteLane);
2107 }
2108
2109 // There could be undef components of a spilled super register.
2110 // TODO: Can we detect this and skip the spill?
2111 if (SB.NumSubRegs > 1) {
2112 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2113 RegState SuperKillState = {};
2114 if (i + 1 == SB.NumSubRegs)
2115 SuperKillState |= getKillRegState(SB.IsKill);
2116 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2117 }
2118 }
2119
2120 // Write out VGPR
2121 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2122 }
2123
2124 SB.restore();
2125 }
2126
// NOTE(review): line 2128 was elided by the extraction — presumably the
// SGPR spill statistics update; confirm against the upstream source.
2127 MI->eraseFromParent();
2129
// NOTE(review): the statement guarded by this if (line 2131) was elided —
// presumably a LiveIntervals update for the removed super-register.
2130 if (LIS)
2132
2133 return true;
2134}
2135
// Continuation of SIRegisterInfo::restoreSGPR (the signature's first line,
// with the MachineBasicBlock::iterator MI and int Index parameters, was
// elided by the doxygen extraction). Lowers an SI_SPILL_S*_RESTORE pseudo:
// reads each 32-bit subregister back out of its reserved VGPR lane
// (SI_RESTORE_S32_FROM_VGPR), or — when no lanes exist — loads the packed
// temporary VGPR from scratch memory and unpacks its lanes. Returns false
// only when OnlyToVGPR is set but no VGPR lanes exist for the slot.
2137 RegScavenger *RS, SlotIndexes *Indexes,
2138 LiveIntervals *LIS, bool OnlyToVGPR,
2139 bool SpillToPhysVGPRLane) const {
2140 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2141
// Choose the physical- or virtual-VGPR lane map for this spill slot.
// NOTE(review): line 2144 was elided by the extraction — presumably the
// ':' alternative selecting the virtual-VGPR lane map; confirm upstream.
2142 ArrayRef<SpilledReg> VGPRSpills =
2143 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2145 bool SpillToVGPR = !VGPRSpills.empty();
2146 if (OnlyToVGPR && !SpillToVGPR)
2147 return false;
2148
2149 if (SpillToVGPR) {
2150 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2152 SB.NumSubRegs == 1
2153 ? SB.SuperReg
2154 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2155
2156 SpilledReg Spill = VGPRSpills[i];
2157 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2158 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2159 .addReg(Spill.VGPR)
2160 .addImm(Spill.Lane);
// NOTE(review): the statement guarded by this if (line 2162) was elided —
// presumably an implicit-def of the super-register on the first restore.
2161 if (SB.NumSubRegs > 1 && i == 0)
// Keep SlotIndexes valid: the last emitted restore takes over the index of
// the pseudo being replaced; earlier ones get fresh indexes.
2163 if (Indexes) {
2164 if (i == e - 1)
2165 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2166 else
2167 Indexes->insertMachineInstrInMaps(*MIB);
2168 }
2169 }
2170 } else {
// Memory path: load each packed VGPR from the stack slot, then unpack its
// lanes back into the SGPR subregisters.
2171 SB.prepare();
2172
2173 // Per VGPR helper data
2174 auto PVD = SB.getPerVGPRData();
2175
2176 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2177 // Load in VGPR data
2178 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2179
2180 // Unpack lanes
2181 for (unsigned i = Offset * PVD.PerVGPR,
2182 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2183 i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2185 SB.NumSubRegs == 1
2186 ? SB.SuperReg
2187 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2188
// The final readlane from the temp VGPR may kill it.
2189 bool LastSubReg = (i + 1 == e);
2190 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2191 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2192 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2193 .addImm(i);
// NOTE(review): the statement guarded by this if (line 2195) was elided —
// presumably an implicit-def of the super-register on the first restore.
2194 if (SB.NumSubRegs > 1 && i == 0)
2196 if (Indexes) {
2197 if (i == e - 1)
2198 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2199 else
2200 Indexes->insertMachineInstrInMaps(*MIB);
2201 }
2202 }
2203 }
2204
2205 SB.restore();
2206 }
2207
2208 MI->eraseFromParent();
2209
// NOTE(review): the statement guarded by this if (line 2211) was elided —
// presumably a LiveIntervals update for the removed super-register.
2210 if (LIS)
2212
2213 return true;
2214}
2215
// Continuation of SIRegisterInfo::spillEmergencySGPR (the signature's first
// line, with the MachineBasicBlock::iterator MI parameter, was elided by the
// doxygen extraction). Spills SGPR into lanes of SB.TmpVGPR via
// V_WRITELANE_B32 before MI, then emits the matching V_READLANE_B32 restore
// sequence at the end of RestoreMBB; no memory traffic is produced for the
// SGPR value itself ("Don't need to write VGPR out" below).
2217 MachineBasicBlock &RestoreMBB,
2218 Register SGPR, RegScavenger *RS) const {
2219 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2220 RS);
2221 SB.prepare();
2222 // Generate the spill of SGPR to SB.TmpVGPR.
2223 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2224 auto PVD = SB.getPerVGPRData();
2225 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first writelane into the temp VGPR treats the old value as
// undef; later writes must preserve already-written lanes.
2226 RegState TmpVGPRFlags = RegState::Undef;
2227 // Write sub registers into the VGPR
2228 for (unsigned i = Offset * PVD.PerVGPR,
2229 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2230 i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2232 SB.NumSubRegs == 1
2233 ? SB.SuperReg
2234 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2235
2236 MachineInstrBuilder WriteLane =
2237 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2238 SB.TmpVGPR)
2239 .addReg(SubReg, SubKillState)
2240 .addImm(i % PVD.PerVGPR)
2241 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2242 TmpVGPRFlags = {};
2243 // There could be undef components of a spilled super register.
2244 // TODO: Can we detect this and skip the spill?
2245 if (SB.NumSubRegs > 1) {
2246 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2247 RegState SuperKillState = {};
2248 if (i + 1 == SB.NumSubRegs)
2249 SuperKillState |= getKillRegState(SB.IsKill);
2250 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2251 }
2252 }
2253 // Don't need to write VGPR out.
2254 }
2255
2256 // Restore clobbered registers in the specified restore block.
2257 MI = RestoreMBB.end();
2258 SB.setMI(&RestoreMBB, MI);
2259 // Generate the restore of SGPR from SB.TmpVGPR.
2260 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2261 // Don't need to load VGPR in.
2262 // Unpack lanes
2263 for (unsigned i = Offset * PVD.PerVGPR,
2264 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2265 i < e; ++i) {
// NOTE(review): the line declaring the result of this conditional
// (presumably "Register SubReg =") was elided by the extraction.
2267 SB.NumSubRegs == 1
2268 ? SB.SuperReg
2269 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2270
2271 assert(SubReg.isPhysical());
// The final readlane from the temp VGPR may kill it.
2272 bool LastSubReg = (i + 1 == e);
2273 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2274 SubReg)
2275 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2276 .addImm(i);
// NOTE(review): the statement guarded by this if (line 2278) was elided —
// presumably an implicit-def of the super-register on the first restore.
2277 if (SB.NumSubRegs > 1 && i == 0)
2279 }
2280 }
2281 SB.restore();
2282
// NOTE(review): line 2283 was elided by the extraction; confirm against the
// upstream source what precedes this return.
2284 return false;
2285}
2286
2287/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2288/// a VGPR and the stack slot can be safely eliminated when all other users are
2289/// handled.
// Continuation of SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex (the
// signature's first lines, with the MachineBasicBlock::iterator MI, int FI
// and RegScavenger *RS parameters, were elided by the doxygen extraction).
// Dispatches every SGPR spill-save pseudo to spillSGPR and every restore
// pseudo to restoreSGPR, with OnlyToVGPR=true so only the VGPR-lane path is
// taken (a false return means no lanes were allocated for the slot).
2292 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2293 switch (MI->getOpcode()) {
2294 case AMDGPU::SI_SPILL_S1024_SAVE:
2295 case AMDGPU::SI_SPILL_S512_SAVE:
2296 case AMDGPU::SI_SPILL_S384_SAVE:
2297 case AMDGPU::SI_SPILL_S352_SAVE:
2298 case AMDGPU::SI_SPILL_S320_SAVE:
2299 case AMDGPU::SI_SPILL_S288_SAVE:
2300 case AMDGPU::SI_SPILL_S256_SAVE:
2301 case AMDGPU::SI_SPILL_S224_SAVE:
2302 case AMDGPU::SI_SPILL_S192_SAVE:
2303 case AMDGPU::SI_SPILL_S160_SAVE:
2304 case AMDGPU::SI_SPILL_S128_SAVE:
2305 case AMDGPU::SI_SPILL_S96_SAVE:
2306 case AMDGPU::SI_SPILL_S64_SAVE:
2307 case AMDGPU::SI_SPILL_S32_SAVE:
2308 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2309 case AMDGPU::SI_SPILL_S1024_RESTORE:
2310 case AMDGPU::SI_SPILL_S512_RESTORE:
2311 case AMDGPU::SI_SPILL_S384_RESTORE:
2312 case AMDGPU::SI_SPILL_S352_RESTORE:
2313 case AMDGPU::SI_SPILL_S320_RESTORE:
2314 case AMDGPU::SI_SPILL_S288_RESTORE:
2315 case AMDGPU::SI_SPILL_S256_RESTORE:
2316 case AMDGPU::SI_SPILL_S224_RESTORE:
2317 case AMDGPU::SI_SPILL_S192_RESTORE:
2318 case AMDGPU::SI_SPILL_S160_RESTORE:
2319 case AMDGPU::SI_SPILL_S128_RESTORE:
2320 case AMDGPU::SI_SPILL_S96_RESTORE:
2321 case AMDGPU::SI_SPILL_S64_RESTORE:
2322 case AMDGPU::SI_SPILL_S32_RESTORE:
2323 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2324 default:
// Callers must only hand this function SGPR spill pseudos.
2325 llvm_unreachable("not an SGPR spill instruction");
2326 }
2327}
2328
2330 int SPAdj, unsigned FIOperandNum,
2331 RegScavenger *RS) const {
2332 MachineFunction *MF = MI->getMF();
2333 MachineBasicBlock *MBB = MI->getParent();
2335 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2336 const SIInstrInfo *TII = ST.getInstrInfo();
2337 const DebugLoc &DL = MI->getDebugLoc();
2338
2339 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2340
2342 "unreserved scratch RSRC register");
2343
2344 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2345 int Index = MI->getOperand(FIOperandNum).getIndex();
2346
2347 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2348 ? getBaseRegister()
2349 : getFrameRegister(*MF);
2350
2351 switch (MI->getOpcode()) {
2352 // SGPR register spill
2353 case AMDGPU::SI_SPILL_S1024_SAVE:
2354 case AMDGPU::SI_SPILL_S512_SAVE:
2355 case AMDGPU::SI_SPILL_S384_SAVE:
2356 case AMDGPU::SI_SPILL_S352_SAVE:
2357 case AMDGPU::SI_SPILL_S320_SAVE:
2358 case AMDGPU::SI_SPILL_S288_SAVE:
2359 case AMDGPU::SI_SPILL_S256_SAVE:
2360 case AMDGPU::SI_SPILL_S224_SAVE:
2361 case AMDGPU::SI_SPILL_S192_SAVE:
2362 case AMDGPU::SI_SPILL_S160_SAVE:
2363 case AMDGPU::SI_SPILL_S128_SAVE:
2364 case AMDGPU::SI_SPILL_S96_SAVE:
2365 case AMDGPU::SI_SPILL_S64_SAVE:
2366 case AMDGPU::SI_SPILL_S32_SAVE: {
2367 return spillSGPR(MI, Index, RS);
2368 }
2369
2370 // SGPR register restore
2371 case AMDGPU::SI_SPILL_S1024_RESTORE:
2372 case AMDGPU::SI_SPILL_S512_RESTORE:
2373 case AMDGPU::SI_SPILL_S384_RESTORE:
2374 case AMDGPU::SI_SPILL_S352_RESTORE:
2375 case AMDGPU::SI_SPILL_S320_RESTORE:
2376 case AMDGPU::SI_SPILL_S288_RESTORE:
2377 case AMDGPU::SI_SPILL_S256_RESTORE:
2378 case AMDGPU::SI_SPILL_S224_RESTORE:
2379 case AMDGPU::SI_SPILL_S192_RESTORE:
2380 case AMDGPU::SI_SPILL_S160_RESTORE:
2381 case AMDGPU::SI_SPILL_S128_RESTORE:
2382 case AMDGPU::SI_SPILL_S96_RESTORE:
2383 case AMDGPU::SI_SPILL_S64_RESTORE:
2384 case AMDGPU::SI_SPILL_S32_RESTORE: {
2385 return restoreSGPR(MI, Index, RS);
2386 }
2387
2388 // VGPR register spill
2389 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2390 // Put mask into M0.
2391 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2392 AMDGPU::M0)
2393 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2394 [[fallthrough]];
2395 }
2396 case AMDGPU::SI_SPILL_V1024_SAVE:
2397 case AMDGPU::SI_SPILL_V512_SAVE:
2398 case AMDGPU::SI_SPILL_V384_SAVE:
2399 case AMDGPU::SI_SPILL_V352_SAVE:
2400 case AMDGPU::SI_SPILL_V320_SAVE:
2401 case AMDGPU::SI_SPILL_V288_SAVE:
2402 case AMDGPU::SI_SPILL_V256_SAVE:
2403 case AMDGPU::SI_SPILL_V224_SAVE:
2404 case AMDGPU::SI_SPILL_V192_SAVE:
2405 case AMDGPU::SI_SPILL_V160_SAVE:
2406 case AMDGPU::SI_SPILL_V128_SAVE:
2407 case AMDGPU::SI_SPILL_V96_SAVE:
2408 case AMDGPU::SI_SPILL_V64_SAVE:
2409 case AMDGPU::SI_SPILL_V32_SAVE:
2410 case AMDGPU::SI_SPILL_V16_SAVE:
2411 case AMDGPU::SI_SPILL_A1024_SAVE:
2412 case AMDGPU::SI_SPILL_A512_SAVE:
2413 case AMDGPU::SI_SPILL_A384_SAVE:
2414 case AMDGPU::SI_SPILL_A352_SAVE:
2415 case AMDGPU::SI_SPILL_A320_SAVE:
2416 case AMDGPU::SI_SPILL_A288_SAVE:
2417 case AMDGPU::SI_SPILL_A256_SAVE:
2418 case AMDGPU::SI_SPILL_A224_SAVE:
2419 case AMDGPU::SI_SPILL_A192_SAVE:
2420 case AMDGPU::SI_SPILL_A160_SAVE:
2421 case AMDGPU::SI_SPILL_A128_SAVE:
2422 case AMDGPU::SI_SPILL_A96_SAVE:
2423 case AMDGPU::SI_SPILL_A64_SAVE:
2424 case AMDGPU::SI_SPILL_A32_SAVE:
2425 case AMDGPU::SI_SPILL_AV1024_SAVE:
2426 case AMDGPU::SI_SPILL_AV512_SAVE:
2427 case AMDGPU::SI_SPILL_AV384_SAVE:
2428 case AMDGPU::SI_SPILL_AV352_SAVE:
2429 case AMDGPU::SI_SPILL_AV320_SAVE:
2430 case AMDGPU::SI_SPILL_AV288_SAVE:
2431 case AMDGPU::SI_SPILL_AV256_SAVE:
2432 case AMDGPU::SI_SPILL_AV224_SAVE:
2433 case AMDGPU::SI_SPILL_AV192_SAVE:
2434 case AMDGPU::SI_SPILL_AV160_SAVE:
2435 case AMDGPU::SI_SPILL_AV128_SAVE:
2436 case AMDGPU::SI_SPILL_AV96_SAVE:
2437 case AMDGPU::SI_SPILL_AV64_SAVE:
2438 case AMDGPU::SI_SPILL_AV32_SAVE:
2439 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2440 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2441 const MachineOperand *VData = TII->getNamedOperand(*MI,
2442 AMDGPU::OpName::vdata);
2443 if (VData->isUndef()) {
2444 MI->eraseFromParent();
2445 return true;
2446 }
2447
2448 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2449 MFI->getStackPtrOffsetReg());
2450
2451 unsigned Opc;
2452 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2453 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2454 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2455 } else {
2456 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2457 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2458 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2459 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2460 }
2461
2462 auto *MBB = MI->getParent();
2463 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2464 if (IsWWMRegSpill) {
2465 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2466 RS->isRegUsed(AMDGPU::SCC));
2467 }
2469 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2470 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2471 *MI->memoperands_begin(), RS);
2473 if (IsWWMRegSpill)
2474 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2475
2476 MI->eraseFromParent();
2477 return true;
2478 }
2479 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2480 // Put mask into M0.
2481 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2482 AMDGPU::M0)
2483 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2484 [[fallthrough]];
2485 }
2486 case AMDGPU::SI_SPILL_V16_RESTORE:
2487 case AMDGPU::SI_SPILL_V32_RESTORE:
2488 case AMDGPU::SI_SPILL_V64_RESTORE:
2489 case AMDGPU::SI_SPILL_V96_RESTORE:
2490 case AMDGPU::SI_SPILL_V128_RESTORE:
2491 case AMDGPU::SI_SPILL_V160_RESTORE:
2492 case AMDGPU::SI_SPILL_V192_RESTORE:
2493 case AMDGPU::SI_SPILL_V224_RESTORE:
2494 case AMDGPU::SI_SPILL_V256_RESTORE:
2495 case AMDGPU::SI_SPILL_V288_RESTORE:
2496 case AMDGPU::SI_SPILL_V320_RESTORE:
2497 case AMDGPU::SI_SPILL_V352_RESTORE:
2498 case AMDGPU::SI_SPILL_V384_RESTORE:
2499 case AMDGPU::SI_SPILL_V512_RESTORE:
2500 case AMDGPU::SI_SPILL_V1024_RESTORE:
2501 case AMDGPU::SI_SPILL_A32_RESTORE:
2502 case AMDGPU::SI_SPILL_A64_RESTORE:
2503 case AMDGPU::SI_SPILL_A96_RESTORE:
2504 case AMDGPU::SI_SPILL_A128_RESTORE:
2505 case AMDGPU::SI_SPILL_A160_RESTORE:
2506 case AMDGPU::SI_SPILL_A192_RESTORE:
2507 case AMDGPU::SI_SPILL_A224_RESTORE:
2508 case AMDGPU::SI_SPILL_A256_RESTORE:
2509 case AMDGPU::SI_SPILL_A288_RESTORE:
2510 case AMDGPU::SI_SPILL_A320_RESTORE:
2511 case AMDGPU::SI_SPILL_A352_RESTORE:
2512 case AMDGPU::SI_SPILL_A384_RESTORE:
2513 case AMDGPU::SI_SPILL_A512_RESTORE:
2514 case AMDGPU::SI_SPILL_A1024_RESTORE:
2515 case AMDGPU::SI_SPILL_AV32_RESTORE:
2516 case AMDGPU::SI_SPILL_AV64_RESTORE:
2517 case AMDGPU::SI_SPILL_AV96_RESTORE:
2518 case AMDGPU::SI_SPILL_AV128_RESTORE:
2519 case AMDGPU::SI_SPILL_AV160_RESTORE:
2520 case AMDGPU::SI_SPILL_AV192_RESTORE:
2521 case AMDGPU::SI_SPILL_AV224_RESTORE:
2522 case AMDGPU::SI_SPILL_AV256_RESTORE:
2523 case AMDGPU::SI_SPILL_AV288_RESTORE:
2524 case AMDGPU::SI_SPILL_AV320_RESTORE:
2525 case AMDGPU::SI_SPILL_AV352_RESTORE:
2526 case AMDGPU::SI_SPILL_AV384_RESTORE:
2527 case AMDGPU::SI_SPILL_AV512_RESTORE:
2528 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2529 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2530 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2531 const MachineOperand *VData = TII->getNamedOperand(*MI,
2532 AMDGPU::OpName::vdata);
2533 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2534 MFI->getStackPtrOffsetReg());
2535
2536 unsigned Opc;
2537 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2538 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2539 Opc = ST.d16PreservesUnusedBits()
2540 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2541 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2542 } else {
2543 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2544 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2545 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2546 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2547 }
2548
2549 auto *MBB = MI->getParent();
2550 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2551 if (IsWWMRegSpill) {
2552 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2553 RS->isRegUsed(AMDGPU::SCC));
2554 }
2555
2557 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2558 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2559 *MI->memoperands_begin(), RS);
2560
2561 if (IsWWMRegSpill)
2562 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2563
2564 MI->eraseFromParent();
2565 return true;
2566 }
2567 case AMDGPU::V_ADD_U32_e32:
2568 case AMDGPU::V_ADD_U32_e64:
2569 case AMDGPU::V_ADD_CO_U32_e32:
2570 case AMDGPU::V_ADD_CO_U32_e64: {
2571 // TODO: Handle sub, and, or.
2572 unsigned NumDefs = MI->getNumExplicitDefs();
2573 unsigned Src0Idx = NumDefs;
2574
2575 bool HasClamp = false;
2576 MachineOperand *VCCOp = nullptr;
2577
2578 switch (MI->getOpcode()) {
2579 case AMDGPU::V_ADD_U32_e32:
2580 break;
2581 case AMDGPU::V_ADD_U32_e64:
2582 HasClamp = MI->getOperand(3).getImm();
2583 break;
2584 case AMDGPU::V_ADD_CO_U32_e32:
2585 VCCOp = &MI->getOperand(3);
2586 break;
2587 case AMDGPU::V_ADD_CO_U32_e64:
2588 VCCOp = &MI->getOperand(1);
2589 HasClamp = MI->getOperand(4).getImm();
2590 break;
2591 default:
2592 break;
2593 }
2594 bool DeadVCC = !VCCOp || VCCOp->isDead();
2595 MachineOperand &DstOp = MI->getOperand(0);
2596 Register DstReg = DstOp.getReg();
2597
2598 unsigned OtherOpIdx =
2599 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2600 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2601
2602 unsigned Src1Idx = Src0Idx + 1;
2603 Register MaterializedReg = FrameReg;
2604 Register ScavengedVGPR;
2605
2606 int64_t Offset = FrameInfo.getObjectOffset(Index);
2607 // For the non-immediate case, we could fall through to the default
2608 // handling, but we do an in-place update of the result register here to
2609 // avoid scavenging another register.
2610 if (OtherOp->isImm()) {
2611 int64_t TotalOffset = OtherOp->getImm() + Offset;
2612
2613 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2614 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2615 // If we can't support a VOP3 literal in the VALU instruction, we
2616 // can't specially fold into the add.
2617 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2618 break;
2619 }
2620
2621 OtherOp->setImm(TotalOffset);
2622 Offset = 0;
2623 }
2624
2625 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2626 // We should just do an in-place update of the result register. However,
2627 // the value there may also be used by the add, in which case we need a
2628 // temporary register.
2629 //
2630 // FIXME: The scavenger is not finding the result register in the
2631 // common case where the add does not read the register.
2632
2633 ScavengedVGPR = RS->scavengeRegisterBackwards(
2634 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2635
2636 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2637 // shift.
2638 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2639 .addDef(ScavengedVGPR, RegState::Renamable)
2640 .addImm(ST.getWavefrontSizeLog2())
2641 .addReg(FrameReg);
2642 MaterializedReg = ScavengedVGPR;
2643 }
2644
2645 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2646 if (ST.hasFlatScratchEnabled() &&
2647 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2648 // We didn't need the shift above, so we have an SGPR for the frame
2649 // register, but may have a VGPR only operand.
2650 //
2651 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2652 // and use the higher constant bus restriction to avoid this copy.
2653
2654 if (!ScavengedVGPR) {
2655 ScavengedVGPR = RS->scavengeRegisterBackwards(
2656 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2657 /*SPAdj=*/0);
2658 }
2659
2660 assert(ScavengedVGPR != DstReg);
2661
2662 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2663 .addReg(MaterializedReg,
2664 getKillRegState(MaterializedReg != FrameReg));
2665 MaterializedReg = ScavengedVGPR;
2666 }
2667
2668 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2669 // is not live, we could use a scalar add + vector add instead of 2
2670 // vector adds.
2671 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2672 .addDef(DstReg, RegState::Renamable);
2673 if (NumDefs == 2)
2674 AddI32.add(MI->getOperand(1));
2675
2676 RegState MaterializedRegFlags =
2677 getKillRegState(MaterializedReg != FrameReg);
2678
2679 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2680 // If we know we have a VGPR already, it's more likely the other
2681 // operand is a legal vsrc0.
2682 AddI32
2683 .add(*OtherOp)
2684 .addReg(MaterializedReg, MaterializedRegFlags);
2685 } else {
2686 // Commute operands to avoid violating VOP2 restrictions. This will
2687 // typically happen when using scratch.
2688 AddI32
2689 .addReg(MaterializedReg, MaterializedRegFlags)
2690 .add(*OtherOp);
2691 }
2692
2693 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2694 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2695 AddI32.addImm(0); // clamp
2696
2697 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2698 AddI32.setOperandDead(3); // Dead vcc
2699
2700 MaterializedReg = DstReg;
2701
2702 OtherOp->ChangeToRegister(MaterializedReg, false);
2703 OtherOp->setIsKill(true);
2705 Offset = 0;
2706 } else if (Offset != 0) {
2707 assert(!MaterializedReg);
2709 Offset = 0;
2710 } else {
2711 if (DeadVCC && !HasClamp) {
2712 assert(Offset == 0);
2713
2714 // TODO: Losing kills and implicit operands. Just mutate to copy and
2715 // let lowerCopy deal with it?
2716 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2717 // Folded to an identity copy.
2718 MI->eraseFromParent();
2719 return true;
2720 }
2721
2722 // The immediate value should be in OtherOp
2723 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2724 MI->removeOperand(FIOperandNum);
2725
2726 unsigned NumOps = MI->getNumOperands();
2727 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2728 MI->removeOperand(I);
2729
2730 if (NumDefs == 2)
2731 MI->removeOperand(1);
2732
2733 // The code below can't deal with a mov.
2734 return true;
2735 }
2736
2737 // This folded to a constant, but we have to keep the add around for
2738 // pointless implicit defs or clamp modifier.
2739 FIOp->ChangeToImmediate(0);
2740 }
2741
2742 // Try to improve legality by commuting.
2743 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2744 std::swap(FIOp, OtherOp);
2745 std::swap(FIOperandNum, OtherOpIdx);
2746 }
2747
2748 // We need at most one mov to satisfy the operand constraints. Prefer to
2749 // move the FI operand first, as it may be a literal in a VOP3
2750 // instruction.
2751 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2752 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2753 // If commuting didn't make the operands legal, we need to materialize
2754 // in a register.
2755 // TODO: Can use SGPR on gfx10+ in some cases.
2756 if (!ScavengedVGPR) {
2757 ScavengedVGPR = RS->scavengeRegisterBackwards(
2758 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2759 /*SPAdj=*/0);
2760 }
2761
2762 assert(ScavengedVGPR != DstReg);
2763
2764 MachineOperand &Src = MI->getOperand(SrcIdx);
2765 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2766 .add(Src);
2767
2768 Src.ChangeToRegister(ScavengedVGPR, false);
2769 Src.setIsKill(true);
2770 break;
2771 }
2772 }
2773
2774 // Fold out add of 0 case that can appear in kernels.
2775 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2776 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2777 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2778 }
2779
2780 MI->eraseFromParent();
2781 }
2782
2783 return true;
2784 }
2785 case AMDGPU::S_ADD_I32:
2786 case AMDGPU::S_ADD_U32: {
2787 // TODO: Handle s_or_b32, s_and_b32.
2788 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2789 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2790
2791 assert(FrameReg || MFI->isBottomOfStack());
2792
2793 MachineOperand &DstOp = MI->getOperand(0);
2794 const DebugLoc &DL = MI->getDebugLoc();
2795 Register MaterializedReg = FrameReg;
2796
2797 // Defend against live scc, which should never happen in practice.
2798 bool DeadSCC = MI->getOperand(3).isDead();
2799
2800 Register TmpReg;
2801
2802 // FIXME: Scavenger should figure out that the result register is
2803 // available. Also should do this for the v_add case.
2804 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2805 TmpReg = DstOp.getReg();
2806
2807 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2808 // FIXME: In the common case where the add does not also read its result
2809 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2810 // available.
2811 if (!TmpReg)
2812 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2813 MI, /*RestoreAfter=*/false, 0,
2814 /*AllowSpill=*/false);
2815 if (TmpReg) {
2816 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2817 .addDef(TmpReg, RegState::Renamable)
2818 .addReg(FrameReg)
2819 .addImm(ST.getWavefrontSizeLog2())
2820 .setOperandDead(3); // Set SCC dead
2821 }
2822 MaterializedReg = TmpReg;
2823 }
2824
2825 int64_t Offset = FrameInfo.getObjectOffset(Index);
2826
2827 // For the non-immediate case, we could fall through to the default
2828 // handling, but we do an in-place update of the result register here to
2829 // avoid scavenging another register.
2830 if (OtherOp.isImm()) {
2831 OtherOp.setImm(OtherOp.getImm() + Offset);
2832 Offset = 0;
2833
2834 if (MaterializedReg)
2835 FIOp->ChangeToRegister(MaterializedReg, false);
2836 else
2837 FIOp->ChangeToImmediate(0);
2838 } else if (MaterializedReg) {
2839 // If we can't fold the other operand, do another increment.
2840 Register DstReg = DstOp.getReg();
2841
2842 if (!TmpReg && MaterializedReg == FrameReg) {
2843 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2844 MI, /*RestoreAfter=*/false, 0,
2845 /*AllowSpill=*/false);
2846 DstReg = TmpReg;
2847 }
2848
2849 if (TmpReg) {
2850 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2851 .addDef(DstReg, RegState::Renamable)
2852 .addReg(MaterializedReg, RegState::Kill)
2853 .add(OtherOp);
2854 if (DeadSCC)
2855 AddI32.setOperandDead(3);
2856
2857 MaterializedReg = DstReg;
2858
2859 OtherOp.ChangeToRegister(MaterializedReg, false);
2860 OtherOp.setIsKill(true);
2861 OtherOp.setIsRenamable(true);
2862 }
2864 } else {
2865 // If we don't have any other offset to apply, we can just directly
2866 // interpret the frame index as the offset.
2868 }
2869
2870 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2871 assert(Offset == 0);
2872 MI->removeOperand(3);
2873 MI->removeOperand(OtherOpIdx);
2874 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2875 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2876 assert(Offset == 0);
2877 MI->removeOperand(3);
2878 MI->removeOperand(FIOperandNum);
2879 MI->setDesc(
2880 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2881 }
2882
2883 assert(!FIOp->isFI());
2884 return true;
2885 }
2886 default: {
2887 break;
2888 }
2889 }
2890
2891 int64_t Offset = FrameInfo.getObjectOffset(Index);
2892 if (ST.hasFlatScratchEnabled()) {
2893 if (TII->isFLATScratch(*MI)) {
2894 assert(
2895 (int16_t)FIOperandNum ==
2896 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2897
2898 // The offset is always swizzled, just replace it
2899 if (FrameReg)
2900 FIOp->ChangeToRegister(FrameReg, false);
2901
2903 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2904 int64_t NewOffset = Offset + OffsetOp->getImm();
2905 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2907 OffsetOp->setImm(NewOffset);
2908 if (FrameReg)
2909 return false;
2910 Offset = 0;
2911 }
2912
2913 if (!Offset) {
2914 unsigned Opc = MI->getOpcode();
2915 int NewOpc = -1;
2916 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2918 } else if (ST.hasFlatScratchSTMode()) {
2919 // On GFX10 we have ST mode to use no registers for an address.
2920 // Otherwise we need to materialize 0 into an SGPR.
2922 }
2923
2924 if (NewOpc != -1) {
2925 // removeOperand doesn't fixup tied operand indexes as it goes, so
2926 // it asserts. Untie vdst_in for now and retie them afterwards.
2927 int VDstIn =
2928 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2929 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2930 MI->getOperand(VDstIn).isTied();
2931 if (TiedVDst)
2932 MI->untieRegOperand(VDstIn);
2933
2934 MI->removeOperand(
2935 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2936
2937 if (TiedVDst) {
2938 int NewVDst =
2939 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2940 int NewVDstIn =
2941 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2942 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2943 MI->tieOperands(NewVDst, NewVDstIn);
2944 }
2945 MI->setDesc(TII->get(NewOpc));
2946 return false;
2947 }
2948 }
2949 }
2950
2951 if (!FrameReg) {
2953 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2954 return false;
2955 }
2956
2957 // We need to use register here. Check if we can use an SGPR or need
2958 // a VGPR.
2959 FIOp->ChangeToRegister(AMDGPU::M0, false);
2960 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2961
2962 if (!Offset && FrameReg && UseSGPR) {
2963 FIOp->setReg(FrameReg);
2964 return false;
2965 }
2966
2967 const TargetRegisterClass *RC =
2968 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2969
2970 Register TmpReg =
2971 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2972 FIOp->setReg(TmpReg);
2973 FIOp->setIsKill();
2974
2975 if ((!FrameReg || !Offset) && TmpReg) {
2976 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2977 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2978 if (FrameReg)
2979 MIB.addReg(FrameReg);
2980 else
2981 MIB.addImm(Offset);
2982
2983 return false;
2984 }
2985
2986 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2987 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2988
2989 Register TmpSReg =
2990 UseSGPR ? TmpReg
2991 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2992 MI, false, 0, !UseSGPR);
2993
2994 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
2995 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
2996 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
2997 Register TmpVGPR = RS->scavengeRegisterBackwards(
2998 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
2999
3000 // Materialize the frame register.
3001 auto MIB =
3002 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3003 if (FrameReg)
3004 MIB.addReg(FrameReg);
3005 else
3006 MIB.addImm(Offset);
3007
3008 // Add the offset to the frame register.
3009 if (FrameReg && Offset)
3010 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3011 .addReg(FrameReg, RegState::Kill)
3012 .addImm(Offset);
3013
3014 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3015 .add(MI->getOperand(0)) // $vdata
3016 .addReg(TmpVGPR) // $vaddr
3017 .addImm(0) // Offset
3018 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3019 MI->eraseFromParent();
3020 return true;
3021 }
3022 report_fatal_error("Cannot scavenge register in FI elimination!");
3023 }
3024
3025 if (!TmpSReg) {
3026 // Use frame register and restore it after.
3027 TmpSReg = FrameReg;
3028 FIOp->setReg(FrameReg);
3029 FIOp->setIsKill(false);
3030 }
3031
3032 if (NeedSaveSCC) {
3033 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3034 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3035 .addReg(FrameReg)
3036 .addImm(Offset);
3037 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3038 .addReg(TmpSReg)
3039 .addImm(0);
3040 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3041 .addImm(0)
3042 .addReg(TmpSReg);
3043 } else {
3044 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3045 .addReg(FrameReg)
3046 .addImm(Offset);
3047 }
3048
3049 if (!UseSGPR)
3050 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3051 .addReg(TmpSReg, RegState::Kill);
3052
3053 if (TmpSReg == FrameReg) {
3054 // Undo frame register modification.
3055 if (NeedSaveSCC &&
3056 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3058 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3059 TmpSReg)
3060 .addReg(FrameReg)
3061 .addImm(-Offset);
3062 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3063 .addReg(TmpSReg)
3064 .addImm(0);
3065 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3066 TmpSReg)
3067 .addImm(0)
3068 .addReg(TmpSReg);
3069 } else {
3070 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3071 FrameReg)
3072 .addReg(FrameReg)
3073 .addImm(-Offset);
3074 }
3075 }
3076
3077 return false;
3078 }
3079
3080 bool IsMUBUF = TII->isMUBUF(*MI);
3081
3082 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3083 // Convert to a swizzled stack address by scaling by the wave size.
3084 // In an entry function/kernel the offset is already swizzled.
3085 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3086 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3087 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3088 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3089 ? &AMDGPU::SReg_32RegClass
3090 : &AMDGPU::VGPR_32RegClass;
3091 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3092 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3093 MI->getOpcode() == AMDGPU::S_MOV_B32;
3094 Register ResultReg =
3095 IsCopy ? MI->getOperand(0).getReg()
3096 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3097
3098 int64_t Offset = FrameInfo.getObjectOffset(Index);
3099 if (Offset == 0) {
3100 unsigned OpCode =
3101 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3102 Register TmpResultReg = ResultReg;
3103 if (IsSALU && LiveSCC) {
3104 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3105 MI, false, 0);
3106 }
3107
3108 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3109 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3110 // For V_LSHRREV, the operands are reversed (the shift count goes
3111 // first).
3112 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3113 else
3114 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3115 if (IsSALU && !LiveSCC)
3116 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3117 if (IsSALU && LiveSCC) {
3118 Register NewDest;
3119 if (IsCopy) {
3120 assert(ResultReg.isPhysical());
3121 NewDest = ResultReg;
3122 } else {
3123 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3124 Shift, false, 0);
3125 }
3126 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3127 .addReg(TmpResultReg);
3128 ResultReg = NewDest;
3129 }
3130 } else {
3132 if (!IsSALU) {
3133 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3134 nullptr) {
3135 // Reuse ResultReg in intermediate step.
3136 Register ScaledReg = ResultReg;
3137
3138 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3139 ScaledReg)
3140 .addImm(ST.getWavefrontSizeLog2())
3141 .addReg(FrameReg);
3142
3143 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3144
3145 // TODO: Fold if use instruction is another add of a constant.
3146 if (IsVOP2 ||
3147 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3148 // FIXME: This can fail
3149 MIB.addImm(Offset);
3150 MIB.addReg(ScaledReg, RegState::Kill);
3151 if (!IsVOP2)
3152 MIB.addImm(0); // clamp bit
3153 } else {
3154 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3155 "Need to reuse carry out register");
3156
3157 // Use scavenged unused carry out as offset register.
3158 Register ConstOffsetReg;
3159 if (!isWave32)
3160 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3161 else
3162 ConstOffsetReg = MIB.getReg(1);
3163
3164 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3165 ConstOffsetReg)
3166 .addImm(Offset);
3167 MIB.addReg(ConstOffsetReg, RegState::Kill);
3168 MIB.addReg(ScaledReg, RegState::Kill);
3169 MIB.addImm(0); // clamp bit
3170 }
3171 }
3172 }
3173 if (!MIB || IsSALU) {
3174 // We have to produce a carry out, and there isn't a free SGPR pair
3175 // for it. We can keep the whole computation on the SALU to avoid
3176 // clobbering an additional register at the cost of an extra mov.
3177
3178 // We may have 1 free scratch SGPR even though a carry out is
3179 // unavailable. Only one additional mov is needed.
3180 Register TmpScaledReg = IsCopy && IsSALU
3181 ? ResultReg
3182 : RS->scavengeRegisterBackwards(
3183 AMDGPU::SReg_32_XM0RegClass, MI,
3184 false, 0, /*AllowSpill=*/false);
3185 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3186 Register TmpResultReg = ScaledReg;
3187
3188 if (!LiveSCC) {
3189 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3190 .addReg(FrameReg)
3191 .addImm(ST.getWavefrontSizeLog2());
3192 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3193 .addReg(TmpResultReg, RegState::Kill)
3194 .addImm(Offset);
3195 } else {
3196 TmpResultReg = RS->scavengeRegisterBackwards(
3197 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3198
3200 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3201 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3202 TmpResultReg)
3203 .addImm(ST.getWavefrontSizeLog2())
3204 .addReg(FrameReg);
3205 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3206 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3207 .addImm(Offset);
3208 Add.addReg(ResultReg, RegState::Kill)
3209 .addReg(TmpResultReg, RegState::Kill)
3210 .addImm(0);
3211 } else
3212 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3213 } else {
3214 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3215 "offset is unsafe for v_mad_u32_u24");
3216
3217 // We start with a frame pointer with a wave space value, and
3218 // an offset in lane-space. We are materializing a lane space
3219 // value. We can either do a right shift of the frame pointer
3220 // to get to lane space, or a left shift of the offset to get
3221 // to wavespace. We can right shift after the computation to
3222 // get back to the desired per-lane value. We are using the
3223 // mad_u32_u24 primarily as an add with no carry out clobber.
3224 bool IsInlinableLiteral =
3225 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3226 if (!IsInlinableLiteral) {
3227 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3228 TmpResultReg)
3229 .addImm(Offset);
3230 }
3231
3232 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3233 TmpResultReg);
3234
3235 if (!IsInlinableLiteral) {
3236 Add.addReg(TmpResultReg, RegState::Kill);
3237 } else {
3238 // We fold the offset into mad itself if its inlinable.
3239 Add.addImm(Offset);
3240 }
3241 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3242 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3243 TmpResultReg)
3244 .addImm(ST.getWavefrontSizeLog2())
3245 .addReg(TmpResultReg);
3246 }
3247
3248 Register NewDest;
3249 if (IsCopy) {
3250 NewDest = ResultReg;
3251 } else {
3252 NewDest = RS->scavengeRegisterBackwards(
3253 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3254 /*AllowSpill=*/true);
3255 }
3256
3257 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3258 NewDest)
3259 .addReg(TmpResultReg);
3260 ResultReg = NewDest;
3261 }
3262 if (!IsSALU)
3263 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3264 .addReg(TmpResultReg, RegState::Kill);
3265 // If there were truly no free SGPRs, we need to undo everything.
3266 if (!TmpScaledReg.isValid()) {
3267 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3268 .addReg(ScaledReg, RegState::Kill)
3269 .addImm(-Offset);
3270 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3271 .addReg(FrameReg)
3272 .addImm(ST.getWavefrontSizeLog2());
3273 }
3274 }
3275 }
3276
3277 // Don't introduce an extra copy if we're just materializing in a mov.
3278 if (IsCopy) {
3279 MI->eraseFromParent();
3280 return true;
3281 }
3282 FIOp->ChangeToRegister(ResultReg, false, false, true);
3283 return false;
3284 }
3285
3286 if (IsMUBUF) {
3287 // Disable offen so we don't need a 0 vgpr base.
3288 assert(
3289 static_cast<int>(FIOperandNum) ==
3290 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3291
3292 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3293 assert((SOffset.isImm() && SOffset.getImm() == 0));
3294
3295 if (FrameReg != AMDGPU::NoRegister)
3296 SOffset.ChangeToRegister(FrameReg, false);
3297
3298 int64_t Offset = FrameInfo.getObjectOffset(Index);
3299 int64_t OldImm =
3300 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3301 int64_t NewOffset = OldImm + Offset;
3302
3303 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3304 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3305 MI->eraseFromParent();
3306 return true;
3307 }
3308 }
3309
3310 // If the offset is simply too big, don't convert to a scratch wave offset
3311 // relative index.
3312
3314 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3315 Register TmpReg =
3316 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3317 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3318 .addImm(Offset);
3319 FIOp->ChangeToRegister(TmpReg, false, false, true);
3320 }
3321
3322 return false;
3323}
3324
3328
3330 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3331}
3332
3334 return getRegBitWidth(RC.getID());
3335}
3336
3337static const TargetRegisterClass *
3339 if (BitWidth == 64)
3340 return &AMDGPU::VReg_64RegClass;
3341 if (BitWidth == 96)
3342 return &AMDGPU::VReg_96RegClass;
3343 if (BitWidth == 128)
3344 return &AMDGPU::VReg_128RegClass;
3345 if (BitWidth == 160)
3346 return &AMDGPU::VReg_160RegClass;
3347 if (BitWidth == 192)
3348 return &AMDGPU::VReg_192RegClass;
3349 if (BitWidth == 224)
3350 return &AMDGPU::VReg_224RegClass;
3351 if (BitWidth == 256)
3352 return &AMDGPU::VReg_256RegClass;
3353 if (BitWidth == 288)
3354 return &AMDGPU::VReg_288RegClass;
3355 if (BitWidth == 320)
3356 return &AMDGPU::VReg_320RegClass;
3357 if (BitWidth == 352)
3358 return &AMDGPU::VReg_352RegClass;
3359 if (BitWidth == 384)
3360 return &AMDGPU::VReg_384RegClass;
3361 if (BitWidth == 512)
3362 return &AMDGPU::VReg_512RegClass;
3363 if (BitWidth == 1024)
3364 return &AMDGPU::VReg_1024RegClass;
3365
3366 return nullptr;
3367}
3368
3369static const TargetRegisterClass *
3371 if (BitWidth == 64)
3372 return &AMDGPU::VReg_64_Align2RegClass;
3373 if (BitWidth == 96)
3374 return &AMDGPU::VReg_96_Align2RegClass;
3375 if (BitWidth == 128)
3376 return &AMDGPU::VReg_128_Align2RegClass;
3377 if (BitWidth == 160)
3378 return &AMDGPU::VReg_160_Align2RegClass;
3379 if (BitWidth == 192)
3380 return &AMDGPU::VReg_192_Align2RegClass;
3381 if (BitWidth == 224)
3382 return &AMDGPU::VReg_224_Align2RegClass;
3383 if (BitWidth == 256)
3384 return &AMDGPU::VReg_256_Align2RegClass;
3385 if (BitWidth == 288)
3386 return &AMDGPU::VReg_288_Align2RegClass;
3387 if (BitWidth == 320)
3388 return &AMDGPU::VReg_320_Align2RegClass;
3389 if (BitWidth == 352)
3390 return &AMDGPU::VReg_352_Align2RegClass;
3391 if (BitWidth == 384)
3392 return &AMDGPU::VReg_384_Align2RegClass;
3393 if (BitWidth == 512)
3394 return &AMDGPU::VReg_512_Align2RegClass;
3395 if (BitWidth == 1024)
3396 return &AMDGPU::VReg_1024_Align2RegClass;
3397
3398 return nullptr;
3399}
3400
3401const TargetRegisterClass *
3403 if (BitWidth == 1)
3404 return &AMDGPU::VReg_1RegClass;
3405 if (BitWidth == 16)
3406 return &AMDGPU::VGPR_16RegClass;
3407 if (BitWidth == 32)
3408 return &AMDGPU::VGPR_32RegClass;
3409 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3411}
3412
3413const TargetRegisterClass *
3415 if (BitWidth <= 32)
3416 return &AMDGPU::VGPR_32_Lo256RegClass;
3417 if (BitWidth <= 64)
3418 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3419 if (BitWidth <= 96)
3420 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3421 if (BitWidth <= 128)
3422 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3423 if (BitWidth <= 160)
3424 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3425 if (BitWidth <= 192)
3426 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3427 if (BitWidth <= 224)
3428 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3429 if (BitWidth <= 256)
3430 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3431 if (BitWidth <= 288)
3432 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3433 if (BitWidth <= 320)
3434 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3435 if (BitWidth <= 352)
3436 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3437 if (BitWidth <= 384)
3438 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3439 if (BitWidth <= 512)
3440 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3441 if (BitWidth <= 1024)
3442 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3443
3444 return nullptr;
3445}
3446
3447static const TargetRegisterClass *
3449 if (BitWidth == 64)
3450 return &AMDGPU::AReg_64RegClass;
3451 if (BitWidth == 96)
3452 return &AMDGPU::AReg_96RegClass;
3453 if (BitWidth == 128)
3454 return &AMDGPU::AReg_128RegClass;
3455 if (BitWidth == 160)
3456 return &AMDGPU::AReg_160RegClass;
3457 if (BitWidth == 192)
3458 return &AMDGPU::AReg_192RegClass;
3459 if (BitWidth == 224)
3460 return &AMDGPU::AReg_224RegClass;
3461 if (BitWidth == 256)
3462 return &AMDGPU::AReg_256RegClass;
3463 if (BitWidth == 288)
3464 return &AMDGPU::AReg_288RegClass;
3465 if (BitWidth == 320)
3466 return &AMDGPU::AReg_320RegClass;
3467 if (BitWidth == 352)
3468 return &AMDGPU::AReg_352RegClass;
3469 if (BitWidth == 384)
3470 return &AMDGPU::AReg_384RegClass;
3471 if (BitWidth == 512)
3472 return &AMDGPU::AReg_512RegClass;
3473 if (BitWidth == 1024)
3474 return &AMDGPU::AReg_1024RegClass;
3475
3476 return nullptr;
3477}
3478
3479static const TargetRegisterClass *
3481 if (BitWidth == 64)
3482 return &AMDGPU::AReg_64_Align2RegClass;
3483 if (BitWidth == 96)
3484 return &AMDGPU::AReg_96_Align2RegClass;
3485 if (BitWidth == 128)
3486 return &AMDGPU::AReg_128_Align2RegClass;
3487 if (BitWidth == 160)
3488 return &AMDGPU::AReg_160_Align2RegClass;
3489 if (BitWidth == 192)
3490 return &AMDGPU::AReg_192_Align2RegClass;
3491 if (BitWidth == 224)
3492 return &AMDGPU::AReg_224_Align2RegClass;
3493 if (BitWidth == 256)
3494 return &AMDGPU::AReg_256_Align2RegClass;
3495 if (BitWidth == 288)
3496 return &AMDGPU::AReg_288_Align2RegClass;
3497 if (BitWidth == 320)
3498 return &AMDGPU::AReg_320_Align2RegClass;
3499 if (BitWidth == 352)
3500 return &AMDGPU::AReg_352_Align2RegClass;
3501 if (BitWidth == 384)
3502 return &AMDGPU::AReg_384_Align2RegClass;
3503 if (BitWidth == 512)
3504 return &AMDGPU::AReg_512_Align2RegClass;
3505 if (BitWidth == 1024)
3506 return &AMDGPU::AReg_1024_Align2RegClass;
3507
3508 return nullptr;
3509}
3510
3511const TargetRegisterClass *
3513 if (BitWidth == 16)
3514 return &AMDGPU::AGPR_LO16RegClass;
3515 if (BitWidth == 32)
3516 return &AMDGPU::AGPR_32RegClass;
3517 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3519}
3520
3521static const TargetRegisterClass *
3523 if (BitWidth == 64)
3524 return &AMDGPU::AV_64RegClass;
3525 if (BitWidth == 96)
3526 return &AMDGPU::AV_96RegClass;
3527 if (BitWidth == 128)
3528 return &AMDGPU::AV_128RegClass;
3529 if (BitWidth == 160)
3530 return &AMDGPU::AV_160RegClass;
3531 if (BitWidth == 192)
3532 return &AMDGPU::AV_192RegClass;
3533 if (BitWidth == 224)
3534 return &AMDGPU::AV_224RegClass;
3535 if (BitWidth == 256)
3536 return &AMDGPU::AV_256RegClass;
3537 if (BitWidth == 288)
3538 return &AMDGPU::AV_288RegClass;
3539 if (BitWidth == 320)
3540 return &AMDGPU::AV_320RegClass;
3541 if (BitWidth == 352)
3542 return &AMDGPU::AV_352RegClass;
3543 if (BitWidth == 384)
3544 return &AMDGPU::AV_384RegClass;
3545 if (BitWidth == 512)
3546 return &AMDGPU::AV_512RegClass;
3547 if (BitWidth == 1024)
3548 return &AMDGPU::AV_1024RegClass;
3549
3550 return nullptr;
3551}
3552
3553static const TargetRegisterClass *
3555 if (BitWidth == 64)
3556 return &AMDGPU::AV_64_Align2RegClass;
3557 if (BitWidth == 96)
3558 return &AMDGPU::AV_96_Align2RegClass;
3559 if (BitWidth == 128)
3560 return &AMDGPU::AV_128_Align2RegClass;
3561 if (BitWidth == 160)
3562 return &AMDGPU::AV_160_Align2RegClass;
3563 if (BitWidth == 192)
3564 return &AMDGPU::AV_192_Align2RegClass;
3565 if (BitWidth == 224)
3566 return &AMDGPU::AV_224_Align2RegClass;
3567 if (BitWidth == 256)
3568 return &AMDGPU::AV_256_Align2RegClass;
3569 if (BitWidth == 288)
3570 return &AMDGPU::AV_288_Align2RegClass;
3571 if (BitWidth == 320)
3572 return &AMDGPU::AV_320_Align2RegClass;
3573 if (BitWidth == 352)
3574 return &AMDGPU::AV_352_Align2RegClass;
3575 if (BitWidth == 384)
3576 return &AMDGPU::AV_384_Align2RegClass;
3577 if (BitWidth == 512)
3578 return &AMDGPU::AV_512_Align2RegClass;
3579 if (BitWidth == 1024)
3580 return &AMDGPU::AV_1024_Align2RegClass;
3581
3582 return nullptr;
3583}
3584
3585const TargetRegisterClass *
3587 if (BitWidth == 32)
3588 return &AMDGPU::AV_32RegClass;
3589 return ST.needsAlignedVGPRs()
3592}
3593
3594const TargetRegisterClass *
3596 // TODO: In principle this should use AV classes for gfx908 too. This is
3597 // limited to 90a+ to avoid regressing special case copy optimizations which
3598 // need new handling. The core issue is that it's not possible to directly
3599 // copy between AGPRs on gfx908, and the current optimizations around that
3600 // expect to see copies to VGPR.
3601 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3603}
3604
3605const TargetRegisterClass *
3607 if (BitWidth == 16 || BitWidth == 32)
3608 return &AMDGPU::SReg_32RegClass;
3609 if (BitWidth == 64)
3610 return &AMDGPU::SReg_64RegClass;
3611 if (BitWidth == 96)
3612 return &AMDGPU::SGPR_96RegClass;
3613 if (BitWidth == 128)
3614 return &AMDGPU::SGPR_128RegClass;
3615 if (BitWidth == 160)
3616 return &AMDGPU::SGPR_160RegClass;
3617 if (BitWidth == 192)
3618 return &AMDGPU::SGPR_192RegClass;
3619 if (BitWidth == 224)
3620 return &AMDGPU::SGPR_224RegClass;
3621 if (BitWidth == 256)
3622 return &AMDGPU::SGPR_256RegClass;
3623 if (BitWidth == 288)
3624 return &AMDGPU::SGPR_288RegClass;
3625 if (BitWidth == 320)
3626 return &AMDGPU::SGPR_320RegClass;
3627 if (BitWidth == 352)
3628 return &AMDGPU::SGPR_352RegClass;
3629 if (BitWidth == 384)
3630 return &AMDGPU::SGPR_384RegClass;
3631 if (BitWidth == 512)
3632 return &AMDGPU::SGPR_512RegClass;
3633 if (BitWidth == 1024)
3634 return &AMDGPU::SGPR_1024RegClass;
3635
3636 return nullptr;
3637}
3638
3640 Register Reg) const {
3641 const TargetRegisterClass *RC;
3642 if (Reg.isVirtual())
3643 RC = MRI.getRegClass(Reg);
3644 else
3645 RC = getPhysRegBaseClass(Reg);
3646 return RC && isSGPRClass(RC);
3647}
3648
3649const TargetRegisterClass *
3651 unsigned Size = getRegSizeInBits(*SRC);
3652
3653 switch (SRC->getID()) {
3654 default:
3655 break;
3656 case AMDGPU::VS_32_Lo256RegClassID:
3657 case AMDGPU::VS_64_Lo256RegClassID:
3658 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3659 }
3660
3661 const TargetRegisterClass *VRC =
3662 getAllocatableClass(getVGPRClassForBitWidth(Size));
3663 assert(VRC && "Invalid register class size");
3664 return VRC;
3665}
3666
3667const TargetRegisterClass *
3669 unsigned Size = getRegSizeInBits(*SRC);
3671 assert(ARC && "Invalid register class size");
3672 return ARC;
3673}
3674
3675const TargetRegisterClass *
3677 unsigned Size = getRegSizeInBits(*SRC);
3679 assert(ARC && "Invalid register class size");
3680 return ARC;
3681}
3682
3683const TargetRegisterClass *
3685 unsigned Size = getRegSizeInBits(*VRC);
3686 if (Size == 32)
3687 return &AMDGPU::SGPR_32RegClass;
3689 assert(SRC && "Invalid register class size");
3690 return SRC;
3691}
3692
3693const TargetRegisterClass *
3695 const TargetRegisterClass *SubRC,
3696 unsigned SubIdx) const {
3697 // Ensure this subregister index is aligned in the super register.
3698 const TargetRegisterClass *MatchRC =
3699 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3700 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3701}
3702
3703bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3706 return !ST.hasMFMAInlineLiteralBug();
3707
3708 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3709 OpType <= AMDGPU::OPERAND_SRC_LAST;
3710}
3711
3712bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3713 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3714 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3716}
3717
3718/// Returns a lowest register that is not used at any point in the function.
3719/// If all registers are used, then this function will return
3720/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3721/// highest unused register.
3724 const MachineFunction &MF, bool ReserveHighestRegister) const {
3725 if (ReserveHighestRegister) {
3726 for (MCRegister Reg : reverse(*RC))
3727 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3728 return Reg;
3729 } else {
3730 for (MCRegister Reg : *RC)
3731 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3732 return Reg;
3733 }
3734 return MCRegister();
3735}
3736
3738 const RegisterBankInfo &RBI,
3739 Register Reg) const {
3740 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3741 if (!RB)
3742 return false;
3743
3744 return !RBI.isDivergentRegBank(RB);
3745}
3746
3748 unsigned EltSize) const {
3749 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3750 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3751
3752 const unsigned RegHalves = RegBitWidth / 16;
3753 const unsigned EltHalves = EltSize / 2;
3754 assert(RegSplitParts.size() + 1 >= EltHalves);
3755
3756 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3757 const unsigned NumParts = RegHalves / EltHalves;
3758
3759 return ArrayRef(Parts.data(), NumParts);
3760}
3761
3764 Register Reg) const {
3765 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3766}
3767
3768const TargetRegisterClass *
3770 const MachineOperand &MO) const {
3771 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3772 return getSubRegisterClass(SrcRC, MO.getSubReg());
3773}
3774
3776 Register Reg) const {
3777 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3778 // Registers without classes are unaddressable, SGPR-like registers.
3779 return RC && isVGPRClass(RC);
3780}
3781
3783 Register Reg) const {
3784 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3785
3786 // Registers without classes are unaddressable, SGPR-like registers.
3787 return RC && isAGPRClass(RC);
3788}
3789
3791 MachineFunction &MF) const {
3792 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3793 switch (RC->getID()) {
3794 default:
3795 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3796 case AMDGPU::VGPR_32RegClassID:
3797 return std::min(
3798 ST.getMaxNumVGPRs(
3799 MinOcc,
3801 ST.getMaxNumVGPRs(MF));
3802 case AMDGPU::SGPR_32RegClassID:
3803 case AMDGPU::SGPR_LO16RegClassID:
3804 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3805 }
3806}
3807
3809 unsigned Idx) const {
3810 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3811 case AMDGPU::RegisterPressureSets::VGPR_32:
3812 case AMDGPU::RegisterPressureSets::AGPR_32:
3813 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3814 const_cast<MachineFunction &>(MF));
3815 case AMDGPU::RegisterPressureSets::SReg_32:
3816 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3817 const_cast<MachineFunction &>(MF));
3818 }
3819
3820 llvm_unreachable("Unexpected register pressure set!");
3821}
3822
3823const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
3824 static const int Empty[] = { -1 };
3825
3826 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
3827 return Empty;
3828
3829 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3830}
3831
// Supplies allocation-order hints for 16-bit/32-bit paired registers.
// Size32 hint: prefer the 32-bit VGPR whose lo16 half overlaps the paired
// register. Size16 hint: prefer the lo16 half of the paired 32-bit register,
// or, if the pair is still unassigned, any allocatable lo16 VGPR_16 register.
// Returns false so the allocator still considers the rest of Order.
// NOTE(review): the signature's first line (original line 3832) and the
// SmallVectorImpl<MCPhysReg> &Hints parameter line (original line 3834) were
// lost during extraction -- code below kept byte-identical.
3833 ArrayRef<MCPhysReg> Order,
3835 const MachineFunction &MF,
3836 const VirtRegMap *VRM,
3837 const LiveRegMatrix *Matrix) const {
3838
3839 const MachineRegisterInfo &MRI = MF.getRegInfo();
3840 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3841
// Hint.first is the target-specific hint kind, Hint.second the partner reg.
3842 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3843
3844 switch (Hint.first) {
3845 case AMDGPURI::Size32: {
3846 Register Paired = Hint.second;
3847 assert(Paired);
3848 Register PairedPhys;
// Resolve the partner to a physreg: directly if physical, via the virtual
// register map if it has already been assigned.
3849 if (Paired.isPhysical()) {
3850 PairedPhys =
3851 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3852 } else if (VRM && VRM->hasPhys(Paired)) {
3853 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3854 &AMDGPU::VGPR_32RegClass);
3855 }
3856
3857 // Prefer the paired physreg.
3858 if (PairedPhys)
3859 // isLo(Paired) is implicitly true here from the API of
3860 // getMatchingSuperReg.
3861 Hints.push_back(PairedPhys);
3862 return false;
3863 }
3864 case AMDGPURI::Size16: {
3865 Register Paired = Hint.second;
3866 assert(Paired);
3867 Register PairedPhys;
// For a 16-bit use, the hint is the lo16 subregister of the partner's
// (assigned) 32-bit register.
3868 if (Paired.isPhysical()) {
3869 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3870 } else if (VRM && VRM->hasPhys(Paired)) {
3871 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3872 }
3873
3874 // First prefer the paired physreg.
3875 if (PairedPhys)
3876 Hints.push_back(PairedPhys);
3877 else {
3878 // Add all the lo16 physregs.
3879 // When the Paired operand has not yet been assigned a physreg it is
3880 // better to try putting VirtReg in a lo16 register, because possibly
3881 // later Paired can be assigned to the overlapping register and the COPY
3882 // can be eliminated.
3883 for (MCPhysReg PhysReg : Order) {
3884 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3885 continue;
3886 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3887 !MRI.isReserved(PhysReg))
3888 Hints.push_back(PhysReg);
3889 }
3890 }
3891 return false;
3892 }
3893 default:
// No target-specific hint: fall back to the generic implementation.
3894 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3895 VRM);
3896 }
3897}
3898
3900 // Not a callee saved register.
3901 return AMDGPU::SGPR30_SGPR31;
3902}
3903
3904const TargetRegisterClass *
3906 const RegisterBank &RB) const {
3907 switch (RB.getID()) {
3908 case AMDGPU::VGPRRegBankID:
3910 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3911 case AMDGPU::VCCRegBankID:
3912 assert(Size == 1);
3913 return getWaveMaskRegClass();
3914 case AMDGPU::SGPRRegBankID:
3915 return getSGPRClassForBitWidth(std::max(32u, Size));
3916 case AMDGPU::AGPRRegBankID:
3917 return getAGPRClassForBitWidth(std::max(32u, Size));
3918 default:
3919 llvm_unreachable("unknown register bank");
3920 }
3921}
3922
3923const TargetRegisterClass *
3925 const MachineRegisterInfo &MRI) const {
3926 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3927 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3928 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3929
3930 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3931 return getAllocatableClass(RC);
3932
3933 return nullptr;
3934}
3935
3937 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3938}
3939
3941 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3942}
3943
3945 // VGPR tuples have an alignment requirement on gfx90a variants.
3946 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3947 : &AMDGPU::VReg_64RegClass;
3948}
3949
3950// Find reaching register definition
// Returns the unique definition of (Reg, SubReg) that reaches Use, or null
// if none can be proven via live intervals and the dominator tree.
// NOTE(review): the signature lines (original lines 3951-3953) were lost
// during extraction; per the declaration this takes (Register Reg, unsigned
// SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS).
3954 LiveIntervals *LIS) const {
3955 auto &MDT = LIS->getDomTree();
3956 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3957 SlotIndex DefIdx;
3958
3959 if (Reg.isVirtual()) {
3960 if (!LIS->hasInterval(Reg))
3961 return nullptr;
3962 LiveInterval &LI = LIS->getInterval(Reg);
// Restrict the query to the lanes actually read through SubReg; with no
// subregister, require the full lane mask of the vreg.
3963 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3964 : MRI.getMaxLaneMaskForVReg(Reg);
3965 VNInfo *V = nullptr;
3966 if (LI.hasSubRanges()) {
// Use the first subrange that covers all requested lanes.
3967 for (auto &S : LI.subranges()) {
3968 if ((S.LaneMask & SubLanes) == SubLanes) {
3969 V = S.getVNInfoAt(UseIdx);
3970 break;
3971 }
3972 }
3973 } else {
3974 V = LI.getVNInfoAt(UseIdx);
3975 }
3976 if (!V)
3977 return nullptr;
3978 DefIdx = V->def;
3979 } else {
3980 // Find last def.
// For a physical register, every regunit must have a reaching value; keep
// the dominance-latest def across units.
3981 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3982 LiveRange &LR = LIS->getRegUnit(Unit);
3983 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3984 if (!DefIdx.isValid() ||
3985 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3986 LIS->getInstructionFromIndex(V->def)))
3987 DefIdx = V->def;
3988 } else {
3989 return nullptr;
3990 }
3991 }
3992 }
3993
3994 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3995
// The candidate def must strictly dominate the use to be a reaching def.
3996 if (!Def || !MDT.dominates(Def, &Use))
3997 return nullptr;
3998
3999 assert(Def->modifiesRegister(Reg, this));
4000
4001 return Def;
4002}
4003
4005 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4006
4007 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4008 AMDGPU::SReg_32RegClass,
4009 AMDGPU::AGPR_32RegClass } ) {
4010 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4011 return Super;
4012 }
4013 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4014 &AMDGPU::VGPR_32RegClass)) {
4015 return Super;
4016 }
4017
4018 return AMDGPU::NoRegister;
4019}
4020
4022 if (!ST.needsAlignedVGPRs())
4023 return true;
4024
4025 if (isVGPRClass(&RC))
4026 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4027 if (isAGPRClass(&RC))
4028 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4029 if (isVectorSuperClass(&RC))
4030 return RC.hasSuperClassEq(
4031 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4032
4033 assert(&RC != &AMDGPU::VS_64RegClass);
4034
4035 return true;
4036}
4037
4040 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4041}
4042
4045 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4046}
4047
4050 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4051}
4052
// Returns the alignment, in bits, required when accessing SubReg of a
// register in class RC: capped at 128 for SGPR classes and 32 for VGPR/AGPR
// classes; 0 for any other register kind.
// NOTE(review): the signature's second line (original line 4054) and one
// case label (original line 4061) were lost during extraction -- confirm
// against the upstream source before editing.
4053unsigned
4055 unsigned SubReg) const {
// Dispatch on the register kind encoded in the class's target flags.
4056 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4057 case SIRCFlags::HasSGPR:
4058 return std::min(128u, getSubRegIdxSize(SubReg));
4059 case SIRCFlags::HasAGPR:
4060 case SIRCFlags::HasVGPR:
4062 return std::min(32u, getSubRegIdxSize(SubReg));
4063 default:
4064 break;
4065 }
4066 return 0;
4067}
4068
4070 const TargetRegisterClass &RC,
4071 bool IncludeCalls) const {
4072 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
4074 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4075 ? RC.getRegisters().take_front(NumArchVGPRs)
4076 : RC.getRegisters();
4077 for (MCPhysReg Reg : reverse(Registers))
4078 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4079 return getHWRegIndex(Reg) + 1;
4080 return 0;
4081}
4082
4085 const MachineFunction &MF) const {
4087 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4088 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4089 RegFlags.push_back("WWM_REG");
4090 return RegFlags;
4091}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:487
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
bool test(unsigned Idx) const
Definition BitVector.h:480
bool empty() const
empty - Tests whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(uint8_t Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns a lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
If OnlyToVGPR is true, this will only succeed if this manages to find a free VGPR lane to spill.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same...
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal cal...
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physica...
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
returns true if the specified virtual register is mapped to a physical register
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:252
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:258
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:259
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:253
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Offset
Definition DWP.cpp:532
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:26
@ HasVGPR
Definition SIDefines.h:24
@ RegKindMask
Definition SIDefines.h:29
@ HasAGPR
Definition SIDefines.h:25
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:57
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67