1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
15#include "AMDGPURegisterBankInfo.h"
16#include "GCNSubtarget.h"
17#include "MCTargetDesc/AMDGPUInstPrinter.h"
18#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19#include "SIMachineFunctionInfo.h"
20#include "SIRegisterInfo.h"
21#include "llvm/CodeGen/LiveIntervals.h"
22#include "llvm/CodeGen/LiveRegUnits.h"
23#include "llvm/CodeGen/MachineDominators.h"
24#include "llvm/CodeGen/MachineFrameInfo.h"
25#include "llvm/CodeGen/RegisterScavenging.h"
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
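// For example, a request for 2 DWORDs maps to SubRegFromChannelTableWidthMap[2]
// == 2, i.e. row 1 of SubRegFromChannelTable, where channel 4 yields sub4_sub5;
// unsupported widths (e.g. 10 DWORDs) map to 0 and hit the "Not implemented"
// assert in getSubRegFromChannel().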
47
48namespace llvm {
49
50// A temporary struct to spill SGPRs.
51// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
52// just v_writelane and v_readlane.
53//
54// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
55// is saved to scratch (or the other way around for loads).
56// For this, a VGPR is required where the needed lanes can be clobbered. The
57// RegScavenger can provide a VGPR where currently active lanes can be
58// clobbered, but we still need to save inactive lanes.
59// The high-level steps are:
60// - Try to scavenge SGPR(s) to save exec
61// - Try to scavenge VGPR
62// - Save needed, all or inactive lanes of a TmpVGPR
63// - Spill/Restore SGPRs using TmpVGPR
64// - Restore TmpVGPR
65//
66// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
67// cannot scavenge temporary SGPRs to save exec, we use the following code:
68// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
69// s_not exec, exec
70// buffer_store_dword TmpVGPR ; save inactive lanes
71// s_not exec, exec
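// For example, spilling s[4:5] to free VGPR lanes only is roughly:
// v_writelane_b32 v0, s4, 0
// v_writelane_b32 v0, s5, 1
// The memory path additionally stores/reloads the VGPR to/from a scratch slot,
// using the exec manipulation shown above when no SGPR can be scavenged.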
72struct SGPRSpillBuilder {
73 struct PerVGPRData {
74 unsigned PerVGPR;
75 unsigned NumVGPRs;
76 int64_t VGPRLanes;
77 };
78
79 // The SGPR to save
80 Register SuperReg;
81 MachineBasicBlock::iterator MI;
82 ArrayRef<int16_t> SplitParts;
83 unsigned NumSubRegs;
84 bool IsKill;
85 const DebugLoc &DL;
86
87 /* When spilling to stack */
88 // The SGPRs are written into this VGPR, which is then written to scratch
89 // (or vice versa for loads).
90 Register TmpVGPR = AMDGPU::NoRegister;
91 // Temporary spill slot to save TmpVGPR to.
92 int TmpVGPRIndex = 0;
93 // If TmpVGPR is live before the spill or if it is scavenged.
94 bool TmpVGPRLive = false;
95 // Scavenged SGPR to save EXEC.
96 Register SavedExecReg = AMDGPU::NoRegister;
97 // Stack index to write the SGPRs to.
98 int Index;
99 unsigned EltSize = 4;
100
101 RegScavenger *RS;
102 MachineBasicBlock *MBB;
103 MachineFunction &MF;
104 SIMachineFunctionInfo &MFI;
105 const SIInstrInfo &TII;
106 const SIRegisterInfo &TRI;
107 bool IsWave32;
108 Register ExecReg;
109 unsigned MovOpc;
110 unsigned NotOpc;
111
112 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
113 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
114 RegScavenger *RS)
115 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
116 MI->getOperand(0).isKill(), Index, RS) {}
117
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
120 bool IsKill, int Index, RegScavenger *RS)
121 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
122 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
123 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
124 IsWave32(IsWave32) {
125 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
126 SplitParts = TRI.getRegSplitParts(RC, EltSize);
127 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
128
129 if (IsWave32) {
130 ExecReg = AMDGPU::EXEC_LO;
131 MovOpc = AMDGPU::S_MOV_B32;
132 NotOpc = AMDGPU::S_NOT_B32;
133 } else {
134 ExecReg = AMDGPU::EXEC;
135 MovOpc = AMDGPU::S_MOV_B64;
136 NotOpc = AMDGPU::S_NOT_B64;
137 }
138
139 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
140 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
141 SuperReg != AMDGPU::EXEC && "exec should never spill");
142 }
143
144 PerVGPRData getPerVGPRData() {
145 PerVGPRData Data;
146 Data.PerVGPR = IsWave32 ? 32 : 64;
147 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
148 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
149 return Data;
150 }
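// For example, in wave64 with NumSubRegs == 16 (a 512-bit SGPR tuple), PerVGPR
// is 64, NumVGPRs is 1 and VGPRLanes is 0xFFFF, so only the low 16 lanes of the
// temporary VGPR need to be preserved; with NumSubRegs == 2 the mask is 0x3.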
151
152 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
153 // free.
154 // Writes these instructions if an SGPR can be scavenged:
155 // s_mov_b64 s[6:7], exec ; Save exec
156 // s_mov_b64 exec, 3 ; Wanted lanemask
157 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
158 //
159 // Writes these instructions if no SGPR can be scavenged:
160 // buffer_store_dword v0 ; Only if no free VGPR was found
161 // s_not_b64 exec, exec
162 // buffer_store_dword v0 ; Save inactive lanes
163 // ; exec stays inverted, it is flipped back in
164 // ; restore.
165 void prepare() {
166 // Scavenged temporary VGPR to use. It must be scavenged once for any number
167 // of spilled subregs.
168 // FIXME: The liveness analysis is limited and does not tell if a register
169 // is in use in lanes that are currently inactive. We can never be sure if
170 // a register is actually in use in another lane, so we need to save all
171 // used lanes of the chosen VGPR.
172 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
173 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
174 0, false);
175
176 // Reserve temporary stack slot
177 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
178 if (TmpVGPR) {
179 // Found a register that is dead in the currently active lanes, we only
180 // need to spill inactive lanes.
181 TmpVGPRLive = false;
182 } else {
183 // Pick v0 because it doesn't make a difference.
184 TmpVGPR = AMDGPU::VGPR0;
185 TmpVGPRLive = true;
186 }
187
188 if (TmpVGPRLive) {
189 // We need to inform the scavenger that this index is already in use until
190 // we're done with the custom emergency spill.
191 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
192 }
193
194 // We may end up recursively calling the scavenger, and don't want to re-use
195 // the same register.
196 RS->setRegUsed(TmpVGPR);
197
198 // Try to scavenge SGPRs to save exec
199 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
200 const TargetRegisterClass &RC =
201 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
202 RS->setRegUsed(SuperReg);
203 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
204
205 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
206
207 if (SavedExecReg) {
208 RS->setRegUsed(SavedExecReg);
209 // Set exec to needed lanes
210 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
211 auto I =
212 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
213 if (!TmpVGPRLive)
214 I.addReg(TmpVGPR, RegState::ImplicitDefine);
215 // Spill needed lanes
216 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
217 } else {
218 // The modify and restore of exec clobber SCC, which we would have to save
219 // and restore. FIXME: We probably would need to reserve a register for
220 // this.
221 if (RS->isRegUsed(AMDGPU::SCC))
222 MI->emitError("unhandled SGPR spill to memory");
223
224 // Spill active lanes
225 if (TmpVGPRLive)
226 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
227 /*IsKill*/ false);
228 // Spill inactive lanes
229 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
230 if (!TmpVGPRLive)
231 I.addReg(TmpVGPR, RegState::ImplicitDefine);
232 I->getOperand(2).setIsDead(); // Mark SCC as dead.
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
234 }
235 }
236
237 // Writes these instructions if an SGPR can be scavenged:
238 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
239 // s_waitcnt vmcnt(0) ; If a free VGPR was found
240 // s_mov_b64 exec, s[6:7] ; Restore exec
241 //
242 // Writes these instructions if no SGPR can be scavenged:
243 // buffer_load_dword v0 ; Restore inactive lanes
244 // s_waitcnt vmcnt(0) ; If a free VGPR was found
245 // s_not_b64 exec, exec
246 // buffer_load_dword v0 ; Only if no free VGPR was found
247 void restore() {
248 if (SavedExecReg) {
249 // Restore used lanes
250 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
251 /*IsKill*/ false);
252 // Restore exec
253 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
254 .addReg(SavedExecReg, RegState::Kill);
255 // Add an implicit use of the load so it is not dead.
256 // FIXME This inserts an unnecessary waitcnt
257 if (!TmpVGPRLive) {
258 I.addReg(TmpVGPR, RegState::ImplicitKill);
259 }
260 } else {
261 // Restore inactive lanes
262 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
265 if (!TmpVGPRLive)
266 I.addReg(TmpVGPR, RegState::ImplicitKill);
267 I->getOperand(2).setIsDead(); // Mark SCC as dead.
268
269 // Restore active lanes
270 if (TmpVGPRLive)
271 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
272 }
273
274 // Inform the scavenger where we're releasing our custom scavenged register.
275 if (TmpVGPRLive) {
276 MachineBasicBlock::iterator RestorePt = std::prev(MI);
277 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
278 }
279 }
280
281 // Write TmpVGPR to memory or read TmpVGPR from memory.
282 // Either using a single buffer_load/store if exec is set to the needed mask
283 // or using
284 // buffer_load
285 // s_not exec, exec
286 // buffer_load
287 // s_not exec, exec
288 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
289 if (SavedExecReg) {
290 // Spill needed lanes
291 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
292 } else {
293 // The modify and restore of exec clobber SCC, which we would have to save
294 // and restore. FIXME: We probably would need to reserve a register for
295 // this.
296 if (RS->isRegUsed(AMDGPU::SCC))
297 MI->emitError("unhandled SGPR spill to memory");
298
299 // Spill active lanes
300 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
301 /*IsKill*/ false);
302 // Spill inactive lanes
303 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
304 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
305 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
306 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
307 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
308 }
309 }
310
311 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
312 assert(MBB->getParent() == &MF);
313 MI = NewMI;
314 MBB = NewMBB;
315 }
316};
317
318} // namespace llvm
319
320SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
321 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
322 ST.getAMDGPUDwarfFlavour()),
323 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
324
325 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
326 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
327 (getSubRegIndexLaneMask(AMDGPU::lo16) |
328 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
329 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
330 "getNumCoveredRegs() will not work with generated subreg masks!");
331
332 RegPressureIgnoredUnits.resize(getNumRegUnits());
333 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
334 for (auto Reg : AMDGPU::VGPR_16RegClass) {
335 if (AMDGPU::isHi16Reg(Reg, *this))
336 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
337 }
338
339 // HACK: Until this is fully tablegen'd.
340 static llvm::once_flag InitializeRegSplitPartsFlag;
341
342 static auto InitializeRegSplitPartsOnce = [this]() {
343 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
344 unsigned Size = getSubRegIdxSize(Idx);
345 if (Size & 31)
346 continue;
347 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
348 unsigned Pos = getSubRegIdxOffset(Idx);
349 if (Pos % Size)
350 continue;
351 Pos /= Size;
352 if (Vec.empty()) {
353 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
354 Vec.resize(MaxNumParts);
355 }
356 Vec[Pos] = Idx;
357 }
358 };
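// For example, RegSplitParts[1] describes splitting into 64-bit pieces: for a
// 256-bit register it holds the indices of sub0_sub1, sub2_sub3, sub4_sub5 and
// sub6_sub7 at positions 0..3. Sub-register indices whose offset is not a
// multiple of their size are skipped above.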
359
360 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
361
362 static auto InitializeSubRegFromChannelTableOnce = [this]() {
363 for (auto &Row : SubRegFromChannelTable)
364 Row.fill(AMDGPU::NoSubRegister);
365 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
366 unsigned Width = getSubRegIdxSize(Idx) / 32;
367 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
368 assert(Width < SubRegFromChannelTableWidthMap.size());
369 Width = SubRegFromChannelTableWidthMap[Width];
370 if (Width == 0)
371 continue;
372 unsigned TableIdx = Width - 1;
373 assert(TableIdx < SubRegFromChannelTable.size());
374 assert(Offset < SubRegFromChannelTable[TableIdx].size());
375 SubRegFromChannelTable[TableIdx][Offset] = Idx;
376 }
377 };
378
379 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
380 llvm::call_once(InitializeSubRegFromChannelTableFlag,
381 InitializeSubRegFromChannelTableOnce);
382}
383
384void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
385 MCRegister Reg) const {
386 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
387 Reserved.set(*R);
388}
389
390// Forced to be here by one .inc
391const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
392 const MachineFunction *MF) const {
393 CallingConv::ID CC = MF->getFunction().getCallingConv();
394 switch (CC) {
395 case CallingConv::C:
396 case CallingConv::Fast:
397 case CallingConv::Cold:
398 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
399 : CSR_AMDGPU_SaveList;
400 case CallingConv::AMDGPU_Gfx:
401 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
402 : CSR_AMDGPU_SI_Gfx_SaveList;
403 case CallingConv::AMDGPU_CS_ChainPreserve:
404 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
405 default: {
406 // Dummy to not crash RegisterClassInfo.
407 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
408 return &NoCalleeSavedReg;
409 }
410 }
411}
412
413const MCPhysReg *
414SIRegisterInfo::getCalleeSavedRegsViaCopy(MachineFunction *MF) const {
415 return nullptr;
416}
417
418const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
419 CallingConv::ID CC) const {
420 switch (CC) {
421 case CallingConv::C:
422 case CallingConv::Fast:
423 case CallingConv::Cold:
424 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
425 : CSR_AMDGPU_RegMask;
426 case CallingConv::AMDGPU_Gfx:
427 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
428 : CSR_AMDGPU_SI_Gfx_RegMask;
429 case CallingConv::AMDGPU_CS_Chain:
430 case CallingConv::AMDGPU_CS_ChainPreserve:
431 // Calls to these functions never return, so we can pretend everything is
432 // preserved.
433 return AMDGPU_AllVGPRs_RegMask;
434 default:
435 return nullptr;
436 }
437}
438
439const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
440 return CSR_AMDGPU_NoRegs_RegMask;
441}
442
443bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
444 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
445}
446
447const TargetRegisterClass *
448SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
449 const MachineFunction &MF) const {
450 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
451 // equivalent AV class. If one were used here, the verifier would crash after
452 // RegBankSelect in the GISel flow, because the aligned regclasses are not
453 // fully set up until instruction selection.
454 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
455 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
456 return &AMDGPU::AV_32RegClass;
457 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
458 return &AMDGPU::AV_64RegClass;
459 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
460 RC == &AMDGPU::AReg_64_Align2RegClass)
461 return &AMDGPU::AV_64_Align2RegClass;
462 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
463 return &AMDGPU::AV_96RegClass;
464 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
465 RC == &AMDGPU::AReg_96_Align2RegClass)
466 return &AMDGPU::AV_96_Align2RegClass;
467 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
468 return &AMDGPU::AV_128RegClass;
469 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
470 RC == &AMDGPU::AReg_128_Align2RegClass)
471 return &AMDGPU::AV_128_Align2RegClass;
472 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
473 return &AMDGPU::AV_160RegClass;
474 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
475 RC == &AMDGPU::AReg_160_Align2RegClass)
476 return &AMDGPU::AV_160_Align2RegClass;
477 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
478 return &AMDGPU::AV_192RegClass;
479 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
480 RC == &AMDGPU::AReg_192_Align2RegClass)
481 return &AMDGPU::AV_192_Align2RegClass;
482 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
483 return &AMDGPU::AV_256RegClass;
484 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
485 RC == &AMDGPU::AReg_256_Align2RegClass)
486 return &AMDGPU::AV_256_Align2RegClass;
487 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
488 return &AMDGPU::AV_512RegClass;
489 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
490 RC == &AMDGPU::AReg_512_Align2RegClass)
491 return &AMDGPU::AV_512_Align2RegClass;
492 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
493 return &AMDGPU::AV_1024RegClass;
494 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
495 RC == &AMDGPU::AReg_1024_Align2RegClass)
496 return &AMDGPU::AV_1024_Align2RegClass;
497 }
498
499 return RC;
500}
501
502Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
503 const SIFrameLowering *TFI = ST.getFrameLowering();
504 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
505 // During ISel lowering we always reserve the stack pointer in entry and chain
506 // functions, but never actually want to reference it when accessing our own
507 // frame. If we need a frame pointer we use it, but otherwise we can just use
508 // an immediate "0" which we represent by returning NoRegister.
509 if (FuncInfo->isBottomOfStack()) {
510 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
511 }
512 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
513 : FuncInfo->getStackPtrOffsetReg();
514}
515
516bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
517 // When we need stack realignment, we can't reference off of the
518 // stack pointer, so we reserve a base pointer.
519 const MachineFrameInfo &MFI = MF.getFrameInfo();
520 return MFI.getNumFixedObjects() && shouldRealignStack(MF);
521}
522
523Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
524
525const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
526 return AMDGPU_AllVGPRs_RegMask;
527}
528
529const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
530 return AMDGPU_AllAGPRs_RegMask;
531}
532
533const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
534 return AMDGPU_AllVectorRegs_RegMask;
535}
536
537const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
538 return AMDGPU_AllAllocatableSRegs_RegMask;
539}
540
541unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
542 unsigned NumRegs) {
543 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
544 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
545 assert(NumRegIndex && "Not implemented");
546 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
547 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
548}
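// For example, getSubRegFromChannel(0) returns sub0, getSubRegFromChannel(2, 2)
// returns sub2_sub3, and getSubRegFromChannel(4, 4) returns
// sub4_sub5_sub6_sub7, matching the tables initialized in the constructor.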
549
550MCRegister
551SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
552 const unsigned Align,
553 const TargetRegisterClass *RC) const {
554 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
555 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
556 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
557}
558
559MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
560 const MachineFunction &MF) const {
561 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
562}
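// For example, with ST.getMaxNumSGPRs(MF) == 102 and Align == 4, BaseIdx is
// alignDown(102, 4) - 4 == 96, so the scratch resource descriptor is placed in
// s[96:99], the highest 4-aligned SGPR quad below the SGPR limit.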
563
564BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
565 BitVector Reserved(getNumRegs());
566 Reserved.set(AMDGPU::MODE);
567
568 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
569
570 // Reserve special purpose registers.
571 //
572 // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
573 // this seems likely to result in bugs, so I'm marking them as reserved.
574 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
575 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
576
577 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
578 reserveRegisterTuples(Reserved, AMDGPU::M0);
579
580 // Reserve src_vccz, src_execz, src_scc.
581 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
582 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
583 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
584
585 // Reserve the memory aperture registers
586 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
587 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
588 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
589 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
590
591 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
592 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
593
594 // Reserve xnack_mask registers - support is not implemented in Codegen.
595 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
596
597 // Reserve lds_direct register - support is not implemented in Codegen.
598 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
599
600 // Reserve Trap Handler registers - support is not implemented in Codegen.
601 reserveRegisterTuples(Reserved, AMDGPU::TBA);
602 reserveRegisterTuples(Reserved, AMDGPU::TMA);
603 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
604 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
605 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
606 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
607 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
608 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
609 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
610 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
611
612 // Reserve null register - it shall never be allocated
613 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
614
615 // Reserve SGPRs.
616 //
617 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
618 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
619 for (const TargetRegisterClass *RC : regclasses()) {
620 if (RC->isBaseClass() && isSGPRClass(RC)) {
621 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
622 for (MCPhysReg Reg : *RC) {
623 unsigned Index = getHWRegIndex(Reg);
624 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
625 Reserved.set(Reg);
626 }
627 }
628 }
629
630 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
631 if (ScratchRSrcReg != AMDGPU::NoRegister) {
632 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
633 // need to spill.
634 // TODO: May need to reserve a VGPR if doing LDS spilling.
635 reserveRegisterTuples(Reserved, ScratchRSrcReg);
636 }
637
638 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
639 if (LongBranchReservedReg)
640 reserveRegisterTuples(Reserved, LongBranchReservedReg);
641
642 // We have to assume the SP is needed in case there are calls in the function,
643 // which is detected after the function is lowered. If we aren't really going
644 // to need SP, don't bother reserving it.
645 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
646 if (StackPtrReg) {
647 reserveRegisterTuples(Reserved, StackPtrReg);
648 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
649 }
650
651 MCRegister FrameReg = MFI->getFrameOffsetReg();
652 if (FrameReg) {
653 reserveRegisterTuples(Reserved, FrameReg);
654 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
655 }
656
657 if (hasBasePointer(MF)) {
658 MCRegister BasePtrReg = getBaseRegister();
659 reserveRegisterTuples(Reserved, BasePtrReg);
660 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
661 }
662
663 // FIXME: Use same reserved register introduced in D149775
664 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
665 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
666 if (ExecCopyReg)
667 reserveRegisterTuples(Reserved, ExecCopyReg);
668
669 // Reserve VGPRs/AGPRs.
670 //
671 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
672 unsigned MaxNumAGPRs = MaxNumVGPRs;
673 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
674
675 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
676 // a wave may have up to 512 total vector registers combining together both
677 // VGPRs and AGPRs. Hence, in an entry function without calls and without
678 // AGPRs used within it, it is possible to use the whole vector register
679 // budget for VGPRs.
680 //
681 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
682 // register file accordingly.
683 if (ST.hasGFX90AInsts()) {
684 if (MFI->usesAGPRs(MF)) {
685 MaxNumVGPRs /= 2;
686 MaxNumAGPRs = MaxNumVGPRs;
687 } else {
688 if (MaxNumVGPRs > TotalNumVGPRs) {
689 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
690 MaxNumVGPRs = TotalNumVGPRs;
691 } else
692 MaxNumAGPRs = 0;
693 }
694 }
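// For example, on gfx90a with a 512-register vector budget, a function that
// uses AGPRs is split evenly into 256 VGPRs and 256 AGPRs, while a function
// that does not use AGPRs keeps the full 256 architectural VGPRs and the
// remaining 256 registers of the budget are accounted to AGPRs.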
695
696 for (const TargetRegisterClass *RC : regclasses()) {
697 if (RC->isBaseClass() && isVGPRClass(RC)) {
698 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
699 for (MCPhysReg Reg : *RC) {
700 unsigned Index = getHWRegIndex(Reg);
701 if (Index + NumRegs > MaxNumVGPRs)
702 Reserved.set(Reg);
703 }
704 }
705 }
706
707 // Reserve all the AGPRs if there are no instructions to use it.
708 if (!ST.hasMAIInsts())
709 MaxNumAGPRs = 0;
710 for (const TargetRegisterClass *RC : regclasses()) {
711 if (RC->isBaseClass() && isAGPRClass(RC)) {
712 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
713 for (MCPhysReg Reg : *RC) {
714 unsigned Index = getHWRegIndex(Reg);
715 if (Index + NumRegs > MaxNumAGPRs)
716 Reserved.set(Reg);
717 }
718 }
719 }
720
721 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
722 // VGPR available at all times.
723 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
724 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
725 }
726
727 for (Register Reg : MFI->getWWMReservedRegs())
728 reserveRegisterTuples(Reserved, Reg);
729
730 // FIXME: Stop using reserved registers for this.
731 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
732 reserveRegisterTuples(Reserved, Reg);
733
734 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
735 reserveRegisterTuples(Reserved, Reg);
736
737 return Reserved;
738}
739
740bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
741 MCRegister PhysReg) const {
742 return !MF.getRegInfo().isReserved(PhysReg);
743}
744
745bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
746 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
747 // On entry or in chain functions, the base address is 0, so it can't possibly
748 // need any more alignment.
749
750 // FIXME: Should be able to specify the entry frame alignment per calling
751 // convention instead.
752 if (Info->isBottomOfStack())
753 return false;
754
755 return TargetRegisterInfo::shouldRealignStack(MF);
756}
757
758bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
759 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
760 if (Info->isEntryFunction()) {
761 const MachineFrameInfo &MFI = Fn.getFrameInfo();
762 return MFI.hasStackObjects() || MFI.hasCalls();
763 }
764
765 // May need scavenger for dealing with callee saved registers.
766 return true;
767}
768
769bool SIRegisterInfo::requiresFrameIndexScavenging(
770 const MachineFunction &MF) const {
771 // Do not use frame virtual registers. They used to be used for SGPRs, but
772 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
773 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
774 // spill.
775 return false;
776}
777
778bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
779 const MachineFunction &MF) const {
780 const MachineFrameInfo &MFI = MF.getFrameInfo();
781 return MFI.hasStackObjects();
782}
783
784bool SIRegisterInfo::requiresVirtualBaseRegisters(
785 const MachineFunction &) const {
786 // There are no special dedicated stack or frame pointers.
787 return true;
788}
789
790int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
791 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
792
793 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
794 AMDGPU::OpName::offset);
795 return MI->getOperand(OffIdx).getImm();
796}
797
798int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
799 int Idx) const {
800 switch (MI->getOpcode()) {
801 case AMDGPU::V_ADD_U32_e32:
802 case AMDGPU::V_ADD_U32_e64:
803 case AMDGPU::V_ADD_CO_U32_e32: {
804 int OtherIdx = Idx == 1 ? 2 : 1;
805 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
806 return OtherOp.isImm() ? OtherOp.getImm() : 0;
807 }
808 case AMDGPU::V_ADD_CO_U32_e64: {
809 int OtherIdx = Idx == 2 ? 3 : 2;
810 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
811 return OtherOp.isImm() ? OtherOp.getImm() : 0;
812 }
813 default:
814 break;
815 }
816
817 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
818 return 0;
819
820 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
821 AMDGPU::OpName::vaddr) ||
822 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
823 AMDGPU::OpName::saddr))) &&
824 "Should never see frame index on non-address operand");
825
826 return getScratchInstrOffset(MI);
827}
828
829static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
830 const MachineInstr &MI) {
831 assert(MI.getDesc().isAdd());
832 const MachineOperand &Src0 = MI.getOperand(1);
833 const MachineOperand &Src1 = MI.getOperand(2);
834
835 if (Src0.isFI()) {
836 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
837 Src1.getReg()));
838 }
839
840 if (Src1.isFI()) {
841 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
842 Src0.getReg()));
843 }
844
845 return false;
846}
847
848bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
849 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
850 switch (MI->getOpcode()) {
851 case AMDGPU::V_ADD_U32_e32: {
852 // TODO: We could handle this but it requires work to avoid violating
853 // operand restrictions.
854 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
855 !isFIPlusImmOrVGPR(*this, *MI))
856 return false;
857 [[fallthrough]];
858 }
859 case AMDGPU::V_ADD_U32_e64:
860 // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
861 //
862 // Much of the benefit with the MUBUF handling is we avoid duplicating the
863 // shift of the frame register, which isn't needed with scratch.
864 //
865 // materializeFrameBaseRegister doesn't know the register classes of the
866 // uses, and unconditionally uses an s_add_i32, which will end up using a
867 // copy for the vector uses.
868 return !ST.enableFlatScratch();
869 case AMDGPU::V_ADD_CO_U32_e32:
870 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
871 !isFIPlusImmOrVGPR(*this, *MI))
872 return false;
873 // We can't deal with the case where the carry out has a use (though this
874 // should never happen)
875 return MI->getOperand(3).isDead();
876 case AMDGPU::V_ADD_CO_U32_e64:
877 // TODO: Should we check use_empty instead?
878 return MI->getOperand(1).isDead();
879 default:
880 break;
881 }
882
883 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
884 return false;
885
886 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
887
888 const SIInstrInfo *TII = ST.getInstrInfo();
889 if (SIInstrInfo::isMUBUF(*MI))
890 return !TII->isLegalMUBUFImmOffset(FullOffset);
891
892 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
893 SIInstrFlags::FlatScratch);
894}
895
896Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
897 int FrameIdx,
898 int64_t Offset) const {
899 MachineBasicBlock::iterator Ins = MBB->begin();
900 DebugLoc DL; // Defaults to "unknown"
901
902 if (Ins != MBB->end())
903 DL = Ins->getDebugLoc();
904
905 MachineFunction *MF = MBB->getParent();
906 const SIInstrInfo *TII = ST.getInstrInfo();
907 MachineRegisterInfo &MRI = MF->getRegInfo();
908 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
909 : AMDGPU::V_MOV_B32_e32;
910
911 Register BaseReg = MRI.createVirtualRegister(
912 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
913 : &AMDGPU::VGPR_32RegClass);
914
915 if (Offset == 0) {
916 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
917 .addFrameIndex(FrameIdx);
918 return BaseReg;
919 }
920
921 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
922
923 Register FIReg = MRI.createVirtualRegister(
924 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
925 : &AMDGPU::VGPR_32RegClass);
926
927 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
928 .addImm(Offset);
929 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
930 .addFrameIndex(FrameIdx);
931
932 if (ST.enableFlatScratch() ) {
933 // FIXME: Mark scc as dead
934 // FIXME: Make sure scc isn't live in.
935 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
936 .addReg(OffsetReg, RegState::Kill)
937 .addReg(FIReg);
938 return BaseReg;
939 }
940
941 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
942 .addReg(OffsetReg, RegState::Kill)
943 .addReg(FIReg)
944 .addImm(0); // clamp bit
945
946 return BaseReg;
947}
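// For a non-zero offset the sequence built above is roughly:
// s_mov_b32 <off>, Offset
// s_mov_b32 / v_mov_b32 <fi>, FrameIdx
// s_add_i32 (flat scratch) or v_add_u32 (MUBUF) <base>, <off>, <fi>
// so the returned base register already has the constant offset folded in.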
948
949void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
950 int64_t Offset) const {
951 const SIInstrInfo *TII = ST.getInstrInfo();
952
953 switch (MI.getOpcode()) {
954 case AMDGPU::V_ADD_U32_e32:
955 case AMDGPU::V_ADD_CO_U32_e32: {
956 MachineOperand *FIOp = &MI.getOperand(2);
957 MachineOperand *ImmOp = &MI.getOperand(1);
958 if (!FIOp->isFI())
959 std::swap(FIOp, ImmOp);
960
961 if (!ImmOp->isImm()) {
962 assert(Offset == 0);
963 FIOp->ChangeToRegister(BaseReg, false);
964 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
965 return;
966 }
967
968 int64_t TotalOffset = ImmOp->getImm() + Offset;
969 if (TotalOffset == 0) {
970 MI.setDesc(TII->get(AMDGPU::COPY));
971 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
972 MI.removeOperand(I);
973
974 MI.getOperand(1).ChangeToRegister(BaseReg, false);
975 return;
976 }
977
978 ImmOp->setImm(TotalOffset);
979
980 MachineBasicBlock *MBB = MI.getParent();
981 MachineFunction *MF = MBB->getParent();
982 MachineRegisterInfo &MRI = MF->getRegInfo();
983
984 // FIXME: materializeFrameBaseRegister does not know the register class of
985 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
986 // a copy so we have a legal operand and hope the register coalescer can
987 // clean it up.
988 if (isSGPRReg(MRI, BaseReg)) {
989 Register BaseRegVGPR =
990 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
991 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
992 .addReg(BaseReg);
993 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
994 } else {
995 MI.getOperand(2).ChangeToRegister(BaseReg, false);
996 }
997 return;
998 }
999 case AMDGPU::V_ADD_U32_e64:
1000 case AMDGPU::V_ADD_CO_U32_e64: {
1001 int Src0Idx = MI.getNumExplicitDefs();
1002 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1003 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1004 if (!FIOp->isFI())
1005 std::swap(FIOp, ImmOp);
1006
1007 if (!ImmOp->isImm()) {
1008 FIOp->ChangeToRegister(BaseReg, false);
1009 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1010 return;
1011 }
1012
1013 int64_t TotalOffset = ImmOp->getImm() + Offset;
1014 if (TotalOffset == 0) {
1015 MI.setDesc(TII->get(AMDGPU::COPY));
1016
1017 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1018 MI.removeOperand(I);
1019
1020 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1021 } else {
1022 FIOp->ChangeToRegister(BaseReg, false);
1023 ImmOp->setImm(TotalOffset);
1024 }
1025
1026 return;
1027 }
1028 default:
1029 break;
1030 }
1031
1032 bool IsFlat = TII->isFLATScratch(MI);
1033
1034#ifndef NDEBUG
1035 // FIXME: Is it possible to be storing a frame index to itself?
1036 bool SeenFI = false;
1037 for (const MachineOperand &MO: MI.operands()) {
1038 if (MO.isFI()) {
1039 if (SeenFI)
1040 llvm_unreachable("should not see multiple frame indices");
1041
1042 SeenFI = true;
1043 }
1044 }
1045#endif
1046
1047 MachineOperand *FIOp =
1048 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1049 : AMDGPU::OpName::vaddr);
1050
1051 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1052 int64_t NewOffset = OffsetOp->getImm() + Offset;
1053
1054 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1055 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1056
1057 if (IsFlat) {
1058 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1060 "offset should be legal");
1061 FIOp->ChangeToRegister(BaseReg, false);
1062 OffsetOp->setImm(NewOffset);
1063 return;
1064 }
1065
1066#ifndef NDEBUG
1067 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1068 assert(SOffset->isImm() && SOffset->getImm() == 0);
1069#endif
1070
1071 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1072
1073 FIOp->ChangeToRegister(BaseReg, false);
1074 OffsetOp->setImm(NewOffset);
1075}
1076
1077bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1078 Register BaseReg,
1079 int64_t Offset) const {
1080
1081 switch (MI->getOpcode()) {
1082 case AMDGPU::V_ADD_U32_e32:
1083 case AMDGPU::V_ADD_CO_U32_e32:
1084 return true;
1085 case AMDGPU::V_ADD_U32_e64:
1086 case AMDGPU::V_ADD_CO_U32_e64:
1088 default:
1089 break;
1090 }
1091
1092 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1093 return false;
1094
1095 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1096
1097 const SIInstrInfo *TII = ST.getInstrInfo();
1098 if (SIInstrInfo::isMUBUF(*MI))
1099 return TII->isLegalMUBUFImmOffset(NewOffset);
1100
1101 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1102 SIInstrFlags::FlatScratch);
1103}
1104
1105const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
1106 const MachineFunction &MF, unsigned Kind) const {
1107 // This is inaccurate. It depends on the instruction and address space. The
1108 // only place where we should hit this is for dealing with frame indexes /
1109 // private accesses, so this is correct in that case.
1110 return &AMDGPU::VGPR_32RegClass;
1111}
1112
1113const TargetRegisterClass *
1114SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1115 if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1116 return getEquivalentVGPRClass(RC);
1117 if (RC == &AMDGPU::SCC_CLASSRegClass)
1118 return getWaveMaskRegClass();
1119
1120 return RC;
1121}
1122
1123static unsigned getNumSubRegsForSpillOp(unsigned Op) {
1124
1125 switch (Op) {
1126 case AMDGPU::SI_SPILL_S1024_SAVE:
1127 case AMDGPU::SI_SPILL_S1024_RESTORE:
1128 case AMDGPU::SI_SPILL_V1024_SAVE:
1129 case AMDGPU::SI_SPILL_V1024_RESTORE:
1130 case AMDGPU::SI_SPILL_A1024_SAVE:
1131 case AMDGPU::SI_SPILL_A1024_RESTORE:
1132 case AMDGPU::SI_SPILL_AV1024_SAVE:
1133 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1134 return 32;
1135 case AMDGPU::SI_SPILL_S512_SAVE:
1136 case AMDGPU::SI_SPILL_S512_RESTORE:
1137 case AMDGPU::SI_SPILL_V512_SAVE:
1138 case AMDGPU::SI_SPILL_V512_RESTORE:
1139 case AMDGPU::SI_SPILL_A512_SAVE:
1140 case AMDGPU::SI_SPILL_A512_RESTORE:
1141 case AMDGPU::SI_SPILL_AV512_SAVE:
1142 case AMDGPU::SI_SPILL_AV512_RESTORE:
1143 return 16;
1144 case AMDGPU::SI_SPILL_S384_SAVE:
1145 case AMDGPU::SI_SPILL_S384_RESTORE:
1146 case AMDGPU::SI_SPILL_V384_SAVE:
1147 case AMDGPU::SI_SPILL_V384_RESTORE:
1148 case AMDGPU::SI_SPILL_A384_SAVE:
1149 case AMDGPU::SI_SPILL_A384_RESTORE:
1150 case AMDGPU::SI_SPILL_AV384_SAVE:
1151 case AMDGPU::SI_SPILL_AV384_RESTORE:
1152 return 12;
1153 case AMDGPU::SI_SPILL_S352_SAVE:
1154 case AMDGPU::SI_SPILL_S352_RESTORE:
1155 case AMDGPU::SI_SPILL_V352_SAVE:
1156 case AMDGPU::SI_SPILL_V352_RESTORE:
1157 case AMDGPU::SI_SPILL_A352_SAVE:
1158 case AMDGPU::SI_SPILL_A352_RESTORE:
1159 case AMDGPU::SI_SPILL_AV352_SAVE:
1160 case AMDGPU::SI_SPILL_AV352_RESTORE:
1161 return 11;
1162 case AMDGPU::SI_SPILL_S320_SAVE:
1163 case AMDGPU::SI_SPILL_S320_RESTORE:
1164 case AMDGPU::SI_SPILL_V320_SAVE:
1165 case AMDGPU::SI_SPILL_V320_RESTORE:
1166 case AMDGPU::SI_SPILL_A320_SAVE:
1167 case AMDGPU::SI_SPILL_A320_RESTORE:
1168 case AMDGPU::SI_SPILL_AV320_SAVE:
1169 case AMDGPU::SI_SPILL_AV320_RESTORE:
1170 return 10;
1171 case AMDGPU::SI_SPILL_S288_SAVE:
1172 case AMDGPU::SI_SPILL_S288_RESTORE:
1173 case AMDGPU::SI_SPILL_V288_SAVE:
1174 case AMDGPU::SI_SPILL_V288_RESTORE:
1175 case AMDGPU::SI_SPILL_A288_SAVE:
1176 case AMDGPU::SI_SPILL_A288_RESTORE:
1177 case AMDGPU::SI_SPILL_AV288_SAVE:
1178 case AMDGPU::SI_SPILL_AV288_RESTORE:
1179 return 9;
1180 case AMDGPU::SI_SPILL_S256_SAVE:
1181 case AMDGPU::SI_SPILL_S256_RESTORE:
1182 case AMDGPU::SI_SPILL_V256_SAVE:
1183 case AMDGPU::SI_SPILL_V256_RESTORE:
1184 case AMDGPU::SI_SPILL_A256_SAVE:
1185 case AMDGPU::SI_SPILL_A256_RESTORE:
1186 case AMDGPU::SI_SPILL_AV256_SAVE:
1187 case AMDGPU::SI_SPILL_AV256_RESTORE:
1188 return 8;
1189 case AMDGPU::SI_SPILL_S224_SAVE:
1190 case AMDGPU::SI_SPILL_S224_RESTORE:
1191 case AMDGPU::SI_SPILL_V224_SAVE:
1192 case AMDGPU::SI_SPILL_V224_RESTORE:
1193 case AMDGPU::SI_SPILL_A224_SAVE:
1194 case AMDGPU::SI_SPILL_A224_RESTORE:
1195 case AMDGPU::SI_SPILL_AV224_SAVE:
1196 case AMDGPU::SI_SPILL_AV224_RESTORE:
1197 return 7;
1198 case AMDGPU::SI_SPILL_S192_SAVE:
1199 case AMDGPU::SI_SPILL_S192_RESTORE:
1200 case AMDGPU::SI_SPILL_V192_SAVE:
1201 case AMDGPU::SI_SPILL_V192_RESTORE:
1202 case AMDGPU::SI_SPILL_A192_SAVE:
1203 case AMDGPU::SI_SPILL_A192_RESTORE:
1204 case AMDGPU::SI_SPILL_AV192_SAVE:
1205 case AMDGPU::SI_SPILL_AV192_RESTORE:
1206 return 6;
1207 case AMDGPU::SI_SPILL_S160_SAVE:
1208 case AMDGPU::SI_SPILL_S160_RESTORE:
1209 case AMDGPU::SI_SPILL_V160_SAVE:
1210 case AMDGPU::SI_SPILL_V160_RESTORE:
1211 case AMDGPU::SI_SPILL_A160_SAVE:
1212 case AMDGPU::SI_SPILL_A160_RESTORE:
1213 case AMDGPU::SI_SPILL_AV160_SAVE:
1214 case AMDGPU::SI_SPILL_AV160_RESTORE:
1215 return 5;
1216 case AMDGPU::SI_SPILL_S128_SAVE:
1217 case AMDGPU::SI_SPILL_S128_RESTORE:
1218 case AMDGPU::SI_SPILL_V128_SAVE:
1219 case AMDGPU::SI_SPILL_V128_RESTORE:
1220 case AMDGPU::SI_SPILL_A128_SAVE:
1221 case AMDGPU::SI_SPILL_A128_RESTORE:
1222 case AMDGPU::SI_SPILL_AV128_SAVE:
1223 case AMDGPU::SI_SPILL_AV128_RESTORE:
1224 return 4;
1225 case AMDGPU::SI_SPILL_S96_SAVE:
1226 case AMDGPU::SI_SPILL_S96_RESTORE:
1227 case AMDGPU::SI_SPILL_V96_SAVE:
1228 case AMDGPU::SI_SPILL_V96_RESTORE:
1229 case AMDGPU::SI_SPILL_A96_SAVE:
1230 case AMDGPU::SI_SPILL_A96_RESTORE:
1231 case AMDGPU::SI_SPILL_AV96_SAVE:
1232 case AMDGPU::SI_SPILL_AV96_RESTORE:
1233 return 3;
1234 case AMDGPU::SI_SPILL_S64_SAVE:
1235 case AMDGPU::SI_SPILL_S64_RESTORE:
1236 case AMDGPU::SI_SPILL_V64_SAVE:
1237 case AMDGPU::SI_SPILL_V64_RESTORE:
1238 case AMDGPU::SI_SPILL_A64_SAVE:
1239 case AMDGPU::SI_SPILL_A64_RESTORE:
1240 case AMDGPU::SI_SPILL_AV64_SAVE:
1241 case AMDGPU::SI_SPILL_AV64_RESTORE:
1242 return 2;
1243 case AMDGPU::SI_SPILL_S32_SAVE:
1244 case AMDGPU::SI_SPILL_S32_RESTORE:
1245 case AMDGPU::SI_SPILL_V32_SAVE:
1246 case AMDGPU::SI_SPILL_V32_RESTORE:
1247 case AMDGPU::SI_SPILL_A32_SAVE:
1248 case AMDGPU::SI_SPILL_A32_RESTORE:
1249 case AMDGPU::SI_SPILL_AV32_SAVE:
1250 case AMDGPU::SI_SPILL_AV32_RESTORE:
1251 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1252 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1253 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1254 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1255 return 1;
1256 default: llvm_unreachable("Invalid spill opcode");
1257 }
1258}
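// The returned count is simply the spilled register's size in 32-bit DWORDs;
// for example, SI_SPILL_S256_SAVE spills an 8 x 32-bit SGPR tuple and returns 8.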
1259
1260static int getOffsetMUBUFStore(unsigned Opc) {
1261 switch (Opc) {
1262 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1263 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1264 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1265 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1266 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1267 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1268 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1269 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1270 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1271 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1272 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1273 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1274 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1275 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1276 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1277 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1278 default:
1279 return -1;
1280 }
1281}
1282
1283static int getOffsetMUBUFLoad(unsigned Opc) {
1284 switch (Opc) {
1285 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1286 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1287 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1288 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1289 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1290 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1291 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1292 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1293 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1294 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1295 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1296 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1297 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1298 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1299 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1300 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1301 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1302 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1303 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1304 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1305 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1306 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1307 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1308 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1309 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1310 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1311 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1312 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1313 default:
1314 return -1;
1315 }
1316}
1317
1318static int getOffenMUBUFStore(unsigned Opc) {
1319 switch (Opc) {
1320 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1321 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1322 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1323 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1324 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1325 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1326 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1327 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1328 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1329 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1330 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1331 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1332 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1333 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1334 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1335 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1336 default:
1337 return -1;
1338 }
1339}
1340
1341static int getOffenMUBUFLoad(unsigned Opc) {
1342 switch (Opc) {
1343 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1344 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1345 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1346 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1347 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1348 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1349 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1350 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1351 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1352 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1353 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1354 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1355 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1356 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1357 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1358 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1359 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1360 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1361 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1362 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1363 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1364 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1365 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1366 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1367 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1368 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1369 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1370 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1371 default:
1372 return -1;
1373 }
1374}
1375
1376static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1377 MachineBasicBlock &MBB,
1378 MachineBasicBlock::iterator MI,
1379 int Index, unsigned Lane,
1380 unsigned ValueReg, bool IsKill) {
1381 MachineFunction *MF = MBB.getParent();
1382 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1383 const SIInstrInfo *TII = ST.getInstrInfo();
1384
1385 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1386
1387 if (Reg == AMDGPU::NoRegister)
1388 return MachineInstrBuilder();
1389
1390 bool IsStore = MI->mayStore();
1391 MachineRegisterInfo &MRI = MF->getRegInfo();
1392 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1393
1394 unsigned Dst = IsStore ? Reg : ValueReg;
1395 unsigned Src = IsStore ? ValueReg : Reg;
1396 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1397 DebugLoc DL = MI->getDebugLoc();
1398 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1399 // Spiller during regalloc may restore a spilled register to its superclass.
1400 // It could result in AGPR spills restored to VGPRs or the other way around,
1401 // making the src and dst with identical regclasses at this point. It just
1402 // needs a copy in such cases.
1403 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1404 .addReg(Src, getKillRegState(IsKill));
1406 return CopyMIB;
1407 }
1408 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1409 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1410
1411 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1412 .addReg(Src, getKillRegState(IsKill));
1414 return MIB;
1415}
1416
1417// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1418// need to handle the case where an SGPR may need to be spilled while spilling.
1419static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1420 MachineFrameInfo &MFI,
1421 MachineBasicBlock::iterator MI,
1422 int Index,
1423 int64_t Offset) {
1424 const SIInstrInfo *TII = ST.getInstrInfo();
1425 MachineBasicBlock *MBB = MI->getParent();
1426 const DebugLoc &DL = MI->getDebugLoc();
1427 bool IsStore = MI->mayStore();
1428
1429 unsigned Opc = MI->getOpcode();
1430 int LoadStoreOp = IsStore ?
1431 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1432 if (LoadStoreOp == -1)
1433 return false;
1434
1435 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1436 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1437 return true;
1438
1439 MachineInstrBuilder NewMI =
1440 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1441 .add(*Reg)
1442 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1443 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1444 .addImm(Offset)
1445 .addImm(0) // cpol
1446 .addImm(0) // swz
1447 .cloneMemRefs(*MI);
1448
1449 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1450 AMDGPU::OpName::vdata_in);
1451 if (VDataIn)
1452 NewMI.add(*VDataIn);
1453 return true;
1454}
1455
1456static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1457 unsigned LoadStoreOp,
1458 unsigned EltSize) {
1459 bool IsStore = TII->get(LoadStoreOp).mayStore();
1460 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1461 bool UseST =
1462 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1463
1464 switch (EltSize) {
1465 case 4:
1466 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1467 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1468 break;
1469 case 8:
1470 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1471 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1472 break;
1473 case 12:
1474 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1475 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1476 break;
1477 case 16:
1478 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1479 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1480 break;
1481 default:
1482 llvm_unreachable("Unexpected spill load/store size!");
1483 }
1484
1485 if (HasVAddr)
1486 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1487 else if (UseST)
1488 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1489
1490 return LoadStoreOp;
1491}
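// For example, with EltSize == 16 this picks SCRATCH_STORE/LOAD_DWORDX4_SADDR,
// which is then rewritten to the SV form when a vaddr operand is present, or to
// the ST form when neither vaddr nor saddr is used.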
1492
1493void SIRegisterInfo::buildSpillLoadStore(
1494 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1495 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1496 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1497 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1498 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1499
1500 MachineFunction *MF = MBB.getParent();
1501 const SIInstrInfo *TII = ST.getInstrInfo();
1502 const MachineFrameInfo &MFI = MF->getFrameInfo();
1503 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1504
1505 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1506 bool IsStore = Desc->mayStore();
1507 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1508
1509 bool CanClobberSCC = false;
1510 bool Scavenged = false;
1511 MCRegister SOffset = ScratchOffsetReg;
1512
1513 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1514 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1515 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1516 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1517
1518 // Always use 4 byte operations for AGPRs because we need to scavenge
1519 // a temporary VGPR.
1520 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
1521 unsigned NumSubRegs = RegWidth / EltSize;
1522 unsigned Size = NumSubRegs * EltSize;
1523 unsigned RemSize = RegWidth - Size;
1524 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1525 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1526 int64_t MaterializedOffset = Offset;
1527
1528 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1529 int64_t ScratchOffsetRegDelta = 0;
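// For example, a 192-bit VGPR tuple spilled via flat scratch has RegWidth = 24,
// EltSize = 16, NumSubRegs = 1, Size = 16 and RemSize = 8, i.e. one DWORDX4
// access plus one DWORDX2 access for the remainder; with MUBUF (EltSize = 4)
// the same spill takes six DWORD accesses.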
1530
1531 if (IsFlat && EltSize > 4) {
1532 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1533 Desc = &TII->get(LoadStoreOp);
1534 }
1535
1536 Align Alignment = MFI.getObjectAlign(Index);
1537 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1538
1539 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1540 "unexpected VGPR spill offset");
1541
1542 // Track a VGPR to use for a constant offset we need to materialize.
1543 Register TmpOffsetVGPR;
1544
1545 // Track a VGPR to use as an intermediate value.
1546 Register TmpIntermediateVGPR;
1547 bool UseVGPROffset = false;
1548
1549 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1550 // combination.
1551 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1552 int64_t VOffset) {
1553 // We are using a VGPR offset
1554 if (IsFlat && SGPRBase) {
1555 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1556 // SGPR, so perform the add as vector.
1557 // We don't need a base SGPR in the kernel.
1558
1559 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1560 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1561 .addReg(SGPRBase)
1562 .addImm(VOffset)
1563 .addImm(0); // clamp
1564 } else {
1565 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1566 .addReg(SGPRBase);
1567 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1568 .addImm(VOffset)
1569 .addReg(TmpOffsetVGPR);
1570 }
1571 } else {
1572 assert(TmpOffsetVGPR);
1573 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1574 .addImm(VOffset);
1575 }
1576 };
1577
1578 bool IsOffsetLegal =
1579 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1580 SIInstrFlags::FlatScratch)
1581 : TII->isLegalMUBUFImmOffset(MaxOffset);
1582 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1583 SOffset = MCRegister();
1584
1585 // We don't have access to the register scavenger if this function is called
1586 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1587 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1588 // entry.
1589 if (RS) {
1590 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1591
1592 // Piggy back on the liveness scan we just did see if SCC is dead.
1593 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1594 } else if (LiveUnits) {
1595 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1596 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1597 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1598 SOffset = Reg;
1599 break;
1600 }
1601 }
1602 }
1603
1604 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1605 SOffset = Register();
1606
1607 if (!SOffset) {
1608 UseVGPROffset = true;
1609
1610 if (RS) {
1611 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1612 } else {
1613 assert(LiveUnits);
1614 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1615 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1616 TmpOffsetVGPR = Reg;
1617 break;
1618 }
1619 }
1620 }
1621
1622 assert(TmpOffsetVGPR);
1623 } else if (!SOffset && CanClobberSCC) {
1624 // There are no free SGPRs, and we are in the process of spilling
1625 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1626 // on SI/CI, and on VI it is true until we implement spilling using scalar
1627 // stores), we have no way to free up an SGPR. Our solution here is to
1628 // add the offset directly to the ScratchOffset or StackPtrOffset
1629 // register, and then subtract the offset after the spill to return the
1630 // register to its original value.
1631
1632 // TODO: If we don't have to do an emergency stack slot spill, converting
1633 // to use the VGPR offset is fewer instructions.
1634 if (!ScratchOffsetReg)
1635 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1636 SOffset = ScratchOffsetReg;
1637 ScratchOffsetRegDelta = Offset;
1638 } else {
1639 Scavenged = true;
1640 }
1641
1642 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1643 // we can simplify the adjustment of Offset here to just scale with
1644 // WavefrontSize.
1645 if (!IsFlat && !UseVGPROffset)
1646 Offset *= ST.getWavefrontSize();
1647
1648 if (!UseVGPROffset && !SOffset)
1649 report_fatal_error("could not scavenge SGPR to spill in entry function");
1650
1651 if (UseVGPROffset) {
1652 // We are using a VGPR offset
1653 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1654 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1655 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1656 } else {
1657 assert(Offset != 0);
1658 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1659 .addReg(ScratchOffsetReg)
1660 .addImm(Offset);
1661 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1662 }
1663
1664 Offset = 0;
1665 }
1666
1667 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1668 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1669 && "Unexpected vaddr for flat scratch with a FI operand");
1670
1671 if (UseVGPROffset) {
1672 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1673 } else {
1674 assert(ST.hasFlatScratchSTMode());
1675 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1676 }
1677
1678 Desc = &TII->get(LoadStoreOp);
1679 }
1680
1681 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1682 ++i, RegOffset += EltSize) {
1683 if (i == NumSubRegs) {
1684 EltSize = RemSize;
1685 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1686 }
1687 Desc = &TII->get(LoadStoreOp);
1688
1689 if (!IsFlat && UseVGPROffset) {
1690 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1691 : getOffenMUBUFLoad(LoadStoreOp);
1692 Desc = &TII->get(NewLoadStoreOp);
1693 }
1694
1695 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1696 // If we are spilling an AGPR beyond the range of the memory instruction
1697 // offset and need to use a VGPR offset, we ideally have at least 2
1698 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1699 // recycle the VGPR used for the offset which requires resetting after
1700 // each subregister.
1701
1702 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1703 }
1704
1705 unsigned NumRegs = EltSize / 4;
1706 Register SubReg = e == 1
1707 ? ValueReg
1708 : Register(getSubReg(ValueReg,
1709 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1710
1711 unsigned SOffsetRegState = 0;
1712 unsigned SrcDstRegState = getDefRegState(!IsStore);
1713 const bool IsLastSubReg = i + 1 == e;
1714 const bool IsFirstSubReg = i == 0;
1715 if (IsLastSubReg) {
1716 SOffsetRegState |= getKillRegState(Scavenged);
1717 // The last implicit use carries the "Kill" flag.
1718 SrcDstRegState |= getKillRegState(IsKill);
1719 }
1720
1721 // Make sure the whole register is defined if there are undef components by
1722 // adding an implicit def of the super-reg on the first instruction.
1723 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1724 bool NeedSuperRegImpOperand = e > 1;
1725
1726 // Remaining element size to spill into memory after some parts of it
1727 // have been spilled into either AGPRs or VGPRs.
1728 unsigned RemEltSize = EltSize;
1729
1730 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1731 // order, starting from the last lane. If a register cannot be completely
1732 // spilled into another register, this ensures its alignment does not
1733 // change. For targets with a VGPR alignment requirement this is important
1734 // when flat scratch is used, as we might otherwise get a scratch_load or
1735 // scratch_store of an unaligned register.
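// Illustrative case: with a single free AGPR and a 64-bit VGPR pair being
// spilled, the high half is placed in the AGPR and only the low half goes to
// memory, so the memory access still starts at an even-aligned register.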
1736 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1737 LaneE = RegOffset / 4;
1738 Lane >= LaneE; --Lane) {
1739 bool IsSubReg = e > 1 || EltSize > 4;
1740 Register Sub = IsSubReg
1741 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1742 : ValueReg;
1743 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1744 if (!MIB.getInstr())
1745 break;
1746 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1747 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1748 NeedSuperRegDef = false;
1749 }
1750 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1751 NeedSuperRegImpOperand = true;
1752 unsigned State = SrcDstRegState;
1753 if (!IsLastSubReg || (Lane != LaneE))
1754 State &= ~RegState::Kill;
1755 if (!IsFirstSubReg || (Lane != LaneS))
1756 State &= ~RegState::Define;
1757 MIB.addReg(ValueReg, RegState::Implicit | State);
1758 }
1759 RemEltSize -= 4;
1760 }
1761
1762 if (!RemEltSize) // Fully spilled into AGPRs.
1763 continue;
1764
1765 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1766 assert(IsFlat && EltSize > 4);
1767
1768 unsigned NumRegs = RemEltSize / 4;
1769 SubReg = Register(getSubReg(ValueReg,
1770 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1771 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1772 Desc = &TII->get(Opc);
1773 }
1774
1775 unsigned FinalReg = SubReg;
1776
1777 if (IsAGPR) {
1778 assert(EltSize == 4);
1779
1780 if (!TmpIntermediateVGPR) {
1781 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1782 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1783 }
1784 if (IsStore) {
1785 auto AccRead = BuildMI(MBB, MI, DL,
1786 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1787 TmpIntermediateVGPR)
1788 .addReg(SubReg, getKillRegState(IsKill));
1789 if (NeedSuperRegDef)
1790 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1792 }
1793 SubReg = TmpIntermediateVGPR;
1794 } else if (UseVGPROffset) {
1795 if (!TmpOffsetVGPR) {
1796 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1797 MI, false, 0);
1798 RS->setRegUsed(TmpOffsetVGPR);
1799 }
1800 }
1801
1802 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1803 MachineMemOperand *NewMMO =
1804 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1805 commonAlignment(Alignment, RegOffset));
1806
1807 auto MIB =
1808 BuildMI(MBB, MI, DL, *Desc)
1809 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1810
1811 if (UseVGPROffset) {
1812 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1813 // intermediate accvgpr_write.
1814 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1815 }
1816
1817 if (!IsFlat)
1818 MIB.addReg(FuncInfo->getScratchRSrcReg());
1819
1820 if (SOffset == AMDGPU::NoRegister) {
1821 if (!IsFlat) {
1822 if (UseVGPROffset && ScratchOffsetReg) {
1823 MIB.addReg(ScratchOffsetReg);
1824 } else {
1825 assert(FuncInfo->isBottomOfStack());
1826 MIB.addImm(0);
1827 }
1828 }
1829 } else {
1830 MIB.addReg(SOffset, SOffsetRegState);
1831 }
1832
1833 MIB.addImm(Offset + RegOffset);
1834
1835 bool LastUse = MMO->getFlags() & MOLastUse;
1836 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1837
1838 if (!IsFlat)
1839 MIB.addImm(0); // swz
1840 MIB.addMemOperand(NewMMO);
1841
1842 if (!IsAGPR && NeedSuperRegDef)
1843 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1844
1845 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1846 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1847 FinalReg)
1848 .addReg(TmpIntermediateVGPR, RegState::Kill);
1850 }
1851
1852 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1853 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1854
1855 // The epilog restore of a wwm-scratch register can cause undesired
1856 // optimization during machine-cp after PrologEpilogInserter if the same
1857 // register was assigned for return-value ABI lowering with a COPY
1858 // instruction. As shown below, with the epilog reload the earlier COPY
1859 // appears dead to machine-cp.
1860 // ...
1861 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1862 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1863 // ...
1864 // Epilog block:
1865 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1866 // ...
1867 // WWM spill restore to preserve the inactive lanes of v0.
1868 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1869 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1870 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1871 // ...
1872 // SI_RETURN implicit $vgpr0
1873 // ...
1874 // To fix this, mark the same register as a tied operand on such restore
1875 // instructions so that the preceding COPY is seen as used.
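// With the tied implicit use added below, the reload reads roughly as
// (illustrative MIR):
//   $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0,
//            implicit $vgpr0(tied-def 0)
// which keeps the preceding COPY alive.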
1876 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1877 MI->readsRegister(SubReg, this)) {
1878 MIB.addReg(SubReg, RegState::Implicit);
1879 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1880 }
1881 }
1882
1883 if (ScratchOffsetRegDelta != 0) {
1884 // Subtract the offset we added to the ScratchOffset register.
1885 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1886 .addReg(SOffset)
1887 .addImm(-ScratchOffsetRegDelta);
1888 }
1889}
1890
1891 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1892 int Offset, bool IsLoad,
1893 bool IsKill) const {
1894 // Load/store VGPR
1895 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1897
1898 Register FrameReg =
1899 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1900 ? getBaseRegister()
1901 : getFrameRegister(SB.MF);
1902
1903 Align Alignment = FrameInfo.getObjectAlign(Index);
1907 SB.EltSize, Alignment);
1908
1909 if (IsLoad) {
1910 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1911 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1912 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1913 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1914 } else {
1915 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1916 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1917 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1918 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1919 // This only ever adds one VGPR spill
1920 SB.MFI.addToSpilledVGPRs(1);
1921 }
1922}
1923
1924 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1925 RegScavenger *RS, SlotIndexes *Indexes,
1926 LiveIntervals *LIS, bool OnlyToVGPR,
1927 bool SpillToPhysVGPRLane) const {
1928 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1929
1930 ArrayRef<SpilledReg> VGPRSpills =
1931 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1932 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1933 bool SpillToVGPR = !VGPRSpills.empty();
1934 if (OnlyToVGPR && !SpillToVGPR)
1935 return false;
1936
1937 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1938 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1939
1940 if (SpillToVGPR) {
1941
1942 assert(SB.NumSubRegs == VGPRSpills.size() &&
1943 "Num of VGPR lanes should be equal to num of SGPRs spilled");
1944
1945 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1946 Register SubReg =
1947 SB.NumSubRegs == 1
1948 ? SB.SuperReg
1949 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1950 SpilledReg Spill = VGPRSpills[i];
1951
1952 bool IsFirstSubreg = i == 0;
1953 bool IsLastSubreg = i == SB.NumSubRegs - 1;
1954 bool UseKill = SB.IsKill && IsLastSubreg;
1955
1956
1957 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1958 // spill to this specific vgpr in the first basic block.
1959 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1960 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1961 .addReg(SubReg, getKillRegState(UseKill))
1962 .addImm(Spill.Lane)
1963 .addReg(Spill.VGPR);
1964 if (Indexes) {
1965 if (IsFirstSubreg)
1966 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1967 else
1968 Indexes->insertMachineInstrInMaps(*MIB);
1969 }
1970
1971 if (IsFirstSubreg && SB.NumSubRegs > 1) {
1972 // We may be spilling a super-register which is only partially defined,
1973 // and need to ensure later spills think the value is defined.
1974 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1975 }
1976
1977 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
1978 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1979
1980 // FIXME: Since this spills to another register instead of an actual
1981 // frame index, we should delete the frame index when all references to
1982 // it are fixed.
1983 }
1984 } else {
1985 SB.prepare();
1986
1987 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
1988 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
1989
1990 // Per VGPR helper data
1991 auto PVD = SB.getPerVGPRData();
1992
1993 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1994 unsigned TmpVGPRFlags = RegState::Undef;
1995
1996 // Write sub registers into the VGPR
1997 for (unsigned i = Offset * PVD.PerVGPR,
1998 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1999 i < e; ++i) {
2000 Register SubReg =
2001 SB.NumSubRegs == 1
2002 ? SB.SuperReg
2003 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2004
2005 MachineInstrBuilder WriteLane =
2006 BuildMI(*SB.MBB, MI, SB.DL,
2007 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2008 .addReg(SubReg, SubKillState)
2009 .addImm(i % PVD.PerVGPR)
2010 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2011 TmpVGPRFlags = 0;
2012
2013 if (Indexes) {
2014 if (i == 0)
2015 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2016 else
2017 Indexes->insertMachineInstrInMaps(*WriteLane);
2018 }
2019
2020 // There could be undef components of a spilled super register.
2021 // TODO: Can we detect this and skip the spill?
2022 if (SB.NumSubRegs > 1) {
2023 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2024 unsigned SuperKillState = 0;
2025 if (i + 1 == SB.NumSubRegs)
2026 SuperKillState |= getKillRegState(SB.IsKill);
2027 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2028 }
2029 }
2030
2031 // Write out VGPR
2032 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2033 }
2034
2035 SB.restore();
2036 }
2037
2038 MI->eraseFromParent();
2039 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2040
2041 if (LIS)
2042 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2043
2044 return true;
2045}
2046
2047 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2048 RegScavenger *RS, SlotIndexes *Indexes,
2049 LiveIntervals *LIS, bool OnlyToVGPR,
2050 bool SpillToPhysVGPRLane) const {
2051 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2052
2053 ArrayRef<SpilledReg> VGPRSpills =
2054 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2055 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2056 bool SpillToVGPR = !VGPRSpills.empty();
2057 if (OnlyToVGPR && !SpillToVGPR)
2058 return false;
2059
2060 if (SpillToVGPR) {
2061 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2062 Register SubReg =
2063 SB.NumSubRegs == 1
2064 ? SB.SuperReg
2065 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2066
2067 SpilledReg Spill = VGPRSpills[i];
2068 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2069 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2070 .addReg(Spill.VGPR)
2071 .addImm(Spill.Lane);
2072 if (SB.NumSubRegs > 1 && i == 0)
2073 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2074 if (Indexes) {
2075 if (i == e - 1)
2076 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2077 else
2078 Indexes->insertMachineInstrInMaps(*MIB);
2079 }
2080 }
2081 } else {
2082 SB.prepare();
2083
2084 // Per VGPR helper data
2085 auto PVD = SB.getPerVGPRData();
2086
2087 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2088 // Load in VGPR data
2089 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2090
2091 // Unpack lanes
2092 for (unsigned i = Offset * PVD.PerVGPR,
2093 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2094 i < e; ++i) {
2095 Register SubReg =
2096 SB.NumSubRegs == 1
2097 ? SB.SuperReg
2098 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2099
2100 bool LastSubReg = (i + 1 == e);
2101 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2102 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2103 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2104 .addImm(i);
2105 if (SB.NumSubRegs > 1 && i == 0)
2106 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2107 if (Indexes) {
2108 if (i == e - 1)
2109 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2110 else
2111 Indexes->insertMachineInstrInMaps(*MIB);
2112 }
2113 }
2114 }
2115
2116 SB.restore();
2117 }
2118
2119 MI->eraseFromParent();
2120
2121 if (LIS)
2122 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2123
2124 return true;
2125}
2126
2128 MachineBasicBlock &RestoreMBB,
2129 Register SGPR, RegScavenger *RS) const {
2130 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2131 RS);
2132 SB.prepare();
2133 // Generate the spill of SGPR to SB.TmpVGPR.
2134 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2135 auto PVD = SB.getPerVGPRData();
2136 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2137 unsigned TmpVGPRFlags = RegState::Undef;
2138 // Write sub registers into the VGPR
2139 for (unsigned i = Offset * PVD.PerVGPR,
2140 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2141 i < e; ++i) {
2142 Register SubReg =
2143 SB.NumSubRegs == 1
2144 ? SB.SuperReg
2145 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2146
2147 MachineInstrBuilder WriteLane =
2148 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2149 SB.TmpVGPR)
2150 .addReg(SubReg, SubKillState)
2151 .addImm(i % PVD.PerVGPR)
2152 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2153 TmpVGPRFlags = 0;
2154 // There could be undef components of a spilled super register.
2155 // TODO: Can we detect this and skip the spill?
2156 if (SB.NumSubRegs > 1) {
2157 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2158 unsigned SuperKillState = 0;
2159 if (i + 1 == SB.NumSubRegs)
2160 SuperKillState |= getKillRegState(SB.IsKill);
2161 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2162 }
2163 }
2164 // Don't need to write VGPR out.
2165 }
2166
2167 // Restore clobbered registers in the specified restore block.
2168 MI = RestoreMBB.end();
2169 SB.setMI(&RestoreMBB, MI);
2170 // Generate the restore of SGPR from SB.TmpVGPR.
2171 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2172 // Don't need to load VGPR in.
2173 // Unpack lanes
2174 for (unsigned i = Offset * PVD.PerVGPR,
2175 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2176 i < e; ++i) {
2177 Register SubReg =
2178 SB.NumSubRegs == 1
2179 ? SB.SuperReg
2180 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2181 bool LastSubReg = (i + 1 == e);
2182 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2183 SubReg)
2184 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2185 .addImm(i);
2186 if (SB.NumSubRegs > 1 && i == 0)
2187 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2188 }
2189 }
2190 SB.restore();
2191
2193 return false;
2194}
2195
2196/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2197/// a VGPR and the stack slot can be safely eliminated when all other users are
2198/// handled.
2199 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2200 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2201 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2202 switch (MI->getOpcode()) {
2203 case AMDGPU::SI_SPILL_S1024_SAVE:
2204 case AMDGPU::SI_SPILL_S512_SAVE:
2205 case AMDGPU::SI_SPILL_S384_SAVE:
2206 case AMDGPU::SI_SPILL_S352_SAVE:
2207 case AMDGPU::SI_SPILL_S320_SAVE:
2208 case AMDGPU::SI_SPILL_S288_SAVE:
2209 case AMDGPU::SI_SPILL_S256_SAVE:
2210 case AMDGPU::SI_SPILL_S224_SAVE:
2211 case AMDGPU::SI_SPILL_S192_SAVE:
2212 case AMDGPU::SI_SPILL_S160_SAVE:
2213 case AMDGPU::SI_SPILL_S128_SAVE:
2214 case AMDGPU::SI_SPILL_S96_SAVE:
2215 case AMDGPU::SI_SPILL_S64_SAVE:
2216 case AMDGPU::SI_SPILL_S32_SAVE:
2217 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2218 case AMDGPU::SI_SPILL_S1024_RESTORE:
2219 case AMDGPU::SI_SPILL_S512_RESTORE:
2220 case AMDGPU::SI_SPILL_S384_RESTORE:
2221 case AMDGPU::SI_SPILL_S352_RESTORE:
2222 case AMDGPU::SI_SPILL_S320_RESTORE:
2223 case AMDGPU::SI_SPILL_S288_RESTORE:
2224 case AMDGPU::SI_SPILL_S256_RESTORE:
2225 case AMDGPU::SI_SPILL_S224_RESTORE:
2226 case AMDGPU::SI_SPILL_S192_RESTORE:
2227 case AMDGPU::SI_SPILL_S160_RESTORE:
2228 case AMDGPU::SI_SPILL_S128_RESTORE:
2229 case AMDGPU::SI_SPILL_S96_RESTORE:
2230 case AMDGPU::SI_SPILL_S64_RESTORE:
2231 case AMDGPU::SI_SPILL_S32_RESTORE:
2232 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2233 default:
2234 llvm_unreachable("not an SGPR spill instruction");
2235 }
2236}
2237
2239 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2240 int SPAdj, unsigned FIOperandNum,
2240 RegScavenger *RS) const {
2241 MachineFunction *MF = MI->getParent()->getParent();
2242 MachineBasicBlock *MBB = MI->getParent();
2243 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2244 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2245 const SIInstrInfo *TII = ST.getInstrInfo();
2246 DebugLoc DL = MI->getDebugLoc();
2247
2248 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2249
2251 "unreserved scratch RSRC register");
2252
2253 MachineOperand &FIOp = MI->getOperand(FIOperandNum);
2254 int Index = MI->getOperand(FIOperandNum).getIndex();
2255
2256 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2257 ? getBaseRegister()
2258 : getFrameRegister(*MF);
2259
2260 switch (MI->getOpcode()) {
2261 // SGPR register spill
2262 case AMDGPU::SI_SPILL_S1024_SAVE:
2263 case AMDGPU::SI_SPILL_S512_SAVE:
2264 case AMDGPU::SI_SPILL_S384_SAVE:
2265 case AMDGPU::SI_SPILL_S352_SAVE:
2266 case AMDGPU::SI_SPILL_S320_SAVE:
2267 case AMDGPU::SI_SPILL_S288_SAVE:
2268 case AMDGPU::SI_SPILL_S256_SAVE:
2269 case AMDGPU::SI_SPILL_S224_SAVE:
2270 case AMDGPU::SI_SPILL_S192_SAVE:
2271 case AMDGPU::SI_SPILL_S160_SAVE:
2272 case AMDGPU::SI_SPILL_S128_SAVE:
2273 case AMDGPU::SI_SPILL_S96_SAVE:
2274 case AMDGPU::SI_SPILL_S64_SAVE:
2275 case AMDGPU::SI_SPILL_S32_SAVE: {
2276 return spillSGPR(MI, Index, RS);
2277 }
2278
2279 // SGPR register restore
2280 case AMDGPU::SI_SPILL_S1024_RESTORE:
2281 case AMDGPU::SI_SPILL_S512_RESTORE:
2282 case AMDGPU::SI_SPILL_S384_RESTORE:
2283 case AMDGPU::SI_SPILL_S352_RESTORE:
2284 case AMDGPU::SI_SPILL_S320_RESTORE:
2285 case AMDGPU::SI_SPILL_S288_RESTORE:
2286 case AMDGPU::SI_SPILL_S256_RESTORE:
2287 case AMDGPU::SI_SPILL_S224_RESTORE:
2288 case AMDGPU::SI_SPILL_S192_RESTORE:
2289 case AMDGPU::SI_SPILL_S160_RESTORE:
2290 case AMDGPU::SI_SPILL_S128_RESTORE:
2291 case AMDGPU::SI_SPILL_S96_RESTORE:
2292 case AMDGPU::SI_SPILL_S64_RESTORE:
2293 case AMDGPU::SI_SPILL_S32_RESTORE: {
2294 return restoreSGPR(MI, Index, RS);
2295 }
2296
2297 // VGPR register spill
2298 case AMDGPU::SI_SPILL_V1024_SAVE:
2299 case AMDGPU::SI_SPILL_V512_SAVE:
2300 case AMDGPU::SI_SPILL_V384_SAVE:
2301 case AMDGPU::SI_SPILL_V352_SAVE:
2302 case AMDGPU::SI_SPILL_V320_SAVE:
2303 case AMDGPU::SI_SPILL_V288_SAVE:
2304 case AMDGPU::SI_SPILL_V256_SAVE:
2305 case AMDGPU::SI_SPILL_V224_SAVE:
2306 case AMDGPU::SI_SPILL_V192_SAVE:
2307 case AMDGPU::SI_SPILL_V160_SAVE:
2308 case AMDGPU::SI_SPILL_V128_SAVE:
2309 case AMDGPU::SI_SPILL_V96_SAVE:
2310 case AMDGPU::SI_SPILL_V64_SAVE:
2311 case AMDGPU::SI_SPILL_V32_SAVE:
2312 case AMDGPU::SI_SPILL_A1024_SAVE:
2313 case AMDGPU::SI_SPILL_A512_SAVE:
2314 case AMDGPU::SI_SPILL_A384_SAVE:
2315 case AMDGPU::SI_SPILL_A352_SAVE:
2316 case AMDGPU::SI_SPILL_A320_SAVE:
2317 case AMDGPU::SI_SPILL_A288_SAVE:
2318 case AMDGPU::SI_SPILL_A256_SAVE:
2319 case AMDGPU::SI_SPILL_A224_SAVE:
2320 case AMDGPU::SI_SPILL_A192_SAVE:
2321 case AMDGPU::SI_SPILL_A160_SAVE:
2322 case AMDGPU::SI_SPILL_A128_SAVE:
2323 case AMDGPU::SI_SPILL_A96_SAVE:
2324 case AMDGPU::SI_SPILL_A64_SAVE:
2325 case AMDGPU::SI_SPILL_A32_SAVE:
2326 case AMDGPU::SI_SPILL_AV1024_SAVE:
2327 case AMDGPU::SI_SPILL_AV512_SAVE:
2328 case AMDGPU::SI_SPILL_AV384_SAVE:
2329 case AMDGPU::SI_SPILL_AV352_SAVE:
2330 case AMDGPU::SI_SPILL_AV320_SAVE:
2331 case AMDGPU::SI_SPILL_AV288_SAVE:
2332 case AMDGPU::SI_SPILL_AV256_SAVE:
2333 case AMDGPU::SI_SPILL_AV224_SAVE:
2334 case AMDGPU::SI_SPILL_AV192_SAVE:
2335 case AMDGPU::SI_SPILL_AV160_SAVE:
2336 case AMDGPU::SI_SPILL_AV128_SAVE:
2337 case AMDGPU::SI_SPILL_AV96_SAVE:
2338 case AMDGPU::SI_SPILL_AV64_SAVE:
2339 case AMDGPU::SI_SPILL_AV32_SAVE:
2340 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2341 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2342 const MachineOperand *VData = TII->getNamedOperand(*MI,
2343 AMDGPU::OpName::vdata);
2344 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2345 MFI->getStackPtrOffsetReg());
2346
2347 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2348 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2349 auto *MBB = MI->getParent();
2350 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2351 if (IsWWMRegSpill) {
2352 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2353 RS->isRegUsed(AMDGPU::SCC));
2354 }
2355 buildSpillLoadStore(
2356 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2357 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2358 *MI->memoperands_begin(), RS);
2359 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2360 if (IsWWMRegSpill)
2361 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2362
2363 MI->eraseFromParent();
2364 return true;
2365 }
2366 case AMDGPU::SI_SPILL_V32_RESTORE:
2367 case AMDGPU::SI_SPILL_V64_RESTORE:
2368 case AMDGPU::SI_SPILL_V96_RESTORE:
2369 case AMDGPU::SI_SPILL_V128_RESTORE:
2370 case AMDGPU::SI_SPILL_V160_RESTORE:
2371 case AMDGPU::SI_SPILL_V192_RESTORE:
2372 case AMDGPU::SI_SPILL_V224_RESTORE:
2373 case AMDGPU::SI_SPILL_V256_RESTORE:
2374 case AMDGPU::SI_SPILL_V288_RESTORE:
2375 case AMDGPU::SI_SPILL_V320_RESTORE:
2376 case AMDGPU::SI_SPILL_V352_RESTORE:
2377 case AMDGPU::SI_SPILL_V384_RESTORE:
2378 case AMDGPU::SI_SPILL_V512_RESTORE:
2379 case AMDGPU::SI_SPILL_V1024_RESTORE:
2380 case AMDGPU::SI_SPILL_A32_RESTORE:
2381 case AMDGPU::SI_SPILL_A64_RESTORE:
2382 case AMDGPU::SI_SPILL_A96_RESTORE:
2383 case AMDGPU::SI_SPILL_A128_RESTORE:
2384 case AMDGPU::SI_SPILL_A160_RESTORE:
2385 case AMDGPU::SI_SPILL_A192_RESTORE:
2386 case AMDGPU::SI_SPILL_A224_RESTORE:
2387 case AMDGPU::SI_SPILL_A256_RESTORE:
2388 case AMDGPU::SI_SPILL_A288_RESTORE:
2389 case AMDGPU::SI_SPILL_A320_RESTORE:
2390 case AMDGPU::SI_SPILL_A352_RESTORE:
2391 case AMDGPU::SI_SPILL_A384_RESTORE:
2392 case AMDGPU::SI_SPILL_A512_RESTORE:
2393 case AMDGPU::SI_SPILL_A1024_RESTORE:
2394 case AMDGPU::SI_SPILL_AV32_RESTORE:
2395 case AMDGPU::SI_SPILL_AV64_RESTORE:
2396 case AMDGPU::SI_SPILL_AV96_RESTORE:
2397 case AMDGPU::SI_SPILL_AV128_RESTORE:
2398 case AMDGPU::SI_SPILL_AV160_RESTORE:
2399 case AMDGPU::SI_SPILL_AV192_RESTORE:
2400 case AMDGPU::SI_SPILL_AV224_RESTORE:
2401 case AMDGPU::SI_SPILL_AV256_RESTORE:
2402 case AMDGPU::SI_SPILL_AV288_RESTORE:
2403 case AMDGPU::SI_SPILL_AV320_RESTORE:
2404 case AMDGPU::SI_SPILL_AV352_RESTORE:
2405 case AMDGPU::SI_SPILL_AV384_RESTORE:
2406 case AMDGPU::SI_SPILL_AV512_RESTORE:
2407 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2408 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2409 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2410 const MachineOperand *VData = TII->getNamedOperand(*MI,
2411 AMDGPU::OpName::vdata);
2412 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2413 MFI->getStackPtrOffsetReg());
2414
2415 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2416 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2417 auto *MBB = MI->getParent();
2418 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2419 if (IsWWMRegSpill) {
2420 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2421 RS->isRegUsed(AMDGPU::SCC));
2422 }
2423
2424 buildSpillLoadStore(
2425 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2426 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2427 *MI->memoperands_begin(), RS);
2428
2429 if (IsWWMRegSpill)
2430 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2431
2432 MI->eraseFromParent();
2433 return true;
2434 }
2435 case AMDGPU::S_ADD_I32: {
2436 // TODO: Handle s_or_b32, s_and_b32.
2437 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2438 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2439
2440 assert(FrameReg || MFI->isBottomOfStack());
2441
2442 MachineOperand &DstOp = MI->getOperand(0);
2443 const DebugLoc &DL = MI->getDebugLoc();
2444 Register MaterializedReg = FrameReg;
2445
2446 // Defend against live scc, which should never happen in practice.
2447 bool DeadSCC = MI->getOperand(3).isDead();
2448
2449 Register TmpReg;
2450
2451 if (FrameReg && !ST.enableFlatScratch()) {
2452 // FIXME: In the common case where the add does not also read its result
2453 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2454 // available.
2455 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI,
2456 false, 0);
2457 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2458 .addDef(TmpReg, RegState::Renamable)
2459 .addReg(FrameReg)
2460 .addImm(ST.getWavefrontSizeLog2())
2461 .setOperandDead(3); // Set SCC dead
2462 MaterializedReg = TmpReg;
2463 }
2464
2465 int64_t Offset = FrameInfo.getObjectOffset(Index);
2466
2467 // For the non-immediate case, we could fall through to the default
2468 // handling, but we do an in-place update of the result register here to
2469 // avoid scavenging another register.
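// A rough illustration (flat-scratch case, placeholder registers): an
//   $sgpr4 = S_ADD_I32 %stack.0, 16, implicit-def dead $scc
// is rewritten in place into
//   $sgpr4 = S_ADD_I32 $sgpr32, 16 + <frame object offset>, implicit-def dead $scc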
2470 if (OtherOp.isImm()) {
2471 OtherOp.setImm(OtherOp.getImm() + Offset);
2472 Offset = 0;
2473
2474 if (MaterializedReg)
2475 FIOp.ChangeToRegister(MaterializedReg, false);
2476 else
2477 FIOp.ChangeToImmediate(0);
2478 } else if (MaterializedReg) {
2479 // If we can't fold the other operand, do another increment.
2480 Register DstReg = DstOp.getReg();
2481
2482 if (!TmpReg && MaterializedReg == FrameReg) {
2483 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2484 MI, false, 0);
2485 DstReg = TmpReg;
2486 }
2487
2488 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2489 .addDef(DstReg, RegState::Renamable)
2490 .addReg(MaterializedReg, RegState::Kill)
2491 .add(OtherOp);
2492 if (DeadSCC)
2493 AddI32.setOperandDead(3);
2494
2495 MaterializedReg = DstReg;
2496
2497 OtherOp.ChangeToRegister(MaterializedReg, false);
2498 OtherOp.setIsKill(true);
2499 OtherOp.setIsRenamable(true);
2501 } else {
2502 // If we don't have any other offset to apply, we can just directly
2503 // interpret the frame index as the offset.
2504 FIOp.ChangeToImmediate(Offset);
2505 }
2506
2507 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2508 assert(Offset == 0);
2509 MI->removeOperand(3);
2510 MI->removeOperand(OtherOpIdx);
2511 MI->setDesc(TII->get(FIOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2512 } else if (DeadSCC && FIOp.isImm() && FIOp.getImm() == 0) {
2513 assert(Offset == 0);
2514 MI->removeOperand(3);
2515 MI->removeOperand(FIOperandNum);
2516 MI->setDesc(
2517 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2518 }
2519
2520 assert(!FIOp.isFI());
2521 return true;
2522 }
2523 default: {
2524 // Other access to frame index
2525 const DebugLoc &DL = MI->getDebugLoc();
2526
2527 int64_t Offset = FrameInfo.getObjectOffset(Index);
2528 if (ST.enableFlatScratch()) {
2529 if (TII->isFLATScratch(*MI)) {
2530 assert((int16_t)FIOperandNum ==
2531 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2532 AMDGPU::OpName::saddr));
2533
2534 // The offset is always swizzled, just replace it
2535 if (FrameReg)
2536 FIOp.ChangeToRegister(FrameReg, false);
2537
2538 MachineOperand *OffsetOp =
2539 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2540 int64_t NewOffset = Offset + OffsetOp->getImm();
2541 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2542 SIInstrFlags::FlatScratch)) {
2543 OffsetOp->setImm(NewOffset);
2544 if (FrameReg)
2545 return false;
2546 Offset = 0;
2547 }
2548
2549 if (!Offset) {
2550 unsigned Opc = MI->getOpcode();
2551 int NewOpc = -1;
2552 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2553 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2554 } else if (ST.hasFlatScratchSTMode()) {
2555 // On GFX10 we have ST mode, which uses no registers for the address.
2556 // Otherwise we need to materialize 0 into an SGPR.
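// Illustrative ST-form syntax (both address operands omitted):
//   scratch_store_dword off, v0, off offset:16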
2557 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2558 }
2559
2560 if (NewOpc != -1) {
2561 // removeOperand doesn't fix up tied operand indexes as it goes, so
2562 // it would assert. Untie vdst_in for now and retie it afterwards.
2563 int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2564 AMDGPU::OpName::vdst_in);
2565 bool TiedVDst = VDstIn != -1 &&
2566 MI->getOperand(VDstIn).isReg() &&
2567 MI->getOperand(VDstIn).isTied();
2568 if (TiedVDst)
2569 MI->untieRegOperand(VDstIn);
2570
2571 MI->removeOperand(
2572 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2573
2574 if (TiedVDst) {
2575 int NewVDst =
2576 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2577 int NewVDstIn =
2578 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2579 assert (NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2580 MI->tieOperands(NewVDst, NewVDstIn);
2581 }
2582 MI->setDesc(TII->get(NewOpc));
2583 return false;
2584 }
2585 }
2586 }
2587
2588 if (!FrameReg) {
2589 FIOp.ChangeToImmediate(Offset);
2590 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2591 return false;
2592 }
2593
2594 // We need to use a register here. Check whether we can use an SGPR or
2595 // need a VGPR.
2596 FIOp.ChangeToRegister(AMDGPU::M0, false);
2597 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2598
2599 if (!Offset && FrameReg && UseSGPR) {
2600 FIOp.setReg(FrameReg);
2601 return false;
2602 }
2603
2604 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
2605 : &AMDGPU::VGPR_32RegClass;
2606
2607 Register TmpReg =
2608 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2609 FIOp.setReg(TmpReg);
2610 FIOp.setIsKill();
2611
2612 if ((!FrameReg || !Offset) && TmpReg) {
2613 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2614 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2615 if (FrameReg)
2616 MIB.addReg(FrameReg);
2617 else
2618 MIB.addImm(Offset);
2619
2620 return false;
2621 }
2622
2623 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2624 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2625
2626 Register TmpSReg =
2627 UseSGPR ? TmpReg
2628 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2629 MI, false, 0, !UseSGPR);
2630
2631 // TODO: for flat scratch another attempt can be made with a VGPR index
2632 // if no SGPRs can be scavenged.
2633 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2634 report_fatal_error("Cannot scavenge register in FI elimination!");
2635
2636 if (!TmpSReg) {
2637 // Use frame register and restore it after.
2638 TmpSReg = FrameReg;
2639 FIOp.setReg(FrameReg);
2640 FIOp.setIsKill(false);
2641 }
2642
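// When SCC is live, the sequence below preserves it: S_ADDC_U32 folds the
// incoming SCC bit into bit 0 of the (even-aligned) sum FrameReg + Offset,
// S_BITCMP1_B32 then restores SCC from that bit, and S_BITSET0_B32 clears
// the bit again to leave the plain sum in TmpSReg.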
2643 if (NeedSaveSCC) {
2644 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2645 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2646 .addReg(FrameReg)
2647 .addImm(Offset);
2648 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2649 .addReg(TmpSReg)
2650 .addImm(0);
2651 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2652 .addImm(0)
2653 .addReg(TmpSReg);
2654 } else {
2655 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2656 .addReg(FrameReg)
2657 .addImm(Offset);
2658 }
2659
2660 if (!UseSGPR)
2661 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2662 .addReg(TmpSReg, RegState::Kill);
2663
2664 if (TmpSReg == FrameReg) {
2665 // Undo frame register modification.
2666 if (NeedSaveSCC &&
2667 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2668 MachineBasicBlock::iterator I =
2669 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2670 TmpSReg)
2671 .addReg(FrameReg)
2672 .addImm(-Offset);
2673 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2674 .addReg(TmpSReg)
2675 .addImm(0);
2676 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2677 TmpSReg)
2678 .addImm(0)
2679 .addReg(TmpSReg);
2680 } else {
2681 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2682 FrameReg)
2683 .addReg(FrameReg)
2684 .addImm(-Offset);
2685 }
2686 }
2687
2688 return false;
2689 }
2690
2691 bool IsMUBUF = TII->isMUBUF(*MI);
2692
2693 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2694 // Convert to a swizzled stack address by scaling by the wave size.
2695 // In an entry function/kernel the offset is already swizzled.
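// E.g. (rough sketch, wave64): an unswizzled frame register value of 0x1000
// corresponds to a per-lane address of 0x1000 / 64 = 0x40, to which the
// lane-space object offset is then added.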
2696 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2697 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2698 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2699 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2700 ? &AMDGPU::SReg_32RegClass
2701 : &AMDGPU::VGPR_32RegClass;
2702 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2703 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
2704 MI->getOpcode() == AMDGPU::S_MOV_B32;
2705 Register ResultReg =
2706 IsCopy ? MI->getOperand(0).getReg()
2707 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
2708
2709 int64_t Offset = FrameInfo.getObjectOffset(Index);
2710 if (Offset == 0) {
2711 unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
2712 : AMDGPU::V_LSHRREV_B32_e64;
2713 Register TmpResultReg = ResultReg;
2714 if (IsSALU && LiveSCC) {
2715 TmpResultReg = RS->scavengeRegisterBackwards(
2716 AMDGPU::VGPR_32RegClass, MI, false, 0);
2717 }
2718
2719 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
2720 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2721 // For V_LSHRREV, the operands are reversed (the shift count goes
2722 // first).
2723 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2724 else
2725 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2726 if (IsSALU && !LiveSCC)
2727 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2728 if (IsSALU && LiveSCC) {
2729 Register NewDest =
2730 IsCopy ? ResultReg
2731 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
2732 Shift, false, 0);
2733 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2734 NewDest)
2735 .addReg(TmpResultReg);
2736 ResultReg = NewDest;
2737 }
2738 } else {
2739 MachineInstrBuilder MIB;
2740 if (!IsSALU) {
2741 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
2742 nullptr) {
2743 // Reuse ResultReg in intermediate step.
2744 Register ScaledReg = ResultReg;
2745
2746 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2747 ScaledReg)
2748 .addImm(ST.getWavefrontSizeLog2())
2749 .addReg(FrameReg);
2750
2751 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
2752
2753 // TODO: Fold if use instruction is another add of a constant.
2755 // FIXME: This can fail
2756 MIB.addImm(Offset);
2757 MIB.addReg(ScaledReg, RegState::Kill);
2758 if (!IsVOP2)
2759 MIB.addImm(0); // clamp bit
2760 } else {
2761 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
2762 "Need to reuse carry out register");
2763
2764 // Use scavenged unused carry out as offset register.
2765 Register ConstOffsetReg;
2766 if (!isWave32)
2767 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
2768 else
2769 ConstOffsetReg = MIB.getReg(1);
2770
2771 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
2772 .addImm(Offset);
2773 MIB.addReg(ConstOffsetReg, RegState::Kill);
2774 MIB.addReg(ScaledReg, RegState::Kill);
2775 MIB.addImm(0); // clamp bit
2776 }
2777 }
2778 }
2779 if (!MIB || IsSALU) {
2780 // We have to produce a carry out, and there isn't a free SGPR pair
2781 // for it. We can keep the whole computation on the SALU to avoid
2782 // clobbering an additional register at the cost of an extra mov.
2783
2784 // We may have 1 free scratch SGPR even though a carry out is
2785 // unavailable. Only one additional mov is needed.
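// Illustrative SALU-only sequence (placeholder registers, wave64 so the
// shift amount is 6):
//   s_lshr_b32 s5, s34, 6          ; frame register scaled down to lane space
//   s_add_i32  s5, s5, <offset>    ; apply the frame object offset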
2786 Register TmpScaledReg = IsCopy && IsSALU
2787 ? ResultReg
2788 : RS->scavengeRegisterBackwards(
2789 AMDGPU::SReg_32_XM0RegClass, MI,
2790 false, 0, /*AllowSpill=*/false);
2791 Register ScaledReg =
2792 TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
2793 Register TmpResultReg = ScaledReg;
2794
2795 if (!LiveSCC) {
2796 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
2797 .addReg(FrameReg)
2798 .addImm(ST.getWavefrontSizeLog2());
2799 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
2800 .addReg(TmpResultReg, RegState::Kill)
2801 .addImm(Offset);
2802 } else {
2803 TmpResultReg = RS->scavengeRegisterBackwards(
2804 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
2805
2806 MachineInstrBuilder Add;
2807 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
2808 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2809 TmpResultReg)
2810 .addImm(ST.getWavefrontSizeLog2())
2811 .addReg(FrameReg);
2812 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
2813 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32),
2814 ResultReg)
2815 .addImm(Offset);
2816 Add.addReg(ResultReg, RegState::Kill)
2817 .addReg(TmpResultReg, RegState::Kill)
2818 .addImm(0);
2819 } else
2820 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
2821 } else {
2822 assert(Offset > 0 &&
2823 isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
2824 "offset is unsafe for v_mad_u32_u24");
2825
2826 // We start with a frame pointer with a wave space value, and
2827 // an offset in lane-space. We are materializing a lane space
2828 // value. We can either do a right shift of the frame pointer
2829 // to get to lane space, or a left shift of the offset to get
2830 // to wavespace. We can right shift after the computation to
2831 // get back to the desired per-lane value. We are using the
2832 // mad_u32_u24 primarily as an add with no carry out clobber.
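// In other words (a sketch of the arithmetic below):
//   result = (Offset * WavefrontSize + FrameReg) >> log2(WavefrontSize)
//          = Offset + FrameReg / WavefrontSize
// i.e. the desired per-lane value, computed without clobbering a carry out.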
2833 bool IsInlinableLiteral = AMDGPU::isInlinableLiteral32(
2834 Offset, ST.hasInv2PiInlineImm());
2835 if (!IsInlinableLiteral) {
2836 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
2837 TmpResultReg)
2838 .addImm(Offset);
2839 }
2840
2841 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
2842 TmpResultReg);
2843
2844 if (!IsInlinableLiteral) {
2845 Add.addReg(TmpResultReg, RegState::Kill);
2846 } else {
2847 // We fold the offset into the mad itself if it's inlinable.
2848 Add.addImm(Offset);
2849 }
2850 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
2851 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2852 TmpResultReg)
2853 .addImm(ST.getWavefrontSizeLog2())
2854 .addReg(TmpResultReg);
2855 }
2856
2857 Register NewDest = IsCopy ? ResultReg
2858 : RS->scavengeRegisterBackwards(
2859 AMDGPU::SReg_32RegClass, *Add,
2860 false, 0, /*AllowSpill=*/true);
2861 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2862 NewDest)
2863 .addReg(TmpResultReg);
2864 ResultReg = NewDest;
2865 }
2866 if (!IsSALU)
2867 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
2868 .addReg(TmpResultReg, RegState::Kill);
2869 else
2870 ResultReg = TmpResultReg;
2871 // If there were truly no free SGPRs, we need to undo everything.
2872 if (!TmpScaledReg.isValid()) {
2873 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2874 .addReg(ScaledReg, RegState::Kill)
2875 .addImm(-Offset);
2876 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
2877 .addReg(FrameReg)
2878 .addImm(ST.getWavefrontSizeLog2());
2879 }
2880 }
2881 }
2882
2883 // Don't introduce an extra copy if we're just materializing in a mov.
2884 if (IsCopy) {
2885 MI->eraseFromParent();
2886 return true;
2887 }
2888 FIOp.ChangeToRegister(ResultReg, false, false, true);
2889 return false;
2890 }
2891
2892 if (IsMUBUF) {
2893 // Disable offen so we don't need a 0 vgpr base.
2894 assert(static_cast<int>(FIOperandNum) ==
2895 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2896 AMDGPU::OpName::vaddr));
2897
2898 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
2899 assert((SOffset.isImm() && SOffset.getImm() == 0));
2900
2901 if (FrameReg != AMDGPU::NoRegister)
2902 SOffset.ChangeToRegister(FrameReg, false);
2903
2904 int64_t Offset = FrameInfo.getObjectOffset(Index);
2905 int64_t OldImm
2906 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
2907 int64_t NewOffset = OldImm + Offset;
2908
2909 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
2910 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
2911 MI->eraseFromParent();
2912 return true;
2913 }
2914 }
2915
2916 // If the offset is simply too big, don't convert to a scratch wave offset
2917 // relative index.
2918
2919 FIOp.ChangeToImmediate(Offset);
2920 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
2921 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2922 MI, false, 0);
2923 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2924 .addImm(Offset);
2925 FIOp.ChangeToRegister(TmpReg, false, false, true);
2926 }
2927 }
2928 }
2929 return false;
2930}
2931
2934}
2935
2937 return getRegBitWidth(RC.getID());
2938}
2939
2940static const TargetRegisterClass *
2942 if (BitWidth == 64)
2943 return &AMDGPU::VReg_64RegClass;
2944 if (BitWidth == 96)
2945 return &AMDGPU::VReg_96RegClass;
2946 if (BitWidth == 128)
2947 return &AMDGPU::VReg_128RegClass;
2948 if (BitWidth == 160)
2949 return &AMDGPU::VReg_160RegClass;
2950 if (BitWidth == 192)
2951 return &AMDGPU::VReg_192RegClass;
2952 if (BitWidth == 224)
2953 return &AMDGPU::VReg_224RegClass;
2954 if (BitWidth == 256)
2955 return &AMDGPU::VReg_256RegClass;
2956 if (BitWidth == 288)
2957 return &AMDGPU::VReg_288RegClass;
2958 if (BitWidth == 320)
2959 return &AMDGPU::VReg_320RegClass;
2960 if (BitWidth == 352)
2961 return &AMDGPU::VReg_352RegClass;
2962 if (BitWidth == 384)
2963 return &AMDGPU::VReg_384RegClass;
2964 if (BitWidth == 512)
2965 return &AMDGPU::VReg_512RegClass;
2966 if (BitWidth == 1024)
2967 return &AMDGPU::VReg_1024RegClass;
2968
2969 return nullptr;
2970}
2971
2972static const TargetRegisterClass *
2974 if (BitWidth == 64)
2975 return &AMDGPU::VReg_64_Align2RegClass;
2976 if (BitWidth == 96)
2977 return &AMDGPU::VReg_96_Align2RegClass;
2978 if (BitWidth == 128)
2979 return &AMDGPU::VReg_128_Align2RegClass;
2980 if (BitWidth == 160)
2981 return &AMDGPU::VReg_160_Align2RegClass;
2982 if (BitWidth == 192)
2983 return &AMDGPU::VReg_192_Align2RegClass;
2984 if (BitWidth == 224)
2985 return &AMDGPU::VReg_224_Align2RegClass;
2986 if (BitWidth == 256)
2987 return &AMDGPU::VReg_256_Align2RegClass;
2988 if (BitWidth == 288)
2989 return &AMDGPU::VReg_288_Align2RegClass;
2990 if (BitWidth == 320)
2991 return &AMDGPU::VReg_320_Align2RegClass;
2992 if (BitWidth == 352)
2993 return &AMDGPU::VReg_352_Align2RegClass;
2994 if (BitWidth == 384)
2995 return &AMDGPU::VReg_384_Align2RegClass;
2996 if (BitWidth == 512)
2997 return &AMDGPU::VReg_512_Align2RegClass;
2998 if (BitWidth == 1024)
2999 return &AMDGPU::VReg_1024_Align2RegClass;
3000
3001 return nullptr;
3002}
3003
3004const TargetRegisterClass *
3006 if (BitWidth == 1)
3007 return &AMDGPU::VReg_1RegClass;
3008 if (BitWidth == 16)
3009 return &AMDGPU::VGPR_16RegClass;
3010 if (BitWidth == 32)
3011 return &AMDGPU::VGPR_32RegClass;
3014}
3015
3016static const TargetRegisterClass *
3018 if (BitWidth == 64)
3019 return &AMDGPU::AReg_64RegClass;
3020 if (BitWidth == 96)
3021 return &AMDGPU::AReg_96RegClass;
3022 if (BitWidth == 128)
3023 return &AMDGPU::AReg_128RegClass;
3024 if (BitWidth == 160)
3025 return &AMDGPU::AReg_160RegClass;
3026 if (BitWidth == 192)
3027 return &AMDGPU::AReg_192RegClass;
3028 if (BitWidth == 224)
3029 return &AMDGPU::AReg_224RegClass;
3030 if (BitWidth == 256)
3031 return &AMDGPU::AReg_256RegClass;
3032 if (BitWidth == 288)
3033 return &AMDGPU::AReg_288RegClass;
3034 if (BitWidth == 320)
3035 return &AMDGPU::AReg_320RegClass;
3036 if (BitWidth == 352)
3037 return &AMDGPU::AReg_352RegClass;
3038 if (BitWidth == 384)
3039 return &AMDGPU::AReg_384RegClass;
3040 if (BitWidth == 512)
3041 return &AMDGPU::AReg_512RegClass;
3042 if (BitWidth == 1024)
3043 return &AMDGPU::AReg_1024RegClass;
3044
3045 return nullptr;
3046}
3047
3048static const TargetRegisterClass *
3050 if (BitWidth == 64)
3051 return &AMDGPU::AReg_64_Align2RegClass;
3052 if (BitWidth == 96)
3053 return &AMDGPU::AReg_96_Align2RegClass;
3054 if (BitWidth == 128)
3055 return &AMDGPU::AReg_128_Align2RegClass;
3056 if (BitWidth == 160)
3057 return &AMDGPU::AReg_160_Align2RegClass;
3058 if (BitWidth == 192)
3059 return &AMDGPU::AReg_192_Align2RegClass;
3060 if (BitWidth == 224)
3061 return &AMDGPU::AReg_224_Align2RegClass;
3062 if (BitWidth == 256)
3063 return &AMDGPU::AReg_256_Align2RegClass;
3064 if (BitWidth == 288)
3065 return &AMDGPU::AReg_288_Align2RegClass;
3066 if (BitWidth == 320)
3067 return &AMDGPU::AReg_320_Align2RegClass;
3068 if (BitWidth == 352)
3069 return &AMDGPU::AReg_352_Align2RegClass;
3070 if (BitWidth == 384)
3071 return &AMDGPU::AReg_384_Align2RegClass;
3072 if (BitWidth == 512)
3073 return &AMDGPU::AReg_512_Align2RegClass;
3074 if (BitWidth == 1024)
3075 return &AMDGPU::AReg_1024_Align2RegClass;
3076
3077 return nullptr;
3078}
3079
3080const TargetRegisterClass *
3082 if (BitWidth == 16)
3083 return &AMDGPU::AGPR_LO16RegClass;
3084 if (BitWidth == 32)
3085 return &AMDGPU::AGPR_32RegClass;
3088}
3089
3090static const TargetRegisterClass *
3092 if (BitWidth == 64)
3093 return &AMDGPU::AV_64RegClass;
3094 if (BitWidth == 96)
3095 return &AMDGPU::AV_96RegClass;
3096 if (BitWidth == 128)
3097 return &AMDGPU::AV_128RegClass;
3098 if (BitWidth == 160)
3099 return &AMDGPU::AV_160RegClass;
3100 if (BitWidth == 192)
3101 return &AMDGPU::AV_192RegClass;
3102 if (BitWidth == 224)
3103 return &AMDGPU::AV_224RegClass;
3104 if (BitWidth == 256)
3105 return &AMDGPU::AV_256RegClass;
3106 if (BitWidth == 288)
3107 return &AMDGPU::AV_288RegClass;
3108 if (BitWidth == 320)
3109 return &AMDGPU::AV_320RegClass;
3110 if (BitWidth == 352)
3111 return &AMDGPU::AV_352RegClass;
3112 if (BitWidth == 384)
3113 return &AMDGPU::AV_384RegClass;
3114 if (BitWidth == 512)
3115 return &AMDGPU::AV_512RegClass;
3116 if (BitWidth == 1024)
3117 return &AMDGPU::AV_1024RegClass;
3118
3119 return nullptr;
3120}
3121
3122static const TargetRegisterClass *
3124 if (BitWidth == 64)
3125 return &AMDGPU::AV_64_Align2RegClass;
3126 if (BitWidth == 96)
3127 return &AMDGPU::AV_96_Align2RegClass;
3128 if (BitWidth == 128)
3129 return &AMDGPU::AV_128_Align2RegClass;
3130 if (BitWidth == 160)
3131 return &AMDGPU::AV_160_Align2RegClass;
3132 if (BitWidth == 192)
3133 return &AMDGPU::AV_192_Align2RegClass;
3134 if (BitWidth == 224)
3135 return &AMDGPU::AV_224_Align2RegClass;
3136 if (BitWidth == 256)
3137 return &AMDGPU::AV_256_Align2RegClass;
3138 if (BitWidth == 288)
3139 return &AMDGPU::AV_288_Align2RegClass;
3140 if (BitWidth == 320)
3141 return &AMDGPU::AV_320_Align2RegClass;
3142 if (BitWidth == 352)
3143 return &AMDGPU::AV_352_Align2RegClass;
3144 if (BitWidth == 384)
3145 return &AMDGPU::AV_384_Align2RegClass;
3146 if (BitWidth == 512)
3147 return &AMDGPU::AV_512_Align2RegClass;
3148 if (BitWidth == 1024)
3149 return &AMDGPU::AV_1024_Align2RegClass;
3150
3151 return nullptr;
3152}
3153
3154const TargetRegisterClass *
3156 if (BitWidth == 32)
3157 return &AMDGPU::AV_32RegClass;
3158 return ST.needsAlignedVGPRs()
3161}
3162
3163const TargetRegisterClass *
3165 if (BitWidth == 16)
3166 return &AMDGPU::SGPR_LO16RegClass;
3167 if (BitWidth == 32)
3168 return &AMDGPU::SReg_32RegClass;
3169 if (BitWidth == 64)
3170 return &AMDGPU::SReg_64RegClass;
3171 if (BitWidth == 96)
3172 return &AMDGPU::SGPR_96RegClass;
3173 if (BitWidth == 128)
3174 return &AMDGPU::SGPR_128RegClass;
3175 if (BitWidth == 160)
3176 return &AMDGPU::SGPR_160RegClass;
3177 if (BitWidth == 192)
3178 return &AMDGPU::SGPR_192RegClass;
3179 if (BitWidth == 224)
3180 return &AMDGPU::SGPR_224RegClass;
3181 if (BitWidth == 256)
3182 return &AMDGPU::SGPR_256RegClass;
3183 if (BitWidth == 288)
3184 return &AMDGPU::SGPR_288RegClass;
3185 if (BitWidth == 320)
3186 return &AMDGPU::SGPR_320RegClass;
3187 if (BitWidth == 352)
3188 return &AMDGPU::SGPR_352RegClass;
3189 if (BitWidth == 384)
3190 return &AMDGPU::SGPR_384RegClass;
3191 if (BitWidth == 512)
3192 return &AMDGPU::SGPR_512RegClass;
3193 if (BitWidth == 1024)
3194 return &AMDGPU::SGPR_1024RegClass;
3195
3196 return nullptr;
3197}
3198
3200 Register Reg) const {
3201 const TargetRegisterClass *RC;
3202 if (Reg.isVirtual())
3203 RC = MRI.getRegClass(Reg);
3204 else
3205 RC = getPhysRegBaseClass(Reg);
3206 return RC ? isSGPRClass(RC) : false;
3207}
3208
3209const TargetRegisterClass *
3211 unsigned Size = getRegSizeInBits(*SRC);
3213 assert(VRC && "Invalid register class size");
3214 return VRC;
3215}
3216
3217const TargetRegisterClass *
3219 unsigned Size = getRegSizeInBits(*SRC);
3221 assert(ARC && "Invalid register class size");
3222 return ARC;
3223}
3224
3225const TargetRegisterClass *
3227 unsigned Size = getRegSizeInBits(*VRC);
3228 if (Size == 32)
3229 return &AMDGPU::SGPR_32RegClass;
3231 assert(SRC && "Invalid register class size");
3232 return SRC;
3233}
3234
3235const TargetRegisterClass *
3237 const TargetRegisterClass *SubRC,
3238 unsigned SubIdx) const {
3239 // Ensure this subregister index is aligned in the super register.
3240 const TargetRegisterClass *MatchRC =
3241 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3242 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3243}
3244
3245bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3248 return !ST.hasMFMAInlineLiteralBug();
3249
3250 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3251 OpType <= AMDGPU::OPERAND_SRC_LAST;
3252}
3253
3255 const TargetRegisterClass *DefRC,
3256 unsigned DefSubReg,
3257 const TargetRegisterClass *SrcRC,
3258 unsigned SrcSubReg) const {
3259 // We want to prefer the smallest register class possible, so we don't want to
3260 // stop and rewrite on anything that looks like a subregister
3261 // extract. Operations mostly don't care about the super register class, so we
3262 // only want to stop on the most basic of copies between the same register
3263 // class.
3264 //
3265 // e.g. if we have something like
3266 // %0 = ...
3267 // %1 = ...
3268 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
3269 // %3 = COPY %2, sub0
3270 //
3271 // We want to look through the COPY to find:
3272 // => %3 = COPY %0
3273
3274 // Plain copy.
3275 return getCommonSubClass(DefRC, SrcRC) != nullptr;
3276}
3277
3278bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3279 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3280 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3282}
3283
3284/// Returns a lowest register that is not used at any point in the function.
3285/// If all registers are used, then this function will return
3286/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3287/// highest unused register.
3290 const MachineFunction &MF, bool ReserveHighestRegister) const {
3291 if (ReserveHighestRegister) {
3292 for (MCRegister Reg : reverse(*RC))
3293 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3294 return Reg;
3295 } else {
3296 for (MCRegister Reg : *RC)
3297 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3298 return Reg;
3299 }
3300 return MCRegister();
3301}
3302
3304 const RegisterBankInfo &RBI,
3305 Register Reg) const {
3306 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3307 if (!RB)
3308 return false;
3309
3310 return !RBI.isDivergentRegBank(RB);
3311}
3312
3314 unsigned EltSize) const {
3315 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3316 assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
3317
3318 const unsigned RegDWORDs = RegBitWidth / 32;
3319 const unsigned EltDWORDs = EltSize / 4;
3320 assert(RegSplitParts.size() + 1 >= EltDWORDs);
3321
3322 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
3323 const unsigned NumParts = RegDWORDs / EltDWORDs;
3324
3325 return ArrayRef(Parts.data(), NumParts);
3326}
3327
3330 Register Reg) const {
3331 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3332}
3333
3334const TargetRegisterClass *
3336 const MachineOperand &MO) const {
3337 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3338 return getSubRegisterClass(SrcRC, MO.getSubReg());
3339}
3340
3342 Register Reg) const {
3343 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3344 // Registers without classes are unaddressable, SGPR-like registers.
3345 return RC && isVGPRClass(RC);
3346}
3347
3349 Register Reg) const {
3350 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3351
3352 // Registers without classes are unaddressable, SGPR-like registers.
3353 return RC && isAGPRClass(RC);
3354}
3355
3357 const TargetRegisterClass *SrcRC,
3358 unsigned SubReg,
3359 const TargetRegisterClass *DstRC,
3360 unsigned DstSubReg,
3361 const TargetRegisterClass *NewRC,
3362 LiveIntervals &LIS) const {
3363 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3364 unsigned DstSize = getRegSizeInBits(*DstRC);
3365 unsigned NewSize = getRegSizeInBits(*NewRC);
3366
3367 // Do not increase the size of registers beyond a dword; we would need to
3368 // allocate adjacent registers and constrain regalloc more than needed.
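// E.g. coalescing two 64-bit registers into a 128-bit register class is
// rejected below, while folding a 32-bit copy into any larger register is
// always allowed.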
3369
3370 // Always allow dword coalescing.
3371 if (SrcSize <= 32 || DstSize <= 32)
3372 return true;
3373
3374 return NewSize <= DstSize || NewSize <= SrcSize;
3375}
3376
3378 MachineFunction &MF) const {
3380
3381 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
3382 MF.getFunction());
3383 switch (RC->getID()) {
3384 default:
3385 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3386 case AMDGPU::VGPR_32RegClassID:
3387 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
3388 case AMDGPU::SGPR_32RegClassID:
3389 case AMDGPU::SGPR_LO16RegClassID:
3390 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
3391 }
3392}
3393
3395 unsigned Idx) const {
3396 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3397 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3398 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3399 const_cast<MachineFunction &>(MF));
3400
3401 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3402 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3403 const_cast<MachineFunction &>(MF));
3404
3405 llvm_unreachable("Unexpected register pressure set!");
3406}
3407
3408const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3409 static const int Empty[] = { -1 };
3410
3411 if (RegPressureIgnoredUnits[RegUnit])
3412 return Empty;
3413
3414 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3415}
3416
3418 // Not a callee saved register.
3419 return AMDGPU::SGPR30_SGPR31;
3420}
3421
3422const TargetRegisterClass *
3424 const RegisterBank &RB) const {
3425 switch (RB.getID()) {
3426 case AMDGPU::VGPRRegBankID:
3428 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3429 case AMDGPU::VCCRegBankID:
3430 assert(Size == 1);
3431 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3432 : &AMDGPU::SReg_64_XEXECRegClass;
3433 case AMDGPU::SGPRRegBankID:
3434 return getSGPRClassForBitWidth(std::max(32u, Size));
3435 case AMDGPU::AGPRRegBankID:
3436 return getAGPRClassForBitWidth(std::max(32u, Size));
3437 default:
3438 llvm_unreachable("unknown register bank");
3439 }
3440}
3441
3442const TargetRegisterClass *
3444 const MachineRegisterInfo &MRI) const {
3445 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3446 if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
3447 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3448
3449 if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
3450 return getAllocatableClass(RC);
3451
3452 return nullptr;
3453}
3454
3456 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3457}
3458
3460 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3461}
3462
3464 // VGPR tuples have an alignment requirement on gfx90a variants.
3465 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3466 : &AMDGPU::VReg_64RegClass;
3467}
3468
3469const TargetRegisterClass *
3470SIRegisterInfo::getRegClass(unsigned RCID) const {
3471 switch ((int)RCID) {
3472 case AMDGPU::SReg_1RegClassID:
3473 return getBoolRC();
3474 case AMDGPU::SReg_1_XEXECRegClassID:
3475 return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
3476 : &AMDGPU::SReg_64_XEXECRegClass;
3477 case -1:
3478 return nullptr;
3479 default:
3480 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3481 }
3482}
3483
3484// Find reaching register definition
3488 LiveIntervals *LIS) const {
3489 auto &MDT = LIS->getDomTree();
3490 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3491 SlotIndex DefIdx;
3492
3493 if (Reg.isVirtual()) {
3494 if (!LIS->hasInterval(Reg))
3495 return nullptr;
3496 LiveInterval &LI = LIS->getInterval(Reg);
3497 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3498 : MRI.getMaxLaneMaskForVReg(Reg);
3499 VNInfo *V = nullptr;
3500 if (LI.hasSubRanges()) {
3501 for (auto &S : LI.subranges()) {
3502 if ((S.LaneMask & SubLanes) == SubLanes) {
3503 V = S.getVNInfoAt(UseIdx);
3504 break;
3505 }
3506 }
3507 } else {
3508 V = LI.getVNInfoAt(UseIdx);
3509 }
3510 if (!V)
3511 return nullptr;
3512 DefIdx = V->def;
3513 } else {
3514 // Find last def.
3515 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3516 LiveRange &LR = LIS->getRegUnit(Unit);
3517 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3518 if (!DefIdx.isValid() ||
3519 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3520 LIS->getInstructionFromIndex(V->def)))
3521 DefIdx = V->def;
3522 } else {
3523 return nullptr;
3524 }
3525 }
3526 }
3527
3528 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3529
3530 if (!Def || !MDT.dominates(Def, &Use))
3531 return nullptr;
3532
3533 assert(Def->modifiesRegister(Reg, this));
3534
3535 return Def;
3536}
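For intuition, findReachingDef answers the question "which definition of Reg is live at this use, and does it dominate the use?". A greatly simplified single-block model follows (hypothetical types, no LiveIntervals, subranges, or dominator tree):

#include <vector>

struct DefSlot { int Slot; }; // stand-in for the slot index of a def

static const DefSlot *findReachingDefSketch(const std::vector<DefSlot> &Defs,
                                            int UseSlot) {
  const DefSlot *Reaching = nullptr;
  for (const DefSlot &D : Defs)
    if (D.Slot < UseSlot && (!Reaching || D.Slot > Reaching->Slot))
      Reaching = &D; // keep the latest def that still precedes the use
  return Reaching;   // nullptr: no def reaches the use
}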
3537
3538MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3539 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3540
3541 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3542 AMDGPU::SReg_32RegClass,
3543 AMDGPU::AGPR_32RegClass } ) {
3544 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3545 return Super;
3546 }
3547 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3548 &AMDGPU::VGPR_32RegClass)) {
3549 return Super;
3550 }
3551
3552 return AMDGPU::NoRegister;
3553}
3554
3555bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3556 if (!ST.needsAlignedVGPRs())
3557 return true;
3558
3559 if (isVGPRClass(&RC))
3560 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3561 if (isAGPRClass(&RC))
3562 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3563 if (isVectorSuperClass(&RC))
3564 return RC.hasSuperClassEq(
3565 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3566
3567 return true;
3568}
3569
3570const TargetRegisterClass *
3571SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3572 if (!RC || !ST.needsAlignedVGPRs())
3573 return RC;
3574
3575 unsigned Size = getRegSizeInBits(*RC);
3576 if (Size <= 32)
3577 return RC;
3578
3579 if (isVGPRClass(RC))
3580 return getAlignedVGPRClassForBitWidth(Size);
3581 if (isAGPRClass(RC))
3582 return getAlignedAGPRClassForBitWidth(Size);
3583 if (isVectorSuperClass(RC))
3584 return getAlignedVectorSuperClassForBitWidth(Size);
3585
3586 return RC;
3587}
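The net effect of the two alignment helpers above: on subtargets that require aligned VGPR tuples, vector classes wider than a dword are swapped for their even-aligned variants, while dword classes and SGPR classes are left alone. A toy, string-based illustration (not TargetRegisterClass):

#include <string>

static std::string properlyAlignedRCSketch(const std::string &RC,
                                           unsigned SizeBits,
                                           bool NeedsAlignedVGPRs) {
  if (!NeedsAlignedVGPRs || SizeBits <= 32)
    return RC;
  return RC + "_Align2"; // e.g. "VReg_64" -> "VReg_64_Align2"
}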
3588
3589ArrayRef<MCPhysReg>
3590SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3591 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3592}
3593
3594ArrayRef<MCPhysReg>
3595SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3596 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3597}
3598
3599ArrayRef<MCPhysReg>
3600SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3601 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3602}
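The three accessors above expose the same SGPR budget at different tuple widths: the number of n-dword tuples is the budget divided by n. Illustrative arithmetic only; the figure in the comment is a made-up example, not a real subtarget limit:

static unsigned numSgprTuplesSketch(unsigned MaxNumSGPRs, unsigned TupleDwords) {
  return MaxNumSGPRs / TupleDwords; // e.g. 100 SGPRs -> 50 pairs, 25 quads
}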
3603
3604unsigned
3605SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3606 unsigned SubReg) const {
3607 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3608 case SIRCFlags::HasSGPR:
3609 return std::min(128u, getSubRegIdxSize(SubReg));
3610 case SIRCFlags::HasAGPR:
3611 case SIRCFlags::HasVGPR:
3612 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: // A case for AV registers
3613 return std::min(32u, getSubRegIdxSize(SubReg));
3614 default:
3615 break;
3616 }
3617 return 0;
3618}
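getSubRegAlignmentNumBits caps the reported sub-register alignment by register kind: 128 bits for SGPR classes, 32 bits for vector (VGPR/AGPR/AV) classes, and 0 otherwise. A sketch with boolean flags standing in for the SIRCFlags bits:

#include <algorithm>

static unsigned subRegAlignmentBitsSketch(bool IsSGPR, bool IsVectorReg,
                                          unsigned SubRegIdxSizeBits) {
  if (IsSGPR)
    return std::min(128u, SubRegIdxSizeBits);
  if (IsVectorReg)
    return std::min(32u, SubRegIdxSizeBits);
  return 0;
}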