SIRegisterInfo.cpp
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
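// RegSplitParts[N-1][P] holds the sub-register index that covers 32*N bits
// starting at 32-bit channel P * N of a register tuple, and
// SubRegFromChannelTable maps a (width, channel) pair back to a sub-register
// index. Both tables are filled lazily in the SIRegisterInfo constructor.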
38std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
                                 const Twine &ErrMsg) {
  Fn.getContext().diagnose(
      DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
struct SGPRSpillBuilder {
  struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
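  // One lane per spilled 32-bit sub-register, so the mask width follows the
  // wave size. E.g. spilling a 4-dword SGPR tuple in wave64 gives
  // PerVGPR = 64, NumVGPRs = 1 and VGPRLanes = 0xf.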
  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
    RS->setRegUsed(TmpVGPR);

204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1   ; Reload scavenged VGPR from emergency slot
  // s_waitcnt vmcnt(0)     ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0, ST.getHwMode()),
332 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
333
334 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
335 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
336 (getSubRegIndexLaneMask(AMDGPU::lo16) |
337 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
338 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
339 "getNumCoveredRegs() will not work with generated subreg masks!");
340
341 RegPressureIgnoredUnits.resize(getNumRegUnits());
342 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
343 for (auto Reg : AMDGPU::VGPR_16RegClass) {
344 if (AMDGPU::isHi16Reg(Reg, *this))
345 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
346 }
347
348 // HACK: Until this is fully tablegen'd.
349 static llvm::once_flag InitializeRegSplitPartsFlag;
350
351 static auto InitializeRegSplitPartsOnce = [this]() {
352 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
353 unsigned Size = getSubRegIdxSize(Idx);
354 if (Size & 31)
355 continue;
356 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
357 unsigned Pos = getSubRegIdxOffset(Idx);
358 if (Pos % Size)
359 continue;
360 Pos /= Size;
361 if (Vec.empty()) {
362 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
363 Vec.resize(MaxNumParts);
364 }
365 Vec[Pos] = Idx;
366 }
367 };
368
369 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
370
371 static auto InitializeSubRegFromChannelTableOnce = [this]() {
372 for (auto &Row : SubRegFromChannelTable)
373 Row.fill(AMDGPU::NoSubRegister);
374 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
375 unsigned Width = getSubRegIdxSize(Idx) / 32;
376 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
378 Width = SubRegFromChannelTableWidthMap[Width];
379 if (Width == 0)
380 continue;
381 unsigned TableIdx = Width - 1;
382 assert(TableIdx < SubRegFromChannelTable.size());
383 assert(Offset < SubRegFromChannelTable[TableIdx].size());
384 SubRegFromChannelTable[TableIdx][Offset] = Idx;
385 }
386 };
387
388 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
389 llvm::call_once(InitializeSubRegFromChannelTableFlag,
390 InitializeSubRegFromChannelTableOnce);
391}
392
393void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
394 MCRegister Reg) const {
395 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
396 Reserved.set(*R);
397}
398
399// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
414 default: {
415 // Dummy to not crash RegisterClassInfo.
416 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
417 return &NoCalleeSavedReg;
418 }
419 }
420}
421
422const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
425}
426
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
440 // Calls to these functions never return, so we can pretend everything is
441 // preserved.
442 return AMDGPU_AllVGPRs_RegMask;
443 default:
444 return nullptr;
445 }
446}
447
const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
450}
451
bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
454}
455
const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If one were used here, the verifier would crash after
  // RegBankSelect in the GISel flow, because the aligned regclasses are not
  // fully assigned until instruction selection.
463 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
464 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
465 return &AMDGPU::AV_32RegClass;
466 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
467 return &AMDGPU::AV_64RegClass;
468 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
469 RC == &AMDGPU::AReg_64_Align2RegClass)
470 return &AMDGPU::AV_64_Align2RegClass;
471 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
472 return &AMDGPU::AV_96RegClass;
473 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
474 RC == &AMDGPU::AReg_96_Align2RegClass)
475 return &AMDGPU::AV_96_Align2RegClass;
476 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
477 return &AMDGPU::AV_128RegClass;
478 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
479 RC == &AMDGPU::AReg_128_Align2RegClass)
480 return &AMDGPU::AV_128_Align2RegClass;
481 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
482 return &AMDGPU::AV_160RegClass;
483 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
484 RC == &AMDGPU::AReg_160_Align2RegClass)
485 return &AMDGPU::AV_160_Align2RegClass;
486 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
487 return &AMDGPU::AV_192RegClass;
488 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
489 RC == &AMDGPU::AReg_192_Align2RegClass)
490 return &AMDGPU::AV_192_Align2RegClass;
491 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
492 return &AMDGPU::AV_256RegClass;
493 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
494 RC == &AMDGPU::AReg_256_Align2RegClass)
495 return &AMDGPU::AV_256_Align2RegClass;
496 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
497 return &AMDGPU::AV_512RegClass;
498 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
499 RC == &AMDGPU::AReg_512_Align2RegClass)
500 return &AMDGPU::AV_512_Align2RegClass;
501 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
502 return &AMDGPU::AV_1024RegClass;
503 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
504 RC == &AMDGPU::AReg_1024_Align2RegClass)
505 return &AMDGPU::AV_1024_Align2RegClass;
506 }
507
  return RC;
}
510
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry and chain
515 // functions, but never actually want to reference it when accessing our own
516 // frame. If we need a frame pointer we use it, but otherwise we can just use
517 // an immediate "0" which we represent by returning NoRegister.
518 if (FuncInfo->isBottomOfStack()) {
519 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
520 }
521 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
522 : FuncInfo->getStackPtrOffsetReg();
523}
524
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
527 // stack pointer, so we reserve a base pointer.
528 return shouldRealignStack(MF);
529}
530
531Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
532
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
547}
548
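// E.g. getSubRegFromChannel(2, 2) returns the sub-register index covering
// 32-bit channels 2 and 3 (sub2_sub3) of a wider register tuple.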
549unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
550 unsigned NumRegs) {
551 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
552 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
553 assert(NumRegIndex && "Not implemented");
554 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
555 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
556}
557
MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
561 const TargetRegisterClass *RC) const {
562 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
563 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
564 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
565}
566
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
569 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
570}
571
572std::pair<unsigned, unsigned>
SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
576 unsigned MaxNumAGPRs = MaxNumVGPRs;
577 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
578
579 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
580 // a wave may have up to 512 total vector registers combining together both
581 // VGPRs and AGPRs. Hence, in an entry function without calls and without
582 // AGPRs used within it, it is possible to use the whole vector register
583 // budget for VGPRs.
584 //
585 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
586 // register file accordingly.
587 if (ST.hasGFX90AInsts()) {
588 if (MFI->usesAGPRs(MF)) {
589 MaxNumVGPRs /= 2;
590 MaxNumAGPRs = MaxNumVGPRs;
591 } else {
592 if (MaxNumVGPRs > TotalNumVGPRs) {
593 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
594 MaxNumVGPRs = TotalNumVGPRs;
595 } else
596 MaxNumAGPRs = 0;
597 }
598 }
599
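  // E.g. on gfx90a with a 128-register budget: a function that uses AGPRs is
  // split evenly into 64 VGPRs and 64 AGPRs, while one that never touches
  // AGPRs keeps all 128 as VGPRs and gets 0 AGPRs.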
600 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
601}
602
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
605 Reserved.set(AMDGPU::MODE);
606
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

609 // Reserve special purpose registers.
610 //
611 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
612 // this seems likely to result in bugs, so I'm marking them as reserved.
613 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
614 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
615
616 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
617 reserveRegisterTuples(Reserved, AMDGPU::M0);
618
619 // Reserve src_vccz, src_execz, src_scc.
620 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
621 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
622 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
623
624 // Reserve the memory aperture registers
625 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
626 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
627 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
628 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
629
630 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
631 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
632
633 // Reserve xnack_mask registers - support is not implemented in Codegen.
634 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
635
636 // Reserve lds_direct register - support is not implemented in Codegen.
637 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
638
639 // Reserve Trap Handler registers - support is not implemented in Codegen.
640 reserveRegisterTuples(Reserved, AMDGPU::TBA);
641 reserveRegisterTuples(Reserved, AMDGPU::TMA);
642 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
643 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
644 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
645 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
646 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
647 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
648 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
649 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
650
651 // Reserve null register - it shall never be allocated
652 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
653
654 // Reserve SGPRs.
655 //
656 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
657 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
658 for (const TargetRegisterClass *RC : regclasses()) {
659 if (RC->isBaseClass() && isSGPRClass(RC)) {
660 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
661 for (MCPhysReg Reg : *RC) {
662 unsigned Index = getHWRegIndex(Reg);
663 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
664 Reserved.set(Reg);
665 }
666 }
667 }
668
669 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
670 if (ScratchRSrcReg != AMDGPU::NoRegister) {
671 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
672 // need to spill.
673 // TODO: May need to reserve a VGPR if doing LDS spilling.
674 reserveRegisterTuples(Reserved, ScratchRSrcReg);
675 }
676
677 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
678 if (LongBranchReservedReg)
679 reserveRegisterTuples(Reserved, LongBranchReservedReg);
680
681 // We have to assume the SP is needed in case there are calls in the function,
682 // which is detected after the function is lowered. If we aren't really going
683 // to need SP, don't bother reserving it.
684 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
685 if (StackPtrReg) {
686 reserveRegisterTuples(Reserved, StackPtrReg);
687 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
688 }
689
690 MCRegister FrameReg = MFI->getFrameOffsetReg();
691 if (FrameReg) {
692 reserveRegisterTuples(Reserved, FrameReg);
693 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
694 }
695
696 if (hasBasePointer(MF)) {
697 MCRegister BasePtrReg = getBaseRegister();
698 reserveRegisterTuples(Reserved, BasePtrReg);
699 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
700 }
701
702 // FIXME: Use same reserved register introduced in D149775
703 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
704 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
705 if (ExecCopyReg)
706 reserveRegisterTuples(Reserved, ExecCopyReg);
707
708 // Reserve VGPRs/AGPRs.
709 //
710 auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
711
712 for (const TargetRegisterClass *RC : regclasses()) {
713 if (RC->isBaseClass() && isVGPRClass(RC)) {
714 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
715 for (MCPhysReg Reg : *RC) {
716 unsigned Index = getHWRegIndex(Reg);
717 if (Index + NumRegs > MaxNumVGPRs)
718 Reserved.set(Reg);
719 }
720 }
721 }
722
723 // Reserve all the AGPRs if there are no instructions to use it.
724 if (!ST.hasMAIInsts())
725 MaxNumAGPRs = 0;
726 for (const TargetRegisterClass *RC : regclasses()) {
727 if (RC->isBaseClass() && isAGPRClass(RC)) {
728 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
729 for (MCPhysReg Reg : *RC) {
730 unsigned Index = getHWRegIndex(Reg);
731 if (Index + NumRegs > MaxNumAGPRs)
732 Reserved.set(Reg);
733 }
734 }
735 }
736
737 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
738 // VGPR available at all times.
739 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
740 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
741 }
742
743 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
744 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
745 // wwm-regalloc and it would be empty otherwise.
746 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
747 if (!NonWWMRegMask.empty()) {
748 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
749 RegI < RegE; ++RegI) {
750 if (NonWWMRegMask.test(RegI))
751 reserveRegisterTuples(Reserved, RegI);
752 }
753 }
754
755 for (Register Reg : MFI->getWWMReservedRegs())
756 reserveRegisterTuples(Reserved, Reg);
757
758 // FIXME: Stop using reserved registers for this.
759 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
760 reserveRegisterTuples(Reserved, Reg);
761
762 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
763 reserveRegisterTuples(Reserved, Reg);
764
765 return Reserved;
766}
767
bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
770 return !MF.getRegInfo().isReserved(PhysReg);
771}
772
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't possibly
776 // need any more alignment.
777
778 // FIXME: Should be able to specify the entry frame alignment per calling
779 // convention instead.
780 if (Info->isBottomOfStack())
781 return false;
782
  return TargetRegisterInfo::shouldRealignStack(MF);
}
785
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
789 const MachineFrameInfo &MFI = Fn.getFrameInfo();
790 return MFI.hasStackObjects() || MFI.hasCalls();
791 }
792
793 // May need scavenger for dealing with callee saved registers.
794 return true;
795}
796
bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
799 // Do not use frame virtual registers. They used to be used for SGPRs, but
800 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
801 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
802 // spill.
803 return false;
804}
805
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
808 const MachineFrameInfo &MFI = MF.getFrameInfo();
809 return MFI.hasStackObjects();
810}
811
bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
814 // There are no special dedicated stack or frame pointers.
815 return true;
816}
817
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

821 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
822 AMDGPU::OpName::offset);
823 return MI->getOperand(OffIdx).getImm();
824}
825
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
828 switch (MI->getOpcode()) {
829 case AMDGPU::V_ADD_U32_e32:
830 case AMDGPU::V_ADD_U32_e64:
831 case AMDGPU::V_ADD_CO_U32_e32: {
832 int OtherIdx = Idx == 1 ? 2 : 1;
833 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
834 return OtherOp.isImm() ? OtherOp.getImm() : 0;
835 }
836 case AMDGPU::V_ADD_CO_U32_e64: {
837 int OtherIdx = Idx == 2 ? 3 : 2;
838 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
839 return OtherOp.isImm() ? OtherOp.getImm() : 0;
840 }
841 default:
842 break;
843 }
844
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;
847
848 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
849 AMDGPU::OpName::vaddr) ||
850 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
851 AMDGPU::OpName::saddr))) &&
852 "Should never see frame index on non-address operand");
853
  return getScratchInstrOffset(MI);
}
856
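// Check whether MI is an add of a frame index and either an immediate or a
// VGPR operand.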
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
                              const MachineInstr &MI) {
859 assert(MI.getDesc().isAdd());
860 const MachineOperand &Src0 = MI.getOperand(1);
861 const MachineOperand &Src1 = MI.getOperand(2);
862
863 if (Src0.isFI()) {
864 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
865 Src1.getReg()));
866 }
867
868 if (Src1.isFI()) {
869 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
870 Src0.getReg()));
871 }
872
873 return false;
874}
875
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
878 switch (MI->getOpcode()) {
879 case AMDGPU::V_ADD_U32_e32: {
880 // TODO: We could handle this but it requires work to avoid violating
881 // operand restrictions.
882 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
883 !isFIPlusImmOrVGPR(*this, *MI))
884 return false;
885 [[fallthrough]];
886 }
887 case AMDGPU::V_ADD_U32_e64:
    // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
889 //
890 // Much of the benefit with the MUBUF handling is we avoid duplicating the
891 // shift of the frame register, which isn't needed with scratch.
892 //
893 // materializeFrameBaseRegister doesn't know the register classes of the
894 // uses, and unconditionally uses an s_add_i32, which will end up using a
895 // copy for the vector uses.
896 return !ST.enableFlatScratch();
897 case AMDGPU::V_ADD_CO_U32_e32:
898 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
899 !isFIPlusImmOrVGPR(*this, *MI))
900 return false;
901 // We can't deal with the case where the carry out has a use (though this
902 // should never happen)
903 return MI->getOperand(3).isDead();
904 case AMDGPU::V_ADD_CO_U32_e64:
905 // TODO: Should we check use_empty instead?
906 return MI->getOperand(1).isDead();
907 default:
908 break;
909 }
910
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;
913
914 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
915
916 const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);
919
  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}
923
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"
929
930 if (Ins != MBB->end())
931 DL = Ins->getDebugLoc();
932
  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
937 : AMDGPU::V_MOV_B32_e32;
938
939 Register BaseReg = MRI.createVirtualRegister(
940 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
941 : &AMDGPU::VGPR_32RegClass);
942
943 if (Offset == 0) {
944 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
945 .addFrameIndex(FrameIdx);
946 return BaseReg;
947 }
948
949 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
950
951 Register FIReg = MRI.createVirtualRegister(
952 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
953 : &AMDGPU::VGPR_32RegClass);
954
955 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
956 .addImm(Offset);
957 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
958 .addFrameIndex(FrameIdx);
959
960 if (ST.enableFlatScratch() ) {
961 // FIXME: Make sure scc isn't live in.
962 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
963 .addReg(OffsetReg, RegState::Kill)
964 .addReg(FIReg)
965 .setOperandDead(3); // scc
966 return BaseReg;
967 }
968
969 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
970 .addReg(OffsetReg, RegState::Kill)
971 .addReg(FIReg)
972 .addImm(0); // clamp bit
973
974 return BaseReg;
975}
976
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
979 const SIInstrInfo *TII = ST.getInstrInfo();
980
981 switch (MI.getOpcode()) {
982 case AMDGPU::V_ADD_U32_e32:
983 case AMDGPU::V_ADD_CO_U32_e32: {
984 MachineOperand *FIOp = &MI.getOperand(2);
985 MachineOperand *ImmOp = &MI.getOperand(1);
986 if (!FIOp->isFI())
987 std::swap(FIOp, ImmOp);
988
989 if (!ImmOp->isImm()) {
990 assert(Offset == 0);
991 FIOp->ChangeToRegister(BaseReg, false);
992 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
993 return;
994 }
995
996 int64_t TotalOffset = ImmOp->getImm() + Offset;
997 if (TotalOffset == 0) {
998 MI.setDesc(TII->get(AMDGPU::COPY));
999 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1000 MI.removeOperand(I);
1001
1002 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1003 return;
1004 }
1005
1006 ImmOp->setImm(TotalOffset);
1007
1008 MachineBasicBlock *MBB = MI.getParent();
1009 MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

1012 // FIXME: materializeFrameBaseRegister does not know the register class of
1013 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
1014 // a copy so we have a legal operand and hope the register coalescer can
1015 // clean it up.
1016 if (isSGPRReg(MRI, BaseReg)) {
1017 Register BaseRegVGPR =
1018 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1019 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1020 .addReg(BaseReg);
1021 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1022 } else {
1023 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1024 }
1025 return;
1026 }
1027 case AMDGPU::V_ADD_U32_e64:
1028 case AMDGPU::V_ADD_CO_U32_e64: {
1029 int Src0Idx = MI.getNumExplicitDefs();
1030 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1031 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1032 if (!FIOp->isFI())
1033 std::swap(FIOp, ImmOp);
1034
1035 if (!ImmOp->isImm()) {
1036 FIOp->ChangeToRegister(BaseReg, false);
1037 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1038 return;
1039 }
1040
1041 int64_t TotalOffset = ImmOp->getImm() + Offset;
1042 if (TotalOffset == 0) {
1043 MI.setDesc(TII->get(AMDGPU::COPY));
1044
1045 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1046 MI.removeOperand(I);
1047
1048 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1049 } else {
1050 FIOp->ChangeToRegister(BaseReg, false);
1051 ImmOp->setImm(TotalOffset);
1052 }
1053
1054 return;
1055 }
1056 default:
1057 break;
1058 }
1059
1060 bool IsFlat = TII->isFLATScratch(MI);
1061
1062#ifndef NDEBUG
1063 // FIXME: Is it possible to be storing a frame index to itself?
1064 bool SeenFI = false;
1065 for (const MachineOperand &MO: MI.operands()) {
1066 if (MO.isFI()) {
1067 if (SeenFI)
1068 llvm_unreachable("should not see multiple frame indices");
1069
1070 SeenFI = true;
1071 }
1072 }
1073#endif
1074
1075 MachineOperand *FIOp =
1076 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1077 : AMDGPU::OpName::vaddr);
1078
1079 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1080 int64_t NewOffset = OffsetOp->getImm() + Offset;
1081
1082 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1083 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1084
1085 if (IsFlat) {
1086 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1088 "offset should be legal");
1089 FIOp->ChangeToRegister(BaseReg, false);
1090 OffsetOp->setImm(NewOffset);
1091 return;
1092 }
1093
1094#ifndef NDEBUG
1095 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1096 assert(SOffset->isImm() && SOffset->getImm() == 0);
1097#endif
1098
1099 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1100
1101 FIOp->ChangeToRegister(BaseReg, false);
1102 OffsetOp->setImm(NewOffset);
1103}
1104
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
1107 int64_t Offset) const {
1108
1109 switch (MI->getOpcode()) {
1110 case AMDGPU::V_ADD_U32_e32:
1111 case AMDGPU::V_ADD_CO_U32_e32:
1112 return true;
1113 case AMDGPU::V_ADD_U32_e64:
1114 case AMDGPU::V_ADD_CO_U32_e64:
1116 default:
1117 break;
1118 }
1119
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;
1122
1123 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1124
1125 const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);
1128
  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}
1132
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
1135 // This is inaccurate. It depends on the instruction and address space. The
1136 // only place where we should hit this is for dealing with frame indexes /
1137 // private accesses, so this is correct in that case.
1138 return &AMDGPU::VGPR_32RegClass;
1139}
1140
1141const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1144 return getEquivalentVGPRClass(RC);
1145 if (RC == &AMDGPU::SCC_CLASSRegClass)
1146 return getWaveMaskRegClass();
1147
1148 return RC;
1149}
1150
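// Map a SI_SPILL_* spill/restore pseudo opcode to the number of 32-bit
// sub-registers it covers, e.g. SI_SPILL_V128_SAVE covers four dwords.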
1151static unsigned getNumSubRegsForSpillOp(unsigned Op) {
1152
1153 switch (Op) {
1154 case AMDGPU::SI_SPILL_S1024_SAVE:
1155 case AMDGPU::SI_SPILL_S1024_RESTORE:
1156 case AMDGPU::SI_SPILL_V1024_SAVE:
1157 case AMDGPU::SI_SPILL_V1024_RESTORE:
1158 case AMDGPU::SI_SPILL_A1024_SAVE:
1159 case AMDGPU::SI_SPILL_A1024_RESTORE:
1160 case AMDGPU::SI_SPILL_AV1024_SAVE:
1161 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1162 return 32;
1163 case AMDGPU::SI_SPILL_S512_SAVE:
1164 case AMDGPU::SI_SPILL_S512_RESTORE:
1165 case AMDGPU::SI_SPILL_V512_SAVE:
1166 case AMDGPU::SI_SPILL_V512_RESTORE:
1167 case AMDGPU::SI_SPILL_A512_SAVE:
1168 case AMDGPU::SI_SPILL_A512_RESTORE:
1169 case AMDGPU::SI_SPILL_AV512_SAVE:
1170 case AMDGPU::SI_SPILL_AV512_RESTORE:
1171 return 16;
1172 case AMDGPU::SI_SPILL_S384_SAVE:
1173 case AMDGPU::SI_SPILL_S384_RESTORE:
1174 case AMDGPU::SI_SPILL_V384_SAVE:
1175 case AMDGPU::SI_SPILL_V384_RESTORE:
1176 case AMDGPU::SI_SPILL_A384_SAVE:
1177 case AMDGPU::SI_SPILL_A384_RESTORE:
1178 case AMDGPU::SI_SPILL_AV384_SAVE:
1179 case AMDGPU::SI_SPILL_AV384_RESTORE:
1180 return 12;
1181 case AMDGPU::SI_SPILL_S352_SAVE:
1182 case AMDGPU::SI_SPILL_S352_RESTORE:
1183 case AMDGPU::SI_SPILL_V352_SAVE:
1184 case AMDGPU::SI_SPILL_V352_RESTORE:
1185 case AMDGPU::SI_SPILL_A352_SAVE:
1186 case AMDGPU::SI_SPILL_A352_RESTORE:
1187 case AMDGPU::SI_SPILL_AV352_SAVE:
1188 case AMDGPU::SI_SPILL_AV352_RESTORE:
1189 return 11;
1190 case AMDGPU::SI_SPILL_S320_SAVE:
1191 case AMDGPU::SI_SPILL_S320_RESTORE:
1192 case AMDGPU::SI_SPILL_V320_SAVE:
1193 case AMDGPU::SI_SPILL_V320_RESTORE:
1194 case AMDGPU::SI_SPILL_A320_SAVE:
1195 case AMDGPU::SI_SPILL_A320_RESTORE:
1196 case AMDGPU::SI_SPILL_AV320_SAVE:
1197 case AMDGPU::SI_SPILL_AV320_RESTORE:
1198 return 10;
1199 case AMDGPU::SI_SPILL_S288_SAVE:
1200 case AMDGPU::SI_SPILL_S288_RESTORE:
1201 case AMDGPU::SI_SPILL_V288_SAVE:
1202 case AMDGPU::SI_SPILL_V288_RESTORE:
1203 case AMDGPU::SI_SPILL_A288_SAVE:
1204 case AMDGPU::SI_SPILL_A288_RESTORE:
1205 case AMDGPU::SI_SPILL_AV288_SAVE:
1206 case AMDGPU::SI_SPILL_AV288_RESTORE:
1207 return 9;
1208 case AMDGPU::SI_SPILL_S256_SAVE:
1209 case AMDGPU::SI_SPILL_S256_RESTORE:
1210 case AMDGPU::SI_SPILL_V256_SAVE:
1211 case AMDGPU::SI_SPILL_V256_RESTORE:
1212 case AMDGPU::SI_SPILL_A256_SAVE:
1213 case AMDGPU::SI_SPILL_A256_RESTORE:
1214 case AMDGPU::SI_SPILL_AV256_SAVE:
1215 case AMDGPU::SI_SPILL_AV256_RESTORE:
1216 return 8;
1217 case AMDGPU::SI_SPILL_S224_SAVE:
1218 case AMDGPU::SI_SPILL_S224_RESTORE:
1219 case AMDGPU::SI_SPILL_V224_SAVE:
1220 case AMDGPU::SI_SPILL_V224_RESTORE:
1221 case AMDGPU::SI_SPILL_A224_SAVE:
1222 case AMDGPU::SI_SPILL_A224_RESTORE:
1223 case AMDGPU::SI_SPILL_AV224_SAVE:
1224 case AMDGPU::SI_SPILL_AV224_RESTORE:
1225 return 7;
1226 case AMDGPU::SI_SPILL_S192_SAVE:
1227 case AMDGPU::SI_SPILL_S192_RESTORE:
1228 case AMDGPU::SI_SPILL_V192_SAVE:
1229 case AMDGPU::SI_SPILL_V192_RESTORE:
1230 case AMDGPU::SI_SPILL_A192_SAVE:
1231 case AMDGPU::SI_SPILL_A192_RESTORE:
1232 case AMDGPU::SI_SPILL_AV192_SAVE:
1233 case AMDGPU::SI_SPILL_AV192_RESTORE:
1234 return 6;
1235 case AMDGPU::SI_SPILL_S160_SAVE:
1236 case AMDGPU::SI_SPILL_S160_RESTORE:
1237 case AMDGPU::SI_SPILL_V160_SAVE:
1238 case AMDGPU::SI_SPILL_V160_RESTORE:
1239 case AMDGPU::SI_SPILL_A160_SAVE:
1240 case AMDGPU::SI_SPILL_A160_RESTORE:
1241 case AMDGPU::SI_SPILL_AV160_SAVE:
1242 case AMDGPU::SI_SPILL_AV160_RESTORE:
1243 return 5;
1244 case AMDGPU::SI_SPILL_S128_SAVE:
1245 case AMDGPU::SI_SPILL_S128_RESTORE:
1246 case AMDGPU::SI_SPILL_V128_SAVE:
1247 case AMDGPU::SI_SPILL_V128_RESTORE:
1248 case AMDGPU::SI_SPILL_A128_SAVE:
1249 case AMDGPU::SI_SPILL_A128_RESTORE:
1250 case AMDGPU::SI_SPILL_AV128_SAVE:
1251 case AMDGPU::SI_SPILL_AV128_RESTORE:
1252 return 4;
1253 case AMDGPU::SI_SPILL_S96_SAVE:
1254 case AMDGPU::SI_SPILL_S96_RESTORE:
1255 case AMDGPU::SI_SPILL_V96_SAVE:
1256 case AMDGPU::SI_SPILL_V96_RESTORE:
1257 case AMDGPU::SI_SPILL_A96_SAVE:
1258 case AMDGPU::SI_SPILL_A96_RESTORE:
1259 case AMDGPU::SI_SPILL_AV96_SAVE:
1260 case AMDGPU::SI_SPILL_AV96_RESTORE:
1261 return 3;
1262 case AMDGPU::SI_SPILL_S64_SAVE:
1263 case AMDGPU::SI_SPILL_S64_RESTORE:
1264 case AMDGPU::SI_SPILL_V64_SAVE:
1265 case AMDGPU::SI_SPILL_V64_RESTORE:
1266 case AMDGPU::SI_SPILL_A64_SAVE:
1267 case AMDGPU::SI_SPILL_A64_RESTORE:
1268 case AMDGPU::SI_SPILL_AV64_SAVE:
1269 case AMDGPU::SI_SPILL_AV64_RESTORE:
1270 return 2;
1271 case AMDGPU::SI_SPILL_S32_SAVE:
1272 case AMDGPU::SI_SPILL_S32_RESTORE:
1273 case AMDGPU::SI_SPILL_V32_SAVE:
1274 case AMDGPU::SI_SPILL_V32_RESTORE:
1275 case AMDGPU::SI_SPILL_A32_SAVE:
1276 case AMDGPU::SI_SPILL_A32_RESTORE:
1277 case AMDGPU::SI_SPILL_AV32_SAVE:
1278 case AMDGPU::SI_SPILL_AV32_RESTORE:
1279 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1280 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1281 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1282 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1283 return 1;
1284 default: llvm_unreachable("Invalid spill opcode");
1285 }
1286}
1287
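// Rewrite an OFFEN MUBUF store opcode to its OFFSET (immediate-address-only)
// form, or return -1 if no such variant exists.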
1288static int getOffsetMUBUFStore(unsigned Opc) {
1289 switch (Opc) {
1290 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1291 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1292 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1293 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1294 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1295 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1296 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1297 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1298 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1299 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1300 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1301 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1302 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1303 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1304 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1305 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1306 default:
1307 return -1;
1308 }
1309}
1310
1311static int getOffsetMUBUFLoad(unsigned Opc) {
1312 switch (Opc) {
1313 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1314 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1315 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1316 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1317 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1318 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1319 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1320 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1321 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1322 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1323 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1324 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1325 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1326 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1327 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1328 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1329 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1330 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1331 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1332 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1333 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1334 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1335 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1336 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1337 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1338 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1339 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1340 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1341 default:
1342 return -1;
1343 }
1344}
1345
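// Inverse direction: rewrite an OFFSET MUBUF store opcode to its OFFEN
// (VGPR-address) form, or return -1 if no such variant exists.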
1346static int getOffenMUBUFStore(unsigned Opc) {
1347 switch (Opc) {
1348 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1349 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1350 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1351 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1352 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1353 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1354 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1355 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1356 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1357 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1358 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1359 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1360 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1361 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1362 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1363 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1364 default:
1365 return -1;
1366 }
1367}
1368
1369static int getOffenMUBUFLoad(unsigned Opc) {
1370 switch (Opc) {
1371 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1372 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1373 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1374 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1375 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1376 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1377 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1378 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1379 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1380 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1381 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1382 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1383 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1384 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1385 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1386 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1387 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1388 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1389 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1390 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1391 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1392 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1393 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1394 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1395 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1396 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1397 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1398 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1399 default:
1400 return -1;
1401 }
1402}
1403
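// If the given frame index and lane have been assigned a spill register of
// the other bank, emit a v_accvgpr read/write (or a plain copy when the
// banks already match) instead of a memory access. Returns a null
// MachineInstrBuilder if no register is assigned for this lane.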
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();
1412
1413 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1414
1415 if (Reg == AMDGPU::NoRegister)
1416 return MachineInstrBuilder();
1417
1418 bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1421
1422 unsigned Dst = IsStore ? Reg : ValueReg;
1423 unsigned Src = IsStore ? ValueReg : Reg;
1424 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1425 DebugLoc DL = MI->getDebugLoc();
1426 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1427 // Spiller during regalloc may restore a spilled register to its superclass.
1428 // It could result in AGPR spills restored to VGPRs or the other way around,
1429 // making the src and dst with identical regclasses at this point. It just
1430 // needs a copy in such cases.
1431 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1432 .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
1435 }
1436 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1437 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1438
1439 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1440 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
1443}
1444
1445// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1446// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
1451 int64_t Offset) {
1452 const SIInstrInfo *TII = ST.getInstrInfo();
1453 MachineBasicBlock *MBB = MI->getParent();
1454 const DebugLoc &DL = MI->getDebugLoc();
1455 bool IsStore = MI->mayStore();
1456
1457 unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
1461 return false;
1462
1463 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1464 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1465 return true;
1466
1467 MachineInstrBuilder NewMI =
1468 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1469 .add(*Reg)
1470 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1471 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1472 .addImm(Offset)
1473 .addImm(0) // cpol
1474 .addImm(0) // swz
1475 .cloneMemRefs(*MI);
1476
1477 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1478 AMDGPU::OpName::vdata_in);
1479 if (VDataIn)
1480 NewMI.add(*VDataIn);
1481 return true;
1482}
1483
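// Pick the SADDR flat-scratch opcode matching the element size, then switch
// it to the SV (VGPR address) or ST (no address) addressing form as needed.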
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
1486 unsigned EltSize) {
1487 bool IsStore = TII->get(LoadStoreOp).mayStore();
1488 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1489 bool UseST =
1490 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1491
1492 switch (EltSize) {
1493 case 4:
1494 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1495 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1496 break;
1497 case 8:
1498 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1499 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1500 break;
1501 case 12:
1502 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1503 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1504 break;
1505 case 16:
1506 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1507 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1508 break;
1509 default:
1510 llvm_unreachable("Unexpected spill load/store size!");
1511 }
1512
1513 if (HasVAddr)
1514 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1515 else if (UseST)
1516 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1517
1518 return LoadStoreOp;
1519}
1520
void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1524 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1525 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1526 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1527
  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
1530 const MachineFrameInfo &MFI = MF->getFrameInfo();
1531 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1532
1533 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1534 bool IsStore = Desc->mayStore();
1535 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1536
1537 bool CanClobberSCC = false;
1538 bool Scavenged = false;
1539 MCRegister SOffset = ScratchOffsetReg;
1540
1541 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1542 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1543 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1544 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1545
1546 // Always use 4 byte operations for AGPRs because we need to scavenge
1547 // a temporary VGPR.
1548 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
1549 unsigned NumSubRegs = RegWidth / EltSize;
1550 unsigned Size = NumSubRegs * EltSize;
1551 unsigned RemSize = RegWidth - Size;
1552 unsigned NumRemSubRegs = RemSize ? 1 : 0;
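  // E.g. a 160-bit register spilled via flat scratch: RegWidth = 20 bytes and
  // EltSize = 16, so one dwordx4 access covers the first four dwords and a
  // single remaining dword access (RemSize = 4) covers the tail.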
1553 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1554 int64_t MaterializedOffset = Offset;
1555
1556 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1557 int64_t ScratchOffsetRegDelta = 0;
1558
1559 if (IsFlat && EltSize > 4) {
1560 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1561 Desc = &TII->get(LoadStoreOp);
1562 }
1563
1564 Align Alignment = MFI.getObjectAlign(Index);
1565 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1566
1567 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1568 "unexpected VGPR spill offset");
1569
1570 // Track a VGPR to use for a constant offset we need to materialize.
1571 Register TmpOffsetVGPR;
1572
1573 // Track a VGPR to use as an intermediate value.
1574 Register TmpIntermediateVGPR;
1575 bool UseVGPROffset = false;
1576
1577 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1578 // combination.
1579 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1580 int64_t VOffset) {
1581 // We are using a VGPR offset
1582 if (IsFlat && SGPRBase) {
1583 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1584 // SGPR, so perform the add as vector.
1585 // We don't need a base SGPR in the kernel.
1586
1587 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1588 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1589 .addReg(SGPRBase)
1590 .addImm(VOffset)
1591 .addImm(0); // clamp
1592 } else {
1593 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1594 .addReg(SGPRBase);
1595 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1596 .addImm(VOffset)
1597 .addReg(TmpOffsetVGPR);
1598 }
1599 } else {
1600 assert(TmpOffsetVGPR);
1601 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1602 .addImm(VOffset);
1603 }
1604 };
1605
1606 bool IsOffsetLegal =
1607 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                      SIInstrFlags::FlatScratch)
             : TII->isLegalMUBUFImmOffset(MaxOffset);
1610 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1611 SOffset = MCRegister();
1612
1613 // We don't have access to the register scavenger if this function is called
1614 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1615 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1616 // entry.
1617 if (RS) {
1618 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1619
      // Piggy back on the liveness scan we just did to see if SCC is dead.
1621 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1622 } else if (LiveUnits) {
1623 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1624 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1625 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1626 SOffset = Reg;
1627 break;
1628 }
1629 }
1630 }
1631
1632 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1633 SOffset = Register();
1634
1635 if (!SOffset) {
1636 UseVGPROffset = true;
1637
1638 if (RS) {
1639 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1640 } else {
1641 assert(LiveUnits);
1642 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1643 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1644 TmpOffsetVGPR = Reg;
1645 break;
1646 }
1647 }
1648 }
1649
1650 assert(TmpOffsetVGPR);
1651 } else if (!SOffset && CanClobberSCC) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it remains true until we implement spilling using
      // scalar stores), we have no way to free up an SGPR. Our solution here
      // is to add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
1659
1660 // TODO: If we don't have to do an emergency stack slot spill, converting
1661 // to use the VGPR offset is fewer instructions.
1662 if (!ScratchOffsetReg)
1663 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1664 SOffset = ScratchOffsetReg;
1665 ScratchOffsetRegDelta = Offset;
1666 } else {
1667 Scavenged = true;
1668 }
1669
1670 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1671 // we can simplify the adjustment of Offset here to just scale with
1672 // WavefrontSize.
1673 if (!IsFlat && !UseVGPROffset)
1674 Offset *= ST.getWavefrontSize();
1675
1676 if (!UseVGPROffset && !SOffset)
1677 report_fatal_error("could not scavenge SGPR to spill in entry function");
1678
1679 if (UseVGPROffset) {
1680 // We are using a VGPR offset
1681 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1682 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1683 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1684 } else {
1685 assert(Offset != 0);
1686 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1687 .addReg(ScratchOffsetReg)
1688 .addImm(Offset);
1689 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1690 }
1691
1692 Offset = 0;
1693 }
1694
1695 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1696 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1697 && "Unexpected vaddr for flat scratch with a FI operand");
1698
1699 if (UseVGPROffset) {
1700 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1701 } else {
      assert(ST.hasFlatScratchSTMode());
      LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1704 }
1705
1706 Desc = &TII->get(LoadStoreOp);
1707 }
1708
1709 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1710 ++i, RegOffset += EltSize) {
1711 if (i == NumSubRegs) {
1712 EltSize = RemSize;
1713 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1714 }
1715 Desc = &TII->get(LoadStoreOp);
1716
1717 if (!IsFlat && UseVGPROffset) {
1718 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1719 : getOffenMUBUFLoad(LoadStoreOp);
1720 Desc = &TII->get(NewLoadStoreOp);
1721 }
1722
1723 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1724      // If we are spilling an AGPR beyond the range of the memory instruction
1725      // offset and need to use a VGPR offset, we ideally have at least 2
1726      // scratch VGPRs. If we don't have a second free VGPR without spilling,
1727      // recycle the VGPR used for the offset, which requires re-materializing
1728      // the offset after each subregister.
1729
1730 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1731 }
1732
1733 unsigned NumRegs = EltSize / 4;
1734 Register SubReg = e == 1
1735 ? ValueReg
1736 : Register(getSubReg(ValueReg,
1737 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1738
1739 unsigned SOffsetRegState = 0;
1740 unsigned SrcDstRegState = getDefRegState(!IsStore);
1741 const bool IsLastSubReg = i + 1 == e;
1742 const bool IsFirstSubReg = i == 0;
1743 if (IsLastSubReg) {
1744 SOffsetRegState |= getKillRegState(Scavenged);
1745 // The last implicit use carries the "Kill" flag.
1746 SrcDstRegState |= getKillRegState(IsKill);
1747 }
1748
1749 // Make sure the whole register is defined if there are undef components by
1750 // adding an implicit def of the super-reg on the first instruction.
1751 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1752 bool NeedSuperRegImpOperand = e > 1;
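    // For example (illustrative only): when storing the first dword of a
    // v[0:3] spill, the store also carries an "implicit-def" of the whole
    // super-register so that the later sub-register stores treat it as fully
    // defined even if some components were undef.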
1753
1754    // Remaining element size to spill into memory after some parts of it
1755    // were spilled into either AGPRs or VGPRs.
1756 unsigned RemEltSize = EltSize;
1757
1758    // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1759    // order, starting from the last lane. If a register cannot be completely
1760    // spilled into another register, this ensures its alignment does not
1761    // change. For targets with a VGPR alignment requirement this is important
1762    // when flat scratch is used, as we might otherwise emit a scratch_load or
1763    // scratch_store of an unaligned register.
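    // For example (a sketch, assuming a target that requires even VGPR
    // alignment for multi-dword accesses): when spilling a 96-bit register
    // v[2:4] with only one free AGPR, taking the last lane (v4) leaves the
    // aligned pair v[2:3] for the memory access, whereas taking the first
    // lane (v2) would leave the misaligned pair v[3:4].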
1764 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1765 LaneE = RegOffset / 4;
1766 Lane >= LaneE; --Lane) {
1767 bool IsSubReg = e > 1 || EltSize > 4;
1768 Register Sub = IsSubReg
1769 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1770 : ValueReg;
1771 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1772 if (!MIB.getInstr())
1773 break;
1774 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1775 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1776 NeedSuperRegDef = false;
1777 }
1778 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1779 NeedSuperRegImpOperand = true;
1780 unsigned State = SrcDstRegState;
1781 if (!IsLastSubReg || (Lane != LaneE))
1782 State &= ~RegState::Kill;
1783 if (!IsFirstSubReg || (Lane != LaneS))
1784 State &= ~RegState::Define;
1785 MIB.addReg(ValueReg, RegState::Implicit | State);
1786 }
1787 RemEltSize -= 4;
1788 }
1789
1790 if (!RemEltSize) // Fully spilled into AGPRs.
1791 continue;
1792
1793 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1794 assert(IsFlat && EltSize > 4);
1795
1796 unsigned NumRegs = RemEltSize / 4;
1797 SubReg = Register(getSubReg(ValueReg,
1798 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1799 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1800 Desc = &TII->get(Opc);
1801 }
1802
1803 unsigned FinalReg = SubReg;
1804
1805 if (IsAGPR) {
1806 assert(EltSize == 4);
1807
1808 if (!TmpIntermediateVGPR) {
1809 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1810 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1811 }
1812 if (IsStore) {
1813 auto AccRead = BuildMI(MBB, MI, DL,
1814 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1815 TmpIntermediateVGPR)
1816 .addReg(SubReg, getKillRegState(IsKill));
1817 if (NeedSuperRegDef)
1818 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1819 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1820 AccRead.addReg(ValueReg, RegState::Implicit);
1822 }
1823 SubReg = TmpIntermediateVGPR;
1824 } else if (UseVGPROffset) {
1825 if (!TmpOffsetVGPR) {
1826 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1827 MI, false, 0);
1828 RS->setRegUsed(TmpOffsetVGPR);
1829 }
1830 }
1831
1832 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1833 MachineMemOperand *NewMMO =
1834 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1835 commonAlignment(Alignment, RegOffset));
1836
1837 auto MIB =
1838 BuildMI(MBB, MI, DL, *Desc)
1839 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1840
1841 if (UseVGPROffset) {
1842 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1843 // intermediate accvgpr_write.
1844 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1845 }
1846
1847 if (!IsFlat)
1848 MIB.addReg(FuncInfo->getScratchRSrcReg());
1849
1850 if (SOffset == AMDGPU::NoRegister) {
1851 if (!IsFlat) {
1852 if (UseVGPROffset && ScratchOffsetReg) {
1853 MIB.addReg(ScratchOffsetReg);
1854 } else {
1855 assert(FuncInfo->isBottomOfStack());
1856 MIB.addImm(0);
1857 }
1858 }
1859 } else {
1860 MIB.addReg(SOffset, SOffsetRegState);
1861 }
1862
1863 MIB.addImm(Offset + RegOffset);
1864
1865 bool LastUse = MMO->getFlags() & MOLastUse;
1866 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1867
1868 if (!IsFlat)
1869 MIB.addImm(0); // swz
1870 MIB.addMemOperand(NewMMO);
1871
1872 if (!IsAGPR && NeedSuperRegDef)
1873 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1874
1875 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1876 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1877 FinalReg)
1878 .addReg(TmpIntermediateVGPR, RegState::Kill);
1880 }
1881
1882 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1883 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1884
1885    // The epilog restore of a wwm-scratch register can cause an undesired
1886    // optimization during machine-cp after PrologEpilogInserter if the same
1887    // register was also assigned for return value ABI lowering with a COPY
1888    // instruction. As shown below, with the epilog reload present, the earlier
1889    // COPY appears to be dead during machine-cp.
1890 // ...
1891 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1892 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1893 // ...
1894 // Epilog block:
1895 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1896 // ...
1897 // WWM spill restore to preserve the inactive lanes of v0.
1898 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1899 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1900 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1901 // ...
1902 // SI_RETURN implicit $vgpr0
1903 // ...
1904    // To fix it, mark the same reg as a tied op for such restore instructions
1905    // so that the restore counts as a use for the preceding COPY.
1906 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1907 MI->readsRegister(SubReg, this)) {
1908 MIB.addReg(SubReg, RegState::Implicit);
1909 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1910 }
1911 }
1912
1913 if (ScratchOffsetRegDelta != 0) {
1914 // Subtract the offset we added to the ScratchOffset register.
1915 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1916 .addReg(SOffset)
1917 .addImm(-ScratchOffsetRegDelta);
1918 }
1919}
1920
1922 int Offset, bool IsLoad,
1923 bool IsKill) const {
1924 // Load/store VGPR
1925 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1926 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1927
1928 Register FrameReg =
1929 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1930 ? getBaseRegister()
1931 : getFrameRegister(SB.MF);
1932
1933 Align Alignment = FrameInfo.getObjectAlign(Index);
1937 SB.EltSize, Alignment);
1938
1939 if (IsLoad) {
1940 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1941 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1942 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1943 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1944 } else {
1945 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1946 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1947 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1948 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1949 // This only ever adds one VGPR spill
1950 SB.MFI.addToSpilledVGPRs(1);
1951 }
1952}
1953
1954bool SIRegisterInfo::spillSGPR(MachineInstr *MI, int Index,
1955                               RegScavenger *RS, SlotIndexes *Indexes,
1956 LiveIntervals *LIS, bool OnlyToVGPR,
1957 bool SpillToPhysVGPRLane) const {
1958 assert(!MI->getOperand(0).isUndef() &&
1959 "undef spill should have been deleted earlier");
1960
1961 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1962
1963 ArrayRef<SpilledReg> VGPRSpills =
1964 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1965                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1966  bool SpillToVGPR = !VGPRSpills.empty();
1967 if (OnlyToVGPR && !SpillToVGPR)
1968 return false;
1969
1970 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1971 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1972
1973 if (SpillToVGPR) {
1974
1975    // Since the stack slot coloring pass tries to optimize SGPR spills,
1976    // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
1977    // spills of different sizes. The number of VGPR lanes allotted equals
1978    // that of the largest SGPR being spilled into them.
1979 assert(SB.NumSubRegs <= VGPRSpills.size() &&
1980 "Num of SGPRs spilled should be less than or equal to num of "
1981 "the VGPR lanes.");
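    // For example (illustrative): if stack slot coloring merges a 64-bit and a
    // 128-bit SGPR spill into the same slot, 4 VGPR lanes are allotted to that
    // slot and the 64-bit spill simply uses the first two of them.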
1982
1983 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1984      Register SubReg =
1985          SB.NumSubRegs == 1
1986 ? SB.SuperReg
1987 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1988 SpilledReg Spill = VGPRSpills[i];
1989
1990 bool IsFirstSubreg = i == 0;
1991 bool IsLastSubreg = i == SB.NumSubRegs - 1;
1992 bool UseKill = SB.IsKill && IsLastSubreg;
1993
1994
1995 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1996 // spill to this specific vgpr in the first basic block.
1997 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1998 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1999 .addReg(SubReg, getKillRegState(UseKill))
2000 .addImm(Spill.Lane)
2001 .addReg(Spill.VGPR);
2002 if (Indexes) {
2003 if (IsFirstSubreg)
2004 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2005 else
2006 Indexes->insertMachineInstrInMaps(*MIB);
2007 }
2008
2009 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2010 // We may be spilling a super-register which is only partially defined,
2011 // and need to ensure later spills think the value is defined.
2012 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2013 }
2014
2015 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2016 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2017
2018 // FIXME: Since this spills to another register instead of an actual
2019 // frame index, we should delete the frame index when all references to
2020 // it are fixed.
2021 }
2022 } else {
2023 SB.prepare();
2024
2025 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2026 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2027
2028 // Per VGPR helper data
2029 auto PVD = SB.getPerVGPRData();
2030
2031 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2032 unsigned TmpVGPRFlags = RegState::Undef;
2033
2034 // Write sub registers into the VGPR
2035 for (unsigned i = Offset * PVD.PerVGPR,
2036 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2037 i < e; ++i) {
2038        Register SubReg =
2039            SB.NumSubRegs == 1
2040 ? SB.SuperReg
2041 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2042
2043 MachineInstrBuilder WriteLane =
2044 BuildMI(*SB.MBB, MI, SB.DL,
2045 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2046 .addReg(SubReg, SubKillState)
2047 .addImm(i % PVD.PerVGPR)
2048 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2049 TmpVGPRFlags = 0;
2050
2051 if (Indexes) {
2052 if (i == 0)
2053 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2054 else
2055 Indexes->insertMachineInstrInMaps(*WriteLane);
2056 }
2057
2058 // There could be undef components of a spilled super register.
2059 // TODO: Can we detect this and skip the spill?
2060 if (SB.NumSubRegs > 1) {
2061 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2062 unsigned SuperKillState = 0;
2063 if (i + 1 == SB.NumSubRegs)
2064 SuperKillState |= getKillRegState(SB.IsKill);
2065 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2066 }
2067 }
2068
2069 // Write out VGPR
2070 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2071 }
2072
2073 SB.restore();
2074 }
2075
2076 MI->eraseFromParent();
2078
2079 if (LIS)
2080    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2081
2082 return true;
2083}
2084
2085bool SIRegisterInfo::restoreSGPR(MachineInstr *MI, int Index,
2086                                 RegScavenger *RS, SlotIndexes *Indexes,
2087 LiveIntervals *LIS, bool OnlyToVGPR,
2088 bool SpillToPhysVGPRLane) const {
2089 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2090
2091 ArrayRef<SpilledReg> VGPRSpills =
2092 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2093                          : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2094  bool SpillToVGPR = !VGPRSpills.empty();
2095 if (OnlyToVGPR && !SpillToVGPR)
2096 return false;
2097
2098 if (SpillToVGPR) {
2099 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2100      Register SubReg =
2101          SB.NumSubRegs == 1
2102 ? SB.SuperReg
2103 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2104
2105 SpilledReg Spill = VGPRSpills[i];
2106 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2107 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2108 .addReg(Spill.VGPR)
2109 .addImm(Spill.Lane);
2110 if (SB.NumSubRegs > 1 && i == 0)
2111        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2112      if (Indexes) {
2113 if (i == e - 1)
2114 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2115 else
2116 Indexes->insertMachineInstrInMaps(*MIB);
2117 }
2118 }
2119 } else {
2120 SB.prepare();
2121
2122 // Per VGPR helper data
2123 auto PVD = SB.getPerVGPRData();
2124
2125 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2126 // Load in VGPR data
2127 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2128
2129 // Unpack lanes
2130 for (unsigned i = Offset * PVD.PerVGPR,
2131 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2132 i < e; ++i) {
2133        Register SubReg =
2134            SB.NumSubRegs == 1
2135 ? SB.SuperReg
2136 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2137
2138 bool LastSubReg = (i + 1 == e);
2139 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2140 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2141 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2142 .addImm(i);
2143 if (SB.NumSubRegs > 1 && i == 0)
2144          MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2145        if (Indexes) {
2146 if (i == e - 1)
2147 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2148 else
2149 Indexes->insertMachineInstrInMaps(*MIB);
2150 }
2151 }
2152 }
2153
2154 SB.restore();
2155 }
2156
2157 MI->eraseFromParent();
2158
2159 if (LIS)
2160    LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2161
2162 return true;
2163}
2164
2166 MachineBasicBlock &RestoreMBB,
2167 Register SGPR, RegScavenger *RS) const {
2168 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2169 RS);
2170 SB.prepare();
2171 // Generate the spill of SGPR to SB.TmpVGPR.
2172 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2173 auto PVD = SB.getPerVGPRData();
2174 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2175 unsigned TmpVGPRFlags = RegState::Undef;
2176 // Write sub registers into the VGPR
2177 for (unsigned i = Offset * PVD.PerVGPR,
2178 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2179 i < e; ++i) {
2180      Register SubReg =
2181          SB.NumSubRegs == 1
2182 ? SB.SuperReg
2183 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2184
2185 MachineInstrBuilder WriteLane =
2186 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2187 SB.TmpVGPR)
2188 .addReg(SubReg, SubKillState)
2189 .addImm(i % PVD.PerVGPR)
2190 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2191 TmpVGPRFlags = 0;
2192 // There could be undef components of a spilled super register.
2193 // TODO: Can we detect this and skip the spill?
2194 if (SB.NumSubRegs > 1) {
2195 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2196 unsigned SuperKillState = 0;
2197 if (i + 1 == SB.NumSubRegs)
2198 SuperKillState |= getKillRegState(SB.IsKill);
2199 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2200 }
2201 }
2202 // Don't need to write VGPR out.
2203 }
2204
2205 // Restore clobbered registers in the specified restore block.
2206 MI = RestoreMBB.end();
2207 SB.setMI(&RestoreMBB, MI);
2208 // Generate the restore of SGPR from SB.TmpVGPR.
2209 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2210 // Don't need to load VGPR in.
2211 // Unpack lanes
2212 for (unsigned i = Offset * PVD.PerVGPR,
2213 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2214 i < e; ++i) {
2215      Register SubReg =
2216          SB.NumSubRegs == 1
2217 ? SB.SuperReg
2218 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2219 bool LastSubReg = (i + 1 == e);
2220 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2221 SubReg)
2222 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2223 .addImm(i);
2224 if (SB.NumSubRegs > 1 && i == 0)
2225        MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2226    }
2227 }
2228 SB.restore();
2229
2231 return false;
2232}
2233
2234/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2235/// a VGPR and the stack slot can be safely eliminated when all other users are
2236/// handled.
2239 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2240 switch (MI->getOpcode()) {
2241 case AMDGPU::SI_SPILL_S1024_SAVE:
2242 case AMDGPU::SI_SPILL_S512_SAVE:
2243 case AMDGPU::SI_SPILL_S384_SAVE:
2244 case AMDGPU::SI_SPILL_S352_SAVE:
2245 case AMDGPU::SI_SPILL_S320_SAVE:
2246 case AMDGPU::SI_SPILL_S288_SAVE:
2247 case AMDGPU::SI_SPILL_S256_SAVE:
2248 case AMDGPU::SI_SPILL_S224_SAVE:
2249 case AMDGPU::SI_SPILL_S192_SAVE:
2250 case AMDGPU::SI_SPILL_S160_SAVE:
2251 case AMDGPU::SI_SPILL_S128_SAVE:
2252 case AMDGPU::SI_SPILL_S96_SAVE:
2253 case AMDGPU::SI_SPILL_S64_SAVE:
2254 case AMDGPU::SI_SPILL_S32_SAVE:
2255 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2256 case AMDGPU::SI_SPILL_S1024_RESTORE:
2257 case AMDGPU::SI_SPILL_S512_RESTORE:
2258 case AMDGPU::SI_SPILL_S384_RESTORE:
2259 case AMDGPU::SI_SPILL_S352_RESTORE:
2260 case AMDGPU::SI_SPILL_S320_RESTORE:
2261 case AMDGPU::SI_SPILL_S288_RESTORE:
2262 case AMDGPU::SI_SPILL_S256_RESTORE:
2263 case AMDGPU::SI_SPILL_S224_RESTORE:
2264 case AMDGPU::SI_SPILL_S192_RESTORE:
2265 case AMDGPU::SI_SPILL_S160_RESTORE:
2266 case AMDGPU::SI_SPILL_S128_RESTORE:
2267 case AMDGPU::SI_SPILL_S96_RESTORE:
2268 case AMDGPU::SI_SPILL_S64_RESTORE:
2269 case AMDGPU::SI_SPILL_S32_RESTORE:
2270 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2271 default:
2272 llvm_unreachable("not an SGPR spill instruction");
2273 }
2274}
2275
2276bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2277                                         int SPAdj, unsigned FIOperandNum,
2278 RegScavenger *RS) const {
2279 MachineFunction *MF = MI->getParent()->getParent();
2280 MachineBasicBlock *MBB = MI->getParent();
2281  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2282  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2283 const SIInstrInfo *TII = ST.getInstrInfo();
2284 const DebugLoc &DL = MI->getDebugLoc();
2285
2286 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2287
2288  assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2289         "unreserved scratch RSRC register");
2290
2291 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2292 int Index = MI->getOperand(FIOperandNum).getIndex();
2293
2294 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2295 ? getBaseRegister()
2296 : getFrameRegister(*MF);
2297
2298 switch (MI->getOpcode()) {
2299 // SGPR register spill
2300 case AMDGPU::SI_SPILL_S1024_SAVE:
2301 case AMDGPU::SI_SPILL_S512_SAVE:
2302 case AMDGPU::SI_SPILL_S384_SAVE:
2303 case AMDGPU::SI_SPILL_S352_SAVE:
2304 case AMDGPU::SI_SPILL_S320_SAVE:
2305 case AMDGPU::SI_SPILL_S288_SAVE:
2306 case AMDGPU::SI_SPILL_S256_SAVE:
2307 case AMDGPU::SI_SPILL_S224_SAVE:
2308 case AMDGPU::SI_SPILL_S192_SAVE:
2309 case AMDGPU::SI_SPILL_S160_SAVE:
2310 case AMDGPU::SI_SPILL_S128_SAVE:
2311 case AMDGPU::SI_SPILL_S96_SAVE:
2312 case AMDGPU::SI_SPILL_S64_SAVE:
2313 case AMDGPU::SI_SPILL_S32_SAVE: {
2314 return spillSGPR(MI, Index, RS);
2315 }
2316
2317 // SGPR register restore
2318 case AMDGPU::SI_SPILL_S1024_RESTORE:
2319 case AMDGPU::SI_SPILL_S512_RESTORE:
2320 case AMDGPU::SI_SPILL_S384_RESTORE:
2321 case AMDGPU::SI_SPILL_S352_RESTORE:
2322 case AMDGPU::SI_SPILL_S320_RESTORE:
2323 case AMDGPU::SI_SPILL_S288_RESTORE:
2324 case AMDGPU::SI_SPILL_S256_RESTORE:
2325 case AMDGPU::SI_SPILL_S224_RESTORE:
2326 case AMDGPU::SI_SPILL_S192_RESTORE:
2327 case AMDGPU::SI_SPILL_S160_RESTORE:
2328 case AMDGPU::SI_SPILL_S128_RESTORE:
2329 case AMDGPU::SI_SPILL_S96_RESTORE:
2330 case AMDGPU::SI_SPILL_S64_RESTORE:
2331 case AMDGPU::SI_SPILL_S32_RESTORE: {
2332 return restoreSGPR(MI, Index, RS);
2333 }
2334
2335 // VGPR register spill
2336 case AMDGPU::SI_SPILL_V1024_SAVE:
2337 case AMDGPU::SI_SPILL_V512_SAVE:
2338 case AMDGPU::SI_SPILL_V384_SAVE:
2339 case AMDGPU::SI_SPILL_V352_SAVE:
2340 case AMDGPU::SI_SPILL_V320_SAVE:
2341 case AMDGPU::SI_SPILL_V288_SAVE:
2342 case AMDGPU::SI_SPILL_V256_SAVE:
2343 case AMDGPU::SI_SPILL_V224_SAVE:
2344 case AMDGPU::SI_SPILL_V192_SAVE:
2345 case AMDGPU::SI_SPILL_V160_SAVE:
2346 case AMDGPU::SI_SPILL_V128_SAVE:
2347 case AMDGPU::SI_SPILL_V96_SAVE:
2348 case AMDGPU::SI_SPILL_V64_SAVE:
2349 case AMDGPU::SI_SPILL_V32_SAVE:
2350 case AMDGPU::SI_SPILL_A1024_SAVE:
2351 case AMDGPU::SI_SPILL_A512_SAVE:
2352 case AMDGPU::SI_SPILL_A384_SAVE:
2353 case AMDGPU::SI_SPILL_A352_SAVE:
2354 case AMDGPU::SI_SPILL_A320_SAVE:
2355 case AMDGPU::SI_SPILL_A288_SAVE:
2356 case AMDGPU::SI_SPILL_A256_SAVE:
2357 case AMDGPU::SI_SPILL_A224_SAVE:
2358 case AMDGPU::SI_SPILL_A192_SAVE:
2359 case AMDGPU::SI_SPILL_A160_SAVE:
2360 case AMDGPU::SI_SPILL_A128_SAVE:
2361 case AMDGPU::SI_SPILL_A96_SAVE:
2362 case AMDGPU::SI_SPILL_A64_SAVE:
2363 case AMDGPU::SI_SPILL_A32_SAVE:
2364 case AMDGPU::SI_SPILL_AV1024_SAVE:
2365 case AMDGPU::SI_SPILL_AV512_SAVE:
2366 case AMDGPU::SI_SPILL_AV384_SAVE:
2367 case AMDGPU::SI_SPILL_AV352_SAVE:
2368 case AMDGPU::SI_SPILL_AV320_SAVE:
2369 case AMDGPU::SI_SPILL_AV288_SAVE:
2370 case AMDGPU::SI_SPILL_AV256_SAVE:
2371 case AMDGPU::SI_SPILL_AV224_SAVE:
2372 case AMDGPU::SI_SPILL_AV192_SAVE:
2373 case AMDGPU::SI_SPILL_AV160_SAVE:
2374 case AMDGPU::SI_SPILL_AV128_SAVE:
2375 case AMDGPU::SI_SPILL_AV96_SAVE:
2376 case AMDGPU::SI_SPILL_AV64_SAVE:
2377 case AMDGPU::SI_SPILL_AV32_SAVE:
2378 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2379 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2380 const MachineOperand *VData = TII->getNamedOperand(*MI,
2381 AMDGPU::OpName::vdata);
2382 if (VData->isUndef()) {
2383 MI->eraseFromParent();
2384 return true;
2385 }
2386
2387 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2388 MFI->getStackPtrOffsetReg());
2389
2390 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2391 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2392 auto *MBB = MI->getParent();
2393 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2394 if (IsWWMRegSpill) {
2395 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2396 RS->isRegUsed(AMDGPU::SCC));
2397 }
2398    buildSpillLoadStore(
2399        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2400 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2401 *MI->memoperands_begin(), RS);
2402 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2403 if (IsWWMRegSpill)
2404 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2405
2406 MI->eraseFromParent();
2407 return true;
2408 }
2409 case AMDGPU::SI_SPILL_V32_RESTORE:
2410 case AMDGPU::SI_SPILL_V64_RESTORE:
2411 case AMDGPU::SI_SPILL_V96_RESTORE:
2412 case AMDGPU::SI_SPILL_V128_RESTORE:
2413 case AMDGPU::SI_SPILL_V160_RESTORE:
2414 case AMDGPU::SI_SPILL_V192_RESTORE:
2415 case AMDGPU::SI_SPILL_V224_RESTORE:
2416 case AMDGPU::SI_SPILL_V256_RESTORE:
2417 case AMDGPU::SI_SPILL_V288_RESTORE:
2418 case AMDGPU::SI_SPILL_V320_RESTORE:
2419 case AMDGPU::SI_SPILL_V352_RESTORE:
2420 case AMDGPU::SI_SPILL_V384_RESTORE:
2421 case AMDGPU::SI_SPILL_V512_RESTORE:
2422 case AMDGPU::SI_SPILL_V1024_RESTORE:
2423 case AMDGPU::SI_SPILL_A32_RESTORE:
2424 case AMDGPU::SI_SPILL_A64_RESTORE:
2425 case AMDGPU::SI_SPILL_A96_RESTORE:
2426 case AMDGPU::SI_SPILL_A128_RESTORE:
2427 case AMDGPU::SI_SPILL_A160_RESTORE:
2428 case AMDGPU::SI_SPILL_A192_RESTORE:
2429 case AMDGPU::SI_SPILL_A224_RESTORE:
2430 case AMDGPU::SI_SPILL_A256_RESTORE:
2431 case AMDGPU::SI_SPILL_A288_RESTORE:
2432 case AMDGPU::SI_SPILL_A320_RESTORE:
2433 case AMDGPU::SI_SPILL_A352_RESTORE:
2434 case AMDGPU::SI_SPILL_A384_RESTORE:
2435 case AMDGPU::SI_SPILL_A512_RESTORE:
2436 case AMDGPU::SI_SPILL_A1024_RESTORE:
2437 case AMDGPU::SI_SPILL_AV32_RESTORE:
2438 case AMDGPU::SI_SPILL_AV64_RESTORE:
2439 case AMDGPU::SI_SPILL_AV96_RESTORE:
2440 case AMDGPU::SI_SPILL_AV128_RESTORE:
2441 case AMDGPU::SI_SPILL_AV160_RESTORE:
2442 case AMDGPU::SI_SPILL_AV192_RESTORE:
2443 case AMDGPU::SI_SPILL_AV224_RESTORE:
2444 case AMDGPU::SI_SPILL_AV256_RESTORE:
2445 case AMDGPU::SI_SPILL_AV288_RESTORE:
2446 case AMDGPU::SI_SPILL_AV320_RESTORE:
2447 case AMDGPU::SI_SPILL_AV352_RESTORE:
2448 case AMDGPU::SI_SPILL_AV384_RESTORE:
2449 case AMDGPU::SI_SPILL_AV512_RESTORE:
2450 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2451 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2452 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2453 const MachineOperand *VData = TII->getNamedOperand(*MI,
2454 AMDGPU::OpName::vdata);
2455 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2456 MFI->getStackPtrOffsetReg());
2457
2458 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2459 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2460 auto *MBB = MI->getParent();
2461 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2462 if (IsWWMRegSpill) {
2463 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2464 RS->isRegUsed(AMDGPU::SCC));
2465 }
2466
2467    buildSpillLoadStore(
2468        *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2469 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2470 *MI->memoperands_begin(), RS);
2471
2472 if (IsWWMRegSpill)
2473 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2474
2475 MI->eraseFromParent();
2476 return true;
2477 }
2478 case AMDGPU::V_ADD_U32_e32:
2479 case AMDGPU::V_ADD_U32_e64:
2480 case AMDGPU::V_ADD_CO_U32_e32:
2481 case AMDGPU::V_ADD_CO_U32_e64: {
2482 // TODO: Handle sub, and, or.
2483 unsigned NumDefs = MI->getNumExplicitDefs();
2484 unsigned Src0Idx = NumDefs;
2485
2486 bool HasClamp = false;
2487 MachineOperand *VCCOp = nullptr;
2488
2489 switch (MI->getOpcode()) {
2490 case AMDGPU::V_ADD_U32_e32:
2491 break;
2492 case AMDGPU::V_ADD_U32_e64:
2493 HasClamp = MI->getOperand(3).getImm();
2494 break;
2495 case AMDGPU::V_ADD_CO_U32_e32:
2496 VCCOp = &MI->getOperand(3);
2497 break;
2498 case AMDGPU::V_ADD_CO_U32_e64:
2499 VCCOp = &MI->getOperand(1);
2500 HasClamp = MI->getOperand(4).getImm();
2501 break;
2502 default:
2503 break;
2504 }
2505 bool DeadVCC = !VCCOp || VCCOp->isDead();
2506 MachineOperand &DstOp = MI->getOperand(0);
2507 Register DstReg = DstOp.getReg();
2508
2509 unsigned OtherOpIdx =
2510 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2511 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2512
2513 unsigned Src1Idx = Src0Idx + 1;
2514 Register MaterializedReg = FrameReg;
2515 Register ScavengedVGPR;
2516
2517 int64_t Offset = FrameInfo.getObjectOffset(Index);
2518 // For the non-immediate case, we could fall through to the default
2519 // handling, but we do an in-place update of the result register here to
2520 // avoid scavenging another register.
2521 if (OtherOp->isImm()) {
2522 int64_t TotalOffset = OtherOp->getImm() + Offset;
2523
2524 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2525 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2526 // If we can't support a VOP3 literal in the VALU instruction, we
2527 // can't specially fold into the add.
2528 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2529 break;
2530 }
2531
2532 OtherOp->setImm(TotalOffset);
2533 Offset = 0;
2534 }
2535
2536 if (FrameReg && !ST.enableFlatScratch()) {
2537 // We should just do an in-place update of the result register. However,
2538 // the value there may also be used by the add, in which case we need a
2539 // temporary register.
2540 //
2541 // FIXME: The scavenger is not finding the result register in the
2542 // common case where the add does not read the register.
2543
2544 ScavengedVGPR = RS->scavengeRegisterBackwards(
2545 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2546
2547 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2548 // shift.
2549 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2550 .addDef(ScavengedVGPR, RegState::Renamable)
2551        .addImm(ST.getWavefrontSizeLog2())
2552        .addReg(FrameReg);
2553 MaterializedReg = ScavengedVGPR;
2554 }
2555
2556 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2557 if (ST.enableFlatScratch() &&
2558 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2559 // We didn't need the shift above, so we have an SGPR for the frame
2560 // register, but may have a VGPR only operand.
2561 //
2562 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2563 // and use the higher constant bus restriction to avoid this copy.
2564
2565 if (!ScavengedVGPR) {
2566 ScavengedVGPR = RS->scavengeRegisterBackwards(
2567 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2568 /*SPAdj=*/0);
2569 }
2570
2571 assert(ScavengedVGPR != DstReg);
2572
2573 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2574 .addReg(MaterializedReg,
2575 MaterializedReg != FrameReg ? RegState::Kill : 0);
2576 MaterializedReg = ScavengedVGPR;
2577 }
2578
2579 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2580 // is not live, we could use a scalar add + vector add instead of 2
2581 // vector adds.
2582 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2583 .addDef(DstReg, RegState::Renamable);
2584 if (NumDefs == 2)
2585 AddI32.add(MI->getOperand(1));
2586
2587 unsigned MaterializedRegFlags =
2588 MaterializedReg != FrameReg ? RegState::Kill : 0;
2589
2590 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2591 // If we know we have a VGPR already, it's more likely the other
2592 // operand is a legal vsrc0.
2593 AddI32
2594 .add(*OtherOp)
2595 .addReg(MaterializedReg, MaterializedRegFlags);
2596 } else {
2597 // Commute operands to avoid violating VOP2 restrictions. This will
2598 // typically happen when using scratch.
2599 AddI32
2600 .addReg(MaterializedReg, MaterializedRegFlags)
2601 .add(*OtherOp);
2602 }
2603
2604 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2605 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2606 AddI32.addImm(0); // clamp
2607
2608 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2609 AddI32.setOperandDead(3); // Dead vcc
2610
2611 MaterializedReg = DstReg;
2612
2613 OtherOp->ChangeToRegister(MaterializedReg, false);
2614 OtherOp->setIsKill(true);
2616 Offset = 0;
2617 } else if (Offset != 0) {
2618 assert(!MaterializedReg);
2620 Offset = 0;
2621 } else {
2622 if (DeadVCC && !HasClamp) {
2623 assert(Offset == 0);
2624
2625 // TODO: Losing kills and implicit operands. Just mutate to copy and
2626 // let lowerCopy deal with it?
2627 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2628 // Folded to an identity copy.
2629 MI->eraseFromParent();
2630 return true;
2631 }
2632
2633 // The immediate value should be in OtherOp
2634 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2635 MI->removeOperand(FIOperandNum);
2636
2637 unsigned NumOps = MI->getNumOperands();
2638 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2639 MI->removeOperand(I);
2640
2641 if (NumDefs == 2)
2642 MI->removeOperand(1);
2643
2644 // The code below can't deal with a mov.
2645 return true;
2646 }
2647
2648 // This folded to a constant, but we have to keep the add around for
2649 // pointless implicit defs or clamp modifier.
2650 FIOp->ChangeToImmediate(0);
2651 }
2652
2653 // Try to improve legality by commuting.
2654 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2655 std::swap(FIOp, OtherOp);
2656 std::swap(FIOperandNum, OtherOpIdx);
2657 }
2658
2659 // We need at most one mov to satisfy the operand constraints. Prefer to
2660 // move the FI operand first, as it may be a literal in a VOP3
2661 // instruction.
2662 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2663 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2664 // If commuting didn't make the operands legal, we need to materialize
2665 // in a register.
2666 // TODO: Can use SGPR on gfx10+ in some cases.
2667 if (!ScavengedVGPR) {
2668 ScavengedVGPR = RS->scavengeRegisterBackwards(
2669 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2670 /*SPAdj=*/0);
2671 }
2672
2673 assert(ScavengedVGPR != DstReg);
2674
2675 MachineOperand &Src = MI->getOperand(SrcIdx);
2676 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2677 .add(Src);
2678
2679 Src.ChangeToRegister(ScavengedVGPR, false);
2680 Src.setIsKill(true);
2681 break;
2682 }
2683 }
2684
2685 // Fold out add of 0 case that can appear in kernels.
2686 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2687 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2688 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2689 }
2690
2691 MI->eraseFromParent();
2692 }
2693
2694 return true;
2695 }
2696 case AMDGPU::S_ADD_I32: {
2697 // TODO: Handle s_or_b32, s_and_b32.
2698 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2699 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2700
2701 assert(FrameReg || MFI->isBottomOfStack());
2702
2703 MachineOperand &DstOp = MI->getOperand(0);
2704 const DebugLoc &DL = MI->getDebugLoc();
2705 Register MaterializedReg = FrameReg;
2706
2707 // Defend against live scc, which should never happen in practice.
2708 bool DeadSCC = MI->getOperand(3).isDead();
2709
2710 Register TmpReg;
2711
2712 // FIXME: Scavenger should figure out that the result register is
2713 // available. Also should do this for the v_add case.
2714 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2715 TmpReg = DstOp.getReg();
2716
2717 if (FrameReg && !ST.enableFlatScratch()) {
2718 // FIXME: In the common case where the add does not also read its result
2719 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2720 // available.
2721 if (!TmpReg)
2722 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2723 MI, false, 0);
2724 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2725 .addDef(TmpReg, RegState::Renamable)
2726 .addReg(FrameReg)
2727          .addImm(ST.getWavefrontSizeLog2())
2728          .setOperandDead(3); // Set SCC dead
2729 MaterializedReg = TmpReg;
2730 }
2731
2732 int64_t Offset = FrameInfo.getObjectOffset(Index);
2733
2734 // For the non-immediate case, we could fall through to the default
2735 // handling, but we do an in-place update of the result register here to
2736 // avoid scavenging another register.
2737 if (OtherOp.isImm()) {
2738 OtherOp.setImm(OtherOp.getImm() + Offset);
2739 Offset = 0;
2740
2741 if (MaterializedReg)
2742 FIOp->ChangeToRegister(MaterializedReg, false);
2743 else
2744 FIOp->ChangeToImmediate(0);
2745 } else if (MaterializedReg) {
2746 // If we can't fold the other operand, do another increment.
2747 Register DstReg = DstOp.getReg();
2748
2749 if (!TmpReg && MaterializedReg == FrameReg) {
2750 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2751 MI, /*RestoreAfter=*/false, 0,
2752 /*AllowSpill=*/false);
2753 DstReg = TmpReg;
2754 }
2755
2756 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2757 .addDef(DstReg, RegState::Renamable)
2758 .addReg(MaterializedReg, RegState::Kill)
2759 .add(OtherOp);
2760 if (DeadSCC)
2761 AddI32.setOperandDead(3);
2762
2763 MaterializedReg = DstReg;
2764
2765 OtherOp.ChangeToRegister(MaterializedReg, false);
2766 OtherOp.setIsKill(true);
2767 OtherOp.setIsRenamable(true);
2769 } else {
2770 // If we don't have any other offset to apply, we can just directly
2771 // interpret the frame index as the offset.
2772      FIOp->ChangeToImmediate(Offset);
2773    }
2774
2775 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2776 assert(Offset == 0);
2777 MI->removeOperand(3);
2778 MI->removeOperand(OtherOpIdx);
2779 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2780 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2781 assert(Offset == 0);
2782 MI->removeOperand(3);
2783 MI->removeOperand(FIOperandNum);
2784 MI->setDesc(
2785 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2786 }
2787
2788 assert(!FIOp->isFI());
2789 return true;
2790 }
2791 default: {
2792 break;
2793 }
2794 }
2795
2796 int64_t Offset = FrameInfo.getObjectOffset(Index);
2797 if (ST.enableFlatScratch()) {
2798 if (TII->isFLATScratch(*MI)) {
2799 assert(
2800 (int16_t)FIOperandNum ==
2801 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2802
2803 // The offset is always swizzled, just replace it
2804 if (FrameReg)
2805 FIOp->ChangeToRegister(FrameReg, false);
2806
2807 MachineOperand *OffsetOp =
2808 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2809 int64_t NewOffset = Offset + OffsetOp->getImm();
2810 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2812 OffsetOp->setImm(NewOffset);
2813 if (FrameReg)
2814 return false;
2815 Offset = 0;
2816 }
2817
2818 if (!Offset) {
2819 unsigned Opc = MI->getOpcode();
2820 int NewOpc = -1;
2821 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2823 } else if (ST.hasFlatScratchSTMode()) {
2824 // On GFX10 we have ST mode to use no registers for an address.
2825 // Otherwise we need to materialize 0 into an SGPR.
2827 }
2828
2829 if (NewOpc != -1) {
2830        // removeOperand doesn't fix up tied operand indexes as it goes, so
2831        // it asserts. Untie vdst_in for now and retie it afterwards.
2832 int VDstIn =
2833 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2834 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2835 MI->getOperand(VDstIn).isTied();
2836 if (TiedVDst)
2837 MI->untieRegOperand(VDstIn);
2838
2839 MI->removeOperand(
2840 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2841
2842 if (TiedVDst) {
2843 int NewVDst =
2844 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2845 int NewVDstIn =
2846 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2847 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2848 MI->tieOperands(NewVDst, NewVDstIn);
2849 }
2850 MI->setDesc(TII->get(NewOpc));
2851 return false;
2852 }
2853 }
2854 }
2855
2856 if (!FrameReg) {
2857    FIOp->ChangeToImmediate(Offset);
2858    if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2859 return false;
2860 }
2861
2862  // We need to use a register here. Check whether we can use an SGPR or
2863  // need a VGPR.
2864 FIOp->ChangeToRegister(AMDGPU::M0, false);
2865 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2866
2867 if (!Offset && FrameReg && UseSGPR) {
2868 FIOp->setReg(FrameReg);
2869 return false;
2870 }
2871
2872 const TargetRegisterClass *RC =
2873 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2874
2875 Register TmpReg =
2876 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2877 FIOp->setReg(TmpReg);
2878 FIOp->setIsKill();
2879
2880 if ((!FrameReg || !Offset) && TmpReg) {
2881 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2882 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2883 if (FrameReg)
2884 MIB.addReg(FrameReg);
2885 else
2886 MIB.addImm(Offset);
2887
2888 return false;
2889 }
2890
2891 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2892 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2893
2894 Register TmpSReg =
2895 UseSGPR ? TmpReg
2896 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2897 MI, false, 0, !UseSGPR);
2898
2899 // TODO: for flat scratch another attempt can be made with a VGPR index
2900 // if no SGPRs can be scavenged.
2901 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2902 report_fatal_error("Cannot scavenge register in FI elimination!");
2903
2904 if (!TmpSReg) {
2905 // Use frame register and restore it after.
2906 TmpSReg = FrameReg;
2907 FIOp->setReg(FrameReg);
2908 FIOp->setIsKill(false);
2909 }
2910
2911 if (NeedSaveSCC) {
2912 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2913 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2914 .addReg(FrameReg)
2915 .addImm(Offset);
2916 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2917 .addReg(TmpSReg)
2918 .addImm(0);
2919 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2920 .addImm(0)
2921 .addReg(TmpSReg);
2922 } else {
2923 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2924 .addReg(FrameReg)
2925 .addImm(Offset);
2926 }
2927
2928 if (!UseSGPR)
2929 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2930 .addReg(TmpSReg, RegState::Kill);
2931
2932 if (TmpSReg == FrameReg) {
2933 // Undo frame register modification.
2934 if (NeedSaveSCC &&
2935 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2937 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2938 TmpSReg)
2939 .addReg(FrameReg)
2940 .addImm(-Offset);
2941 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2942 .addReg(TmpSReg)
2943 .addImm(0);
2944 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2945 TmpSReg)
2946 .addImm(0)
2947 .addReg(TmpSReg);
2948 } else {
2949 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2950 FrameReg)
2951 .addReg(FrameReg)
2952 .addImm(-Offset);
2953 }
2954 }
2955
2956 return false;
2957 }
2958
2959 bool IsMUBUF = TII->isMUBUF(*MI);
2960
2961 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2962 // Convert to a swizzled stack address by scaling by the wave size.
2963 // In an entry function/kernel the offset is already swizzled.
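    // Roughly (a sketch of what follows): materialize
    //   FrameReg >> log2(WavefrontSize)  (+ object offset)
    // into a result register, using the SALU form only when the operand class
    // is scalar and SCC is free to clobber.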
2964 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2965 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2966 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2967 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2968 ? &AMDGPU::SReg_32RegClass
2969 : &AMDGPU::VGPR_32RegClass;
2970 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2971 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
2972 MI->getOpcode() == AMDGPU::S_MOV_B32;
2973 Register ResultReg =
2974 IsCopy ? MI->getOperand(0).getReg()
2975 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
2976
2977 int64_t Offset = FrameInfo.getObjectOffset(Index);
2978 if (Offset == 0) {
2979 unsigned OpCode =
2980 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
2981 Register TmpResultReg = ResultReg;
2982 if (IsSALU && LiveSCC) {
2983 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2984 MI, false, 0);
2985 }
2986
2987 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
2988 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2989 // For V_LSHRREV, the operands are reversed (the shift count goes
2990 // first).
2991 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2992 else
2993 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2994 if (IsSALU && !LiveSCC)
2995 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2996 if (IsSALU && LiveSCC) {
2997 Register NewDest =
2998 IsCopy ? ResultReg
2999 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
3000 Shift, false, 0);
3001 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3002 .addReg(TmpResultReg);
3003 ResultReg = NewDest;
3004 }
3005 } else {
3006      MachineInstrBuilder MIB;
3007      if (!IsSALU) {
3008 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3009 nullptr) {
3010 // Reuse ResultReg in intermediate step.
3011 Register ScaledReg = ResultReg;
3012
3013 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3014 ScaledReg)
3015                .addImm(ST.getWavefrontSizeLog2())
3016                .addReg(FrameReg);
3017
3018 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3019
3020 // TODO: Fold if use instruction is another add of a constant.
3021 if (IsVOP2 ||
3023 // FIXME: This can fail
3024 MIB.addImm(Offset);
3025 MIB.addReg(ScaledReg, RegState::Kill);
3026 if (!IsVOP2)
3027 MIB.addImm(0); // clamp bit
3028 } else {
3029 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3030 "Need to reuse carry out register");
3031
3032 // Use scavenged unused carry out as offset register.
3033 Register ConstOffsetReg;
3034 if (!isWave32)
3035 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3036 else
3037 ConstOffsetReg = MIB.getReg(1);
3038
3039 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3040 ConstOffsetReg)
3041 .addImm(Offset);
3042 MIB.addReg(ConstOffsetReg, RegState::Kill);
3043 MIB.addReg(ScaledReg, RegState::Kill);
3044 MIB.addImm(0); // clamp bit
3045 }
3046 }
3047 }
3048 if (!MIB || IsSALU) {
3049 // We have to produce a carry out, and there isn't a free SGPR pair
3050 // for it. We can keep the whole computation on the SALU to avoid
3051 // clobbering an additional register at the cost of an extra mov.
3052
3053 // We may have 1 free scratch SGPR even though a carry out is
3054 // unavailable. Only one additional mov is needed.
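        // Minimal sketch of this SALU-only path (placeholder registers, not
        // the exact emitted code):
        //   s_lshr_b32 s_tmp, s_fp, log2(wavefront_size)
        //   s_add_i32  s_tmp, s_tmp, <offset>
        // If no scratch SGPR was truly free, the frame register itself is
        // modified and the adjustment is undone afterwards (see below).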
3055 Register TmpScaledReg = IsCopy && IsSALU
3056 ? ResultReg
3057                                    : RS->scavengeRegisterBackwards(
3058                                          AMDGPU::SReg_32_XM0RegClass, MI,
3059 false, 0, /*AllowSpill=*/false);
3060 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3061 Register TmpResultReg = ScaledReg;
3062
3063 if (!LiveSCC) {
3064 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3065              .addReg(FrameReg)
3066              .addImm(ST.getWavefrontSizeLog2());
3067 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3068 .addReg(TmpResultReg, RegState::Kill)
3069 .addImm(Offset);
3070 } else {
3071 TmpResultReg = RS->scavengeRegisterBackwards(
3072 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3073
3074          MachineInstrBuilder Add;
3075          if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3076 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3077 TmpResultReg)
3078                  .addImm(ST.getWavefrontSizeLog2())
3079                  .addReg(FrameReg);
3080 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3081 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3082 .addImm(Offset);
3083 Add.addReg(ResultReg, RegState::Kill)
3084 .addReg(TmpResultReg, RegState::Kill)
3085 .addImm(0);
3086 } else
3087 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3088 } else {
3089 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3090 "offset is unsafe for v_mad_u32_u24");
3091
3092              // We start with a frame pointer holding a wave-space value and
3093              // an offset in lane-space. We are materializing a lane-space
3094              // value. We can either right shift the frame pointer to get to
3095              // lane space, or left shift the offset to get to wave space. We
3096              // can right shift after the computation to get back to the
3097              // desired per-lane value. We are using the mad_u32_u24 primarily
3098              // as an add with no carry-out clobber.
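              // Sketch of the arithmetic being relied on here:
              //   ((Offset * WavefrontSize) + FrameReg) >> log2(WavefrontSize)
              //     == Offset + (FrameReg >> log2(WavefrontSize))
              // i.e. the final right shift converts the sum back to the
              // desired per-lane value.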
3099 bool IsInlinableLiteral =
3100                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3101            if (!IsInlinableLiteral) {
3102 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3103 TmpResultReg)
3104 .addImm(Offset);
3105 }
3106
3107 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3108 TmpResultReg);
3109
3110 if (!IsInlinableLiteral) {
3111 Add.addReg(TmpResultReg, RegState::Kill);
3112 } else {
3113                // We fold the offset into the mad itself if it is inlinable.
3114 Add.addImm(Offset);
3115 }
3116 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3117 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3118 TmpResultReg)
3119                  .addImm(ST.getWavefrontSizeLog2())
3120                  .addReg(TmpResultReg);
3121 }
3122
3123 Register NewDest = IsCopy ? ResultReg
3124                                      : RS->scavengeRegisterBackwards(
3125                                          AMDGPU::SReg_32RegClass, *Add,
3126 false, 0, /*AllowSpill=*/true);
3127 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3128 NewDest)
3129 .addReg(TmpResultReg);
3130 ResultReg = NewDest;
3131 }
3132 if (!IsSALU)
3133 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3134 .addReg(TmpResultReg, RegState::Kill);
3135 else
3136 ResultReg = TmpResultReg;
3137 // If there were truly no free SGPRs, we need to undo everything.
3138 if (!TmpScaledReg.isValid()) {
3139 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3140 .addReg(ScaledReg, RegState::Kill)
3141 .addImm(-Offset);
3142 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3143              .addReg(FrameReg)
3144              .addImm(ST.getWavefrontSizeLog2());
3145 }
3146 }
3147 }
3148
3149 // Don't introduce an extra copy if we're just materializing in a mov.
3150 if (IsCopy) {
3151 MI->eraseFromParent();
3152 return true;
3153 }
3154 FIOp->ChangeToRegister(ResultReg, false, false, true);
3155 return false;
3156 }
3157
3158 if (IsMUBUF) {
3159 // Disable offen so we don't need a 0 vgpr base.
3160 assert(
3161 static_cast<int>(FIOperandNum) ==
3162 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3163
3164 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3165 assert((SOffset.isImm() && SOffset.getImm() == 0));
3166
3167 if (FrameReg != AMDGPU::NoRegister)
3168 SOffset.ChangeToRegister(FrameReg, false);
3169
3170 int64_t Offset = FrameInfo.getObjectOffset(Index);
3171 int64_t OldImm =
3172 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3173 int64_t NewOffset = OldImm + Offset;
3174
3175 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3176 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3177 MI->eraseFromParent();
3178 return true;
3179 }
3180 }
3181
3182 // If the offset is simply too big, don't convert to a scratch wave offset
3183 // relative index.
3184
3185  FIOp->ChangeToImmediate(Offset);
3186  if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3187 Register TmpReg =
3188 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3189 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3190 .addImm(Offset);
3191 FIOp->ChangeToRegister(TmpReg, false, false, true);
3192 }
3193
3194 return false;
3195}
3196
3199}
3200
3202 return getRegBitWidth(RC.getID());
3203}
3204
3205static const TargetRegisterClass *
3207 if (BitWidth == 64)
3208 return &AMDGPU::VReg_64RegClass;
3209 if (BitWidth == 96)
3210 return &AMDGPU::VReg_96RegClass;
3211 if (BitWidth == 128)
3212 return &AMDGPU::VReg_128RegClass;
3213 if (BitWidth == 160)
3214 return &AMDGPU::VReg_160RegClass;
3215 if (BitWidth == 192)
3216 return &AMDGPU::VReg_192RegClass;
3217 if (BitWidth == 224)
3218 return &AMDGPU::VReg_224RegClass;
3219 if (BitWidth == 256)
3220 return &AMDGPU::VReg_256RegClass;
3221 if (BitWidth == 288)
3222 return &AMDGPU::VReg_288RegClass;
3223 if (BitWidth == 320)
3224 return &AMDGPU::VReg_320RegClass;
3225 if (BitWidth == 352)
3226 return &AMDGPU::VReg_352RegClass;
3227 if (BitWidth == 384)
3228 return &AMDGPU::VReg_384RegClass;
3229 if (BitWidth == 512)
3230 return &AMDGPU::VReg_512RegClass;
3231 if (BitWidth == 1024)
3232 return &AMDGPU::VReg_1024RegClass;
3233
3234 return nullptr;
3235}
3236
3237static const TargetRegisterClass *
3239 if (BitWidth == 64)
3240 return &AMDGPU::VReg_64_Align2RegClass;
3241 if (BitWidth == 96)
3242 return &AMDGPU::VReg_96_Align2RegClass;
3243 if (BitWidth == 128)
3244 return &AMDGPU::VReg_128_Align2RegClass;
3245 if (BitWidth == 160)
3246 return &AMDGPU::VReg_160_Align2RegClass;
3247 if (BitWidth == 192)
3248 return &AMDGPU::VReg_192_Align2RegClass;
3249 if (BitWidth == 224)
3250 return &AMDGPU::VReg_224_Align2RegClass;
3251 if (BitWidth == 256)
3252 return &AMDGPU::VReg_256_Align2RegClass;
3253 if (BitWidth == 288)
3254 return &AMDGPU::VReg_288_Align2RegClass;
3255 if (BitWidth == 320)
3256 return &AMDGPU::VReg_320_Align2RegClass;
3257 if (BitWidth == 352)
3258 return &AMDGPU::VReg_352_Align2RegClass;
3259 if (BitWidth == 384)
3260 return &AMDGPU::VReg_384_Align2RegClass;
3261 if (BitWidth == 512)
3262 return &AMDGPU::VReg_512_Align2RegClass;
3263 if (BitWidth == 1024)
3264 return &AMDGPU::VReg_1024_Align2RegClass;
3265
3266 return nullptr;
3267}
3268
3269const TargetRegisterClass *
3271 if (BitWidth == 1)
3272 return &AMDGPU::VReg_1RegClass;
3273 if (BitWidth == 16)
3274 return &AMDGPU::VGPR_16RegClass;
3275 if (BitWidth == 32)
3276 return &AMDGPU::VGPR_32RegClass;
3279}
3280
3281static const TargetRegisterClass *
3283 if (BitWidth == 64)
3284 return &AMDGPU::AReg_64RegClass;
3285 if (BitWidth == 96)
3286 return &AMDGPU::AReg_96RegClass;
3287 if (BitWidth == 128)
3288 return &AMDGPU::AReg_128RegClass;
3289 if (BitWidth == 160)
3290 return &AMDGPU::AReg_160RegClass;
3291 if (BitWidth == 192)
3292 return &AMDGPU::AReg_192RegClass;
3293 if (BitWidth == 224)
3294 return &AMDGPU::AReg_224RegClass;
3295 if (BitWidth == 256)
3296 return &AMDGPU::AReg_256RegClass;
3297 if (BitWidth == 288)
3298 return &AMDGPU::AReg_288RegClass;
3299 if (BitWidth == 320)
3300 return &AMDGPU::AReg_320RegClass;
3301 if (BitWidth == 352)
3302 return &AMDGPU::AReg_352RegClass;
3303 if (BitWidth == 384)
3304 return &AMDGPU::AReg_384RegClass;
3305 if (BitWidth == 512)
3306 return &AMDGPU::AReg_512RegClass;
3307 if (BitWidth == 1024)
3308 return &AMDGPU::AReg_1024RegClass;
3309
3310 return nullptr;
3311}
3312
3313static const TargetRegisterClass *
3315 if (BitWidth == 64)
3316 return &AMDGPU::AReg_64_Align2RegClass;
3317 if (BitWidth == 96)
3318 return &AMDGPU::AReg_96_Align2RegClass;
3319 if (BitWidth == 128)
3320 return &AMDGPU::AReg_128_Align2RegClass;
3321 if (BitWidth == 160)
3322 return &AMDGPU::AReg_160_Align2RegClass;
3323 if (BitWidth == 192)
3324 return &AMDGPU::AReg_192_Align2RegClass;
3325 if (BitWidth == 224)
3326 return &AMDGPU::AReg_224_Align2RegClass;
3327 if (BitWidth == 256)
3328 return &AMDGPU::AReg_256_Align2RegClass;
3329 if (BitWidth == 288)
3330 return &AMDGPU::AReg_288_Align2RegClass;
3331 if (BitWidth == 320)
3332 return &AMDGPU::AReg_320_Align2RegClass;
3333 if (BitWidth == 352)
3334 return &AMDGPU::AReg_352_Align2RegClass;
3335 if (BitWidth == 384)
3336 return &AMDGPU::AReg_384_Align2RegClass;
3337 if (BitWidth == 512)
3338 return &AMDGPU::AReg_512_Align2RegClass;
3339 if (BitWidth == 1024)
3340 return &AMDGPU::AReg_1024_Align2RegClass;
3341
3342 return nullptr;
3343}
3344
3345const TargetRegisterClass *
3347 if (BitWidth == 16)
3348 return &AMDGPU::AGPR_LO16RegClass;
3349 if (BitWidth == 32)
3350 return &AMDGPU::AGPR_32RegClass;
3353}
3354
3355static const TargetRegisterClass *
3357 if (BitWidth == 64)
3358 return &AMDGPU::AV_64RegClass;
3359 if (BitWidth == 96)
3360 return &AMDGPU::AV_96RegClass;
3361 if (BitWidth == 128)
3362 return &AMDGPU::AV_128RegClass;
3363 if (BitWidth == 160)
3364 return &AMDGPU::AV_160RegClass;
3365 if (BitWidth == 192)
3366 return &AMDGPU::AV_192RegClass;
3367 if (BitWidth == 224)
3368 return &AMDGPU::AV_224RegClass;
3369 if (BitWidth == 256)
3370 return &AMDGPU::AV_256RegClass;
3371 if (BitWidth == 288)
3372 return &AMDGPU::AV_288RegClass;
3373 if (BitWidth == 320)
3374 return &AMDGPU::AV_320RegClass;
3375 if (BitWidth == 352)
3376 return &AMDGPU::AV_352RegClass;
3377 if (BitWidth == 384)
3378 return &AMDGPU::AV_384RegClass;
3379 if (BitWidth == 512)
3380 return &AMDGPU::AV_512RegClass;
3381 if (BitWidth == 1024)
3382 return &AMDGPU::AV_1024RegClass;
3383
3384 return nullptr;
3385}
3386
3387static const TargetRegisterClass *
3389 if (BitWidth == 64)
3390 return &AMDGPU::AV_64_Align2RegClass;
3391 if (BitWidth == 96)
3392 return &AMDGPU::AV_96_Align2RegClass;
3393 if (BitWidth == 128)
3394 return &AMDGPU::AV_128_Align2RegClass;
3395 if (BitWidth == 160)
3396 return &AMDGPU::AV_160_Align2RegClass;
3397 if (BitWidth == 192)
3398 return &AMDGPU::AV_192_Align2RegClass;
3399 if (BitWidth == 224)
3400 return &AMDGPU::AV_224_Align2RegClass;
3401 if (BitWidth == 256)
3402 return &AMDGPU::AV_256_Align2RegClass;
3403 if (BitWidth == 288)
3404 return &AMDGPU::AV_288_Align2RegClass;
3405 if (BitWidth == 320)
3406 return &AMDGPU::AV_320_Align2RegClass;
3407 if (BitWidth == 352)
3408 return &AMDGPU::AV_352_Align2RegClass;
3409 if (BitWidth == 384)
3410 return &AMDGPU::AV_384_Align2RegClass;
3411 if (BitWidth == 512)
3412 return &AMDGPU::AV_512_Align2RegClass;
3413 if (BitWidth == 1024)
3414 return &AMDGPU::AV_1024_Align2RegClass;
3415
3416 return nullptr;
3417}
3418
3419const TargetRegisterClass *
3421 if (BitWidth == 32)
3422 return &AMDGPU::AV_32RegClass;
3423 return ST.needsAlignedVGPRs()
3426}
3427
3428const TargetRegisterClass *
3430 if (BitWidth == 16)
3431 return &AMDGPU::SGPR_LO16RegClass;
3432 if (BitWidth == 32)
3433 return &AMDGPU::SReg_32RegClass;
3434 if (BitWidth == 64)
3435 return &AMDGPU::SReg_64RegClass;
3436 if (BitWidth == 96)
3437 return &AMDGPU::SGPR_96RegClass;
3438 if (BitWidth == 128)
3439 return &AMDGPU::SGPR_128RegClass;
3440 if (BitWidth == 160)
3441 return &AMDGPU::SGPR_160RegClass;
3442 if (BitWidth == 192)
3443 return &AMDGPU::SGPR_192RegClass;
3444 if (BitWidth == 224)
3445 return &AMDGPU::SGPR_224RegClass;
3446 if (BitWidth == 256)
3447 return &AMDGPU::SGPR_256RegClass;
3448 if (BitWidth == 288)
3449 return &AMDGPU::SGPR_288RegClass;
3450 if (BitWidth == 320)
3451 return &AMDGPU::SGPR_320RegClass;
3452 if (BitWidth == 352)
3453 return &AMDGPU::SGPR_352RegClass;
3454 if (BitWidth == 384)
3455 return &AMDGPU::SGPR_384RegClass;
3456 if (BitWidth == 512)
3457 return &AMDGPU::SGPR_512RegClass;
3458 if (BitWidth == 1024)
3459 return &AMDGPU::SGPR_1024RegClass;
3460
3461 return nullptr;
3462}
3463
3464bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3465 Register Reg) const {
3466 const TargetRegisterClass *RC;
3467 if (Reg.isVirtual())
3468 RC = MRI.getRegClass(Reg);
3469 else
3470 RC = getPhysRegBaseClass(Reg);
3471 return RC ? isSGPRClass(RC) : false;
3472}
3473
3474const TargetRegisterClass *
3475SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3476 unsigned Size = getRegSizeInBits(*SRC);
3477 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3478 assert(VRC && "Invalid register class size");
3479 return VRC;
3480}
3481
3482const TargetRegisterClass *
3483SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3484 unsigned Size = getRegSizeInBits(*SRC);
3485 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3486 assert(ARC && "Invalid register class size");
3487 return ARC;
3488}
3489
3490const TargetRegisterClass *
3491SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3492 unsigned Size = getRegSizeInBits(*VRC);
3493 if (Size == 32)
3494 return &AMDGPU::SGPR_32RegClass;
3495 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3496 assert(SRC && "Invalid register class size");
3497 return SRC;
3498}
3499
3500const TargetRegisterClass *
3501SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3502 const TargetRegisterClass *SubRC,
3503 unsigned SubIdx) const {
3504 // Ensure this subregister index is aligned in the super register.
3505 const TargetRegisterClass *MatchRC =
3506 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3507 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3508}
3509
3510bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3511 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3512 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3513 return !ST.hasMFMAInlineLiteralBug();
3514
3515 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3516 OpType <= AMDGPU::OPERAND_SRC_LAST;
3517}
3518
3519bool SIRegisterInfo::shouldRewriteCopySrc(
3520 const TargetRegisterClass *DefRC,
3521 unsigned DefSubReg,
3522 const TargetRegisterClass *SrcRC,
3523 unsigned SrcSubReg) const {
3524 // We want to prefer the smallest register class possible, so we don't want to
3525 // stop and rewrite on anything that looks like a subregister
3526 // extract. Operations mostly don't care about the super register class, so we
3527 // only want to stop on the most basic of copies between the same register
3528 // class.
3529 //
3530 // e.g. if we have something like
3531 // %0 = ...
3532 // %1 = ...
3533 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
3534 // %3 = COPY %2, sub0
3535 //
3536 // We want to look through the COPY to find:
3537 // => %3 = COPY %0
3538
3539 // Plain copy.
3540 return getCommonSubClass(DefRC, SrcRC) != nullptr;
3541}
3542
3543bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3544 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3545 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3546 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3547}
3548
3549/// Returns the lowest register that is not used at any point in the function.
3550/// If all registers are used, then this function will return
3551/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3552/// highest unused register.
3553MCRegister SIRegisterInfo::findUnusedRegister(
3554 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3555 const MachineFunction &MF, bool ReserveHighestRegister) const {
3556 if (ReserveHighestRegister) {
3557 for (MCRegister Reg : reverse(*RC))
3558 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3559 return Reg;
3560 } else {
3561 for (MCRegister Reg : *RC)
3562 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3563 return Reg;
3564 }
3565 return MCRegister();
3566}
3567
3568bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3569 const RegisterBankInfo &RBI,
3570 Register Reg) const {
3571 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3572 if (!RB)
3573 return false;
3574
3575 return !RBI.isDivergentRegBank(RB);
3576}
3577
3578ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3579 unsigned EltSize) const {
3580 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3581 assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
3582
3583 const unsigned RegDWORDs = RegBitWidth / 32;
3584 const unsigned EltDWORDs = EltSize / 4;
3585 assert(RegSplitParts.size() + 1 >= EltDWORDs);
3586
3587 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
3588 const unsigned NumParts = RegDWORDs / EltDWORDs;
3589
3590 return ArrayRef(Parts.data(), NumParts);
3591}
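// [Editorial note, illustrative only, not part of the original source file:
// a worked trace of getRegSplitParts above. For a 256-bit register class and
// EltSize == 8 (two dwords per part): RegBitWidth = 256, RegDWORDs = 8,
// EltDWORDs = 2, NumParts = 4, so the returned ArrayRef is the first four
// entries of RegSplitParts[1], the table row holding the two-dword
// subregister indexes (e.g. sub0_sub1, sub2_sub3, ...).]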
3592
3593const TargetRegisterClass *
3594SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3595 Register Reg) const {
3596 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3597}
3598
3599const TargetRegisterClass *
3600SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3601 const MachineOperand &MO) const {
3602 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3603 return getSubRegisterClass(SrcRC, MO.getSubReg());
3604}
3605
3606bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3607 Register Reg) const {
3608 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3609 // Registers without classes are unaddressable, SGPR-like registers.
3610 return RC && isVGPRClass(RC);
3611}
3612
3613bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3614 Register Reg) const {
3615 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3616
3617 // Registers without classes are unaddressable, SGPR-like registers.
3618 return RC && isAGPRClass(RC);
3619}
3620
3621bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3622 const TargetRegisterClass *SrcRC,
3623 unsigned SubReg,
3624 const TargetRegisterClass *DstRC,
3625 unsigned DstSubReg,
3626 const TargetRegisterClass *NewRC,
3627 LiveIntervals &LIS) const {
3628 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3629 unsigned DstSize = getRegSizeInBits(*DstRC);
3630 unsigned NewSize = getRegSizeInBits(*NewRC);
3631
3632 // Do not increase the size of registers beyond a dword; we would need to
3633 // allocate adjacent registers and constrain regalloc more than needed.
3634
3635 // Always allow dword coalescing.
3636 if (SrcSize <= 32 || DstSize <= 32)
3637 return true;
3638
3639 return NewSize <= DstSize || NewSize <= SrcSize;
3640}
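// [Editorial note, illustrative only, not part of the original source file:
// under this policy a coalesce that involves a 32-bit register is always
// allowed (e.g. folding a VGPR_32 copy into a VReg_64), while growing two
// 64-bit registers into a 128-bit tuple is rejected because NewSize == 128
// exceeds both SrcSize and DstSize.]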
3641
3642unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3643 MachineFunction &MF) const {
3644 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3645 switch (RC->getID()) {
3646 default:
3647 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3648 case AMDGPU::VGPR_32RegClassID:
3649 return std::min(ST.getMaxNumVGPRs(MinOcc), ST.getMaxNumVGPRs(MF));
3650 case AMDGPU::SGPR_32RegClassID:
3651 case AMDGPU::SGPR_LO16RegClassID:
3652 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3653 }
3654}
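// [Editorial note, illustrative only, not part of the original source file:
// the VGPR_32 limit is the smaller of the VGPR budget at the minimum
// occupancy implied by the workgroup sizes and the per-function VGPR budget;
// for example, if the occupancy bound allowed 256 VGPRs but the function
// were capped at 128, the reported pressure limit would be 128.]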
3655
3656unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3657 unsigned Idx) const {
3658 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3659 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3660 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3661 const_cast<MachineFunction &>(MF));
3662
3663 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3664 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3665 const_cast<MachineFunction &>(MF));
3666
3667 llvm_unreachable("Unexpected register pressure set!");
3668}
3669
3670const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3671 static const int Empty[] = { -1 };
3672
3673 if (RegPressureIgnoredUnits[RegUnit])
3674 return Empty;
3675
3676 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3677}
3678
3679MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3680 // Not a callee saved register.
3681 return AMDGPU::SGPR30_SGPR31;
3682}
3683
3684const TargetRegisterClass *
3685SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3686 const RegisterBank &RB) const {
3687 switch (RB.getID()) {
3688 case AMDGPU::VGPRRegBankID:
3689 return getVGPRClassForBitWidth(
3690 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3691 case AMDGPU::VCCRegBankID:
3692 assert(Size == 1);
3693 return getWaveMaskRegClass();
3694 case AMDGPU::SGPRRegBankID:
3695 return getSGPRClassForBitWidth(std::max(32u, Size));
3696 case AMDGPU::AGPRRegBankID:
3697 return getAGPRClassForBitWidth(std::max(32u, Size));
3698 default:
3699 llvm_unreachable("unknown register bank");
3700 }
3701}
3702
3703const TargetRegisterClass *
3704SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3705 const MachineRegisterInfo &MRI) const {
3706 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3707 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3708 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3709
3710 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3711 return getAllocatableClass(RC);
3712
3713 return nullptr;
3714}
3715
3716MCRegister SIRegisterInfo::getVCC() const {
3717 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3718}
3719
3720MCRegister SIRegisterInfo::getExec() const {
3721 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3722}
3723
3724const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3725 // VGPR tuples have an alignment requirement on gfx90a variants.
3726 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3727 : &AMDGPU::VReg_64RegClass;
3728}
3729
3730const TargetRegisterClass *
3731SIRegisterInfo::getRegClass(unsigned RCID) const {
3732 switch ((int)RCID) {
3733 case AMDGPU::SReg_1RegClassID:
3734 return getBoolRC();
3735 case AMDGPU::SReg_1_XEXECRegClassID:
3736 return getWaveMaskRegClass();
3737 case -1:
3738 return nullptr;
3739 default:
3740 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3741 }
3742}
3743
3744// Find reaching register definition
3745MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3746 MachineInstr &Use,
3747 MachineRegisterInfo &MRI,
3748 LiveIntervals *LIS) const {
3749 auto &MDT = LIS->getDomTree();
3750 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3751 SlotIndex DefIdx;
3752
3753 if (Reg.isVirtual()) {
3754 if (!LIS->hasInterval(Reg))
3755 return nullptr;
3756 LiveInterval &LI = LIS->getInterval(Reg);
3757 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3758 : MRI.getMaxLaneMaskForVReg(Reg);
3759 VNInfo *V = nullptr;
3760 if (LI.hasSubRanges()) {
3761 for (auto &S : LI.subranges()) {
3762 if ((S.LaneMask & SubLanes) == SubLanes) {
3763 V = S.getVNInfoAt(UseIdx);
3764 break;
3765 }
3766 }
3767 } else {
3768 V = LI.getVNInfoAt(UseIdx);
3769 }
3770 if (!V)
3771 return nullptr;
3772 DefIdx = V->def;
3773 } else {
3774 // Find last def.
3775 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3776 LiveRange &LR = LIS->getRegUnit(Unit);
3777 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3778 if (!DefIdx.isValid() ||
3779 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3780 LIS->getInstructionFromIndex(V->def)))
3781 DefIdx = V->def;
3782 } else {
3783 return nullptr;
3784 }
3785 }
3786 }
3787
3788 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3789
3790 if (!Def || !MDT.dominates(Def, &Use))
3791 return nullptr;
3792
3793 assert(Def->modifiesRegister(Reg, this));
3794
3795 return Def;
3796}
3797
3798MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3799 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3800
3801 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3802 AMDGPU::SReg_32RegClass,
3803 AMDGPU::AGPR_32RegClass } ) {
3804 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3805 return Super;
3806 }
3807 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3808 &AMDGPU::VGPR_32RegClass)) {
3809 return Super;
3810 }
3811
3812 return AMDGPU::NoRegister;
3813}
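// [Editorial note, illustrative only, not part of the original source file:
// get32BitRegister maps a 16-bit physical register to the 32-bit register
// containing it, e.g. the lo16 half of a VGPR, SGPR, or AGPR, or the hi16
// half of a VGPR, back to the containing 32-bit register, and returns
// AMDGPU::NoRegister when no such super-register exists.]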
3814
3815bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3816 if (!ST.needsAlignedVGPRs())
3817 return true;
3818
3819 if (isVGPRClass(&RC))
3820 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3821 if (isAGPRClass(&RC))
3822 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3823 if (isVectorSuperClass(&RC))
3824 return RC.hasSuperClassEq(
3825 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3826
3827 return true;
3828}
3829
3830const TargetRegisterClass *
3831SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3832 if (!RC || !ST.needsAlignedVGPRs())
3833 return RC;
3834
3835 unsigned Size = getRegSizeInBits(*RC);
3836 if (Size <= 32)
3837 return RC;
3838
3839 if (isVGPRClass(RC))
3840 return getAlignedVGPRClassForBitWidth(Size);
3841 if (isAGPRClass(RC))
3842 return getAlignedAGPRClassForBitWidth(Size);
3843 if (isVectorSuperClass(RC))
3844 return getAlignedVectorSuperClassForBitWidth(Size);
3845
3846 return RC;
3847}
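// [Editorial note, illustrative only, not part of the original source file:
// on subtargets where needsAlignedVGPRs() is true (e.g. gfx90a), a wide VGPR
// class such as VReg_64 is mapped by getProperlyAlignedRC to its even-aligned
// counterpart VReg_64_Align2; classes of 32 bits or less are returned
// unchanged.]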
3848
3849ArrayRef<MCPhysReg>
3850SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3851 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3852}
3853
3854ArrayRef<MCPhysReg>
3855SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3856 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3857}
3858
3859ArrayRef<MCPhysReg>
3860SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3861 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3862}
3863
3864unsigned
3865SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3866 unsigned SubReg) const {
3867 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3868 case SIRCFlags::HasSGPR:
3869 return std::min(128u, getSubRegIdxSize(SubReg));
3870 case SIRCFlags::HasAGPR:
3871 case SIRCFlags::HasVGPR:
3872 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
3873 return std::min(32u, getSubRegIdxSize(SubReg));
3874 default:
3875 break;
3876 }
3877 return 0;
3878}
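// [Editorial note, illustrative only, not part of the original source file:
// a numeric trace of getSubRegAlignmentNumBits above: for an SGPR class and
// a 256-bit subregister index the result is min(128, 256) = 128 bits, for a
// VGPR or AGPR class with the same index it is min(32, 256) = 32 bits, and
// for any other register kind it is 0.]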
3879
3880unsigned
3881SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
3882 const TargetRegisterClass &RC) const {
3883 for (MCPhysReg Reg : reverse(RC.getRegisters()))
3884 if (MRI.isPhysRegUsed(Reg))
3885 return getHWRegIndex(Reg) + 1;
3886 return 0;
3887}
3888
3889SmallVector<StringLiteral>
3890SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
3891 const MachineFunction &MF) const {
3892 SmallVector<StringLiteral> RegFlags;
3893 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3894 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
3895 RegFlags.push_back("WWM_REG");
3896 return RegFlags;
3897}