1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
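// Worked example (illustrative): getSubRegFromChannel(2, 2), defined later in
// this file, maps a 2-DWORD access starting at channel 2 to row
// SubRegFromChannelTableWidthMap[2] - 1 == 1 of SubRegFromChannelTable,
// column 2, which is expected to hold the sub2_sub3 index.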
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
50 Fn.getContext().diagnose(
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
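// In both cases the actual SGPR data movement is the same: each SGPR is moved
// through a lane of the temporary VGPR, e.g. (illustrative, wave64):
// v_writelane_b32 v0, s4, 0
// v_writelane_b32 v0, s5, 1
// buffer_store_dword v0 ; write the SGPR values to the stack slot
// and the reverse v_readlane_b32 sequence after a buffer_load_dword on restore.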
78struct SGPRSpillBuilder {
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
86 Register SuperReg;
87 MachineBasicBlock::iterator MI;
88 ArrayRef<int16_t> SplitParts;
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
107 RegScavenger *RS;
108 MachineBasicBlock *MBB;
109 MachineFunction &MF;
110 SIMachineFunctionInfo &MFI;
111 const SIInstrInfo &TII;
112 const SIRegisterInfo &TRI;
113 bool IsWave32;
114 Register ExecReg;
115 unsigned MovOpc;
116 unsigned NotOpc;
117
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
120 RegScavenger *RS)
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
124 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
125 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
130 IsWave32(IsWave32) {
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
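 // Returns how many lanes of one VGPR are usable per wave, how many VGPRs the
 // spill needs and the lane mask that is written. Worked example (wave64,
 // NumSubRegs == 3): PerVGPR = 64, NumVGPRs = 1, VGPRLanes = 0b111.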
150 PerVGPRData getPerVGPRData() {
151 PerVGPRData Data;
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
216 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
220 I.addReg(TmpVGPR, RegState::ImplicitDefine);
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
238 I.addReg(TmpVGPR, RegState::ImplicitDefine);
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Restore scavenged VGPR from emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
261 .addReg(SavedExecReg, RegState::Kill);
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
265 I.addReg(TmpVGPR, RegState::ImplicitKill);
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
273 I.addReg(TmpVGPR, RegState::ImplicitKill);
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
319 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
328SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0, ST.getHwMode()),
332 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
333
334 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
335 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
336 (getSubRegIndexLaneMask(AMDGPU::lo16) |
337 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
338 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
339 "getNumCoveredRegs() will not work with generated subreg masks!");
340
341 RegPressureIgnoredUnits.resize(getNumRegUnits());
342 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
343 for (auto Reg : AMDGPU::VGPR_16RegClass) {
344 if (AMDGPU::isHi16Reg(Reg, *this))
345 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
346 }
347
348 // HACK: Until this is fully tablegen'd.
349 static llvm::once_flag InitializeRegSplitPartsFlag;
350
351 static auto InitializeRegSplitPartsOnce = [this]() {
352 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
353 unsigned Size = getSubRegIdxSize(Idx);
354 if (Size & 31)
355 continue;
356 std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
357 unsigned Pos = getSubRegIdxOffset(Idx);
358 if (Pos % Size)
359 continue;
360 Pos /= Size;
361 if (Vec.empty()) {
362 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
363 Vec.resize(MaxNumParts);
364 }
365 Vec[Pos] = Idx;
366 }
367 };
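 // After this runs, RegSplitParts[N - 1][P] holds the subregister index of the
 // P-th N-DWORD slice of a register; e.g. RegSplitParts[1][1] is expected to be
 // the 64-bit index covering sub2..sub3.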
368
369 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
370
371 static auto InitializeSubRegFromChannelTableOnce = [this]() {
372 for (auto &Row : SubRegFromChannelTable)
373 Row.fill(AMDGPU::NoSubRegister);
374 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
375 unsigned Width = getSubRegIdxSize(Idx) / 32;
376 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
378 Width = SubRegFromChannelTableWidthMap[Width];
379 if (Width == 0)
380 continue;
381 unsigned TableIdx = Width - 1;
382 assert(TableIdx < SubRegFromChannelTable.size());
383 assert(Offset < SubRegFromChannelTable[TableIdx].size());
384 SubRegFromChannelTable[TableIdx][Offset] = Idx;
385 }
386 };
387
388 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
389 llvm::call_once(InitializeSubRegFromChannelTableFlag,
390 InitializeSubRegFromChannelTableOnce);
391}
392
393void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
394 MCRegister Reg) const {
395 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
396 Reserved.set(*R);
397}
398
399// Forced to be here by one .inc
400const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
401 const MachineFunction *MF) const {
402 CallingConv::ID CC = MF->getFunction().getCallingConv();
403 switch (CC) {
404 case CallingConv::C:
405 case CallingConv::Fast:
406 case CallingConv::Cold:
407 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
408 : CSR_AMDGPU_SaveList;
409 case CallingConv::AMDGPU_Gfx:
410 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
411 : CSR_AMDGPU_SI_Gfx_SaveList;
412 case CallingConv::AMDGPU_CS_ChainPreserve:
413 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
414 default: {
415 // Dummy to not crash RegisterClassInfo.
416 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
417 return &NoCalleeSavedReg;
418 }
419 }
420}
421
422const MCPhysReg *
423SIRegisterInfo::getCalleeSavedRegsViaCopy(MachineFunction *MF) const {
424 return nullptr;
425}
426
427const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
428 CallingConv::ID CC) const {
429 switch (CC) {
430 case CallingConv::C:
431 case CallingConv::Fast:
432 case CallingConv::Cold:
433 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
434 : CSR_AMDGPU_RegMask;
435 case CallingConv::AMDGPU_Gfx:
436 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
437 : CSR_AMDGPU_SI_Gfx_RegMask;
438 case CallingConv::AMDGPU_CS_Chain:
439 case CallingConv::AMDGPU_CS_ChainPreserve:
440 // Calls to these functions never return, so we can pretend everything is
441 // preserved.
442 return AMDGPU_AllVGPRs_RegMask;
443 default:
444 return nullptr;
445 }
446}
447
448const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
449 return CSR_AMDGPU_NoRegs_RegMask;
450}
451
452bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
453 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
454}
455
456const TargetRegisterClass *
457SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
458 const MachineFunction &MF) const {
459 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
460 // equivalent AV class. If one were used here, the verifier would crash after
461 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
462 // until Instruction selection.
463 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
464 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
465 return &AMDGPU::AV_32RegClass;
466 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
467 return &AMDGPU::AV_64RegClass;
468 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
469 RC == &AMDGPU::AReg_64_Align2RegClass)
470 return &AMDGPU::AV_64_Align2RegClass;
471 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
472 return &AMDGPU::AV_96RegClass;
473 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
474 RC == &AMDGPU::AReg_96_Align2RegClass)
475 return &AMDGPU::AV_96_Align2RegClass;
476 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
477 return &AMDGPU::AV_128RegClass;
478 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
479 RC == &AMDGPU::AReg_128_Align2RegClass)
480 return &AMDGPU::AV_128_Align2RegClass;
481 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
482 return &AMDGPU::AV_160RegClass;
483 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
484 RC == &AMDGPU::AReg_160_Align2RegClass)
485 return &AMDGPU::AV_160_Align2RegClass;
486 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
487 return &AMDGPU::AV_192RegClass;
488 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
489 RC == &AMDGPU::AReg_192_Align2RegClass)
490 return &AMDGPU::AV_192_Align2RegClass;
491 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
492 return &AMDGPU::AV_256RegClass;
493 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
494 RC == &AMDGPU::AReg_256_Align2RegClass)
495 return &AMDGPU::AV_256_Align2RegClass;
496 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
497 return &AMDGPU::AV_512RegClass;
498 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
499 RC == &AMDGPU::AReg_512_Align2RegClass)
500 return &AMDGPU::AV_512_Align2RegClass;
501 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
502 return &AMDGPU::AV_1024RegClass;
503 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
504 RC == &AMDGPU::AReg_1024_Align2RegClass)
505 return &AMDGPU::AV_1024_Align2RegClass;
506 }
507
508 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
509}
510
511Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
512 const SIFrameLowering *TFI = ST.getFrameLowering();
513 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
514 // During ISel lowering we always reserve the stack pointer in entry and chain
515 // functions, but never actually want to reference it when accessing our own
516 // frame. If we need a frame pointer we use it, but otherwise we can just use
517 // an immediate "0" which we represent by returning NoRegister.
518 if (FuncInfo->isBottomOfStack()) {
519 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
520 }
521 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
522 : FuncInfo->getStackPtrOffsetReg();
523}
524
525bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
526 // When we need stack realignment, we can't reference off of the
527 // stack pointer, so we reserve a base pointer.
528 const MachineFrameInfo &MFI = MF.getFrameInfo();
529 return MFI.getNumFixedObjects() && shouldRealignStack(MF);
530}
531
532Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
533
534const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
535 return AMDGPU_AllVGPRs_RegMask;
536}
537
538const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
539 return AMDGPU_AllAGPRs_RegMask;
540}
541
542const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
543 return AMDGPU_AllVectorRegs_RegMask;
544}
545
546const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
547 return AMDGPU_AllAllocatableSRegs_RegMask;
548}
549
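// Maps a starting channel and a width in DWORDs to a subregister index using
// the tables initialized above; for illustration, getSubRegFromChannel(4, 4)
// should yield the 128-bit index covering channels 4..7 (sub4..sub7).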
550unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
551 unsigned NumRegs) {
552 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
553 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
554 assert(NumRegIndex && "Not implemented");
555 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
556 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
557}
558
559MCRegister
560SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
561 const unsigned Align,
562 const TargetRegisterClass *RC) const {
563 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
564 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
565 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
566}
567
568MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
569 const MachineFunction &MF) const {
570 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
571}
572
573std::pair<unsigned, unsigned>
574SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const {
575 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
576 unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
577 unsigned MaxNumAGPRs = MaxNumVGPRs;
578 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
579
580 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
581 // a wave may have up to 512 total vector registers combining together both
582 // VGPRs and AGPRs. Hence, in an entry function without calls and without
583 // AGPRs used within it, it is possible to use the whole vector register
584 // budget for VGPRs.
585 //
586 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
587 // register file accordingly.
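 // Illustrative example: with a budget of 128 vector registers per wave, a
 // function that uses AGPRs gets 64 VGPRs + 64 AGPRs, while one that does not
 // gets all 128 as VGPRs and no AGPRs.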
588 if (ST.hasGFX90AInsts()) {
589 if (MFI->usesAGPRs(MF)) {
590 MaxNumVGPRs /= 2;
591 MaxNumAGPRs = MaxNumVGPRs;
592 } else {
593 if (MaxNumVGPRs > TotalNumVGPRs) {
594 MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
595 MaxNumVGPRs = TotalNumVGPRs;
596 } else
597 MaxNumAGPRs = 0;
598 }
599 }
600
601 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
602}
603
604BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
605 BitVector Reserved(getNumRegs());
606 Reserved.set(AMDGPU::MODE);
607
608 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
609
610 // Reserve special purpose registers.
611 //
612 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
613 // this seems likely to result in bugs, so I'm marking them as reserved.
614 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
615 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
616
617 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
618 reserveRegisterTuples(Reserved, AMDGPU::M0);
619
620 // Reserve src_vccz, src_execz, src_scc.
621 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
622 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
623 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
624
625 // Reserve the memory aperture registers
626 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
627 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
628 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
629 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
630
631 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
632 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
633
634 // Reserve xnack_mask registers - support is not implemented in Codegen.
635 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
636
637 // Reserve lds_direct register - support is not implemented in Codegen.
638 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
639
640 // Reserve Trap Handler registers - support is not implemented in Codegen.
641 reserveRegisterTuples(Reserved, AMDGPU::TBA);
642 reserveRegisterTuples(Reserved, AMDGPU::TMA);
643 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
644 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
645 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
646 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
647 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
648 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
649 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
650 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
651
652 // Reserve null register - it shall never be allocated
653 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
654
655 // Reserve SGPRs.
656 //
657 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
658 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
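 // For illustration, with MaxNumSGPRs == 102 every SGPR_32 register from s102
 // to the end of the class is reserved below, and any wider SGPR tuple whose
 // range extends past s101 is reserved through its own base class.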
659 for (const TargetRegisterClass *RC : regclasses()) {
660 if (RC->isBaseClass() && isSGPRClass(RC)) {
661 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
662 for (MCPhysReg Reg : *RC) {
663 unsigned Index = getHWRegIndex(Reg);
664 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
665 Reserved.set(Reg);
666 }
667 }
668 }
669
670 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
671 if (ScratchRSrcReg != AMDGPU::NoRegister) {
672 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
673 // need to spill.
674 // TODO: May need to reserve a VGPR if doing LDS spilling.
675 reserveRegisterTuples(Reserved, ScratchRSrcReg);
676 }
677
678 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
679 if (LongBranchReservedReg)
680 reserveRegisterTuples(Reserved, LongBranchReservedReg);
681
682 // We have to assume the SP is needed in case there are calls in the function,
683 // which is detected after the function is lowered. If we aren't really going
684 // to need SP, don't bother reserving it.
685 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
686 if (StackPtrReg) {
687 reserveRegisterTuples(Reserved, StackPtrReg);
688 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
689 }
690
691 MCRegister FrameReg = MFI->getFrameOffsetReg();
692 if (FrameReg) {
693 reserveRegisterTuples(Reserved, FrameReg);
694 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
695 }
696
697 if (hasBasePointer(MF)) {
698 MCRegister BasePtrReg = getBaseRegister();
699 reserveRegisterTuples(Reserved, BasePtrReg);
700 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
701 }
702
703 // FIXME: Use same reserved register introduced in D149775
704 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
705 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
706 if (ExecCopyReg)
707 reserveRegisterTuples(Reserved, ExecCopyReg);
708
709 // Reserve VGPRs/AGPRs.
710 //
711 auto [MaxNumVGPRs, MaxNumAGPRs] = getMaxNumVectorRegs(MF);
712
713 for (const TargetRegisterClass *RC : regclasses()) {
714 if (RC->isBaseClass() && isVGPRClass(RC)) {
715 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
716 for (MCPhysReg Reg : *RC) {
717 unsigned Index = getHWRegIndex(Reg);
718 if (Index + NumRegs > MaxNumVGPRs)
719 Reserved.set(Reg);
720 }
721 }
722 }
723
724 // Reserve all the AGPRs if there are no instructions to use them.
725 if (!ST.hasMAIInsts())
726 MaxNumAGPRs = 0;
727 for (const TargetRegisterClass *RC : regclasses()) {
728 if (RC->isBaseClass() && isAGPRClass(RC)) {
729 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
730 for (MCPhysReg Reg : *RC) {
731 unsigned Index = getHWRegIndex(Reg);
732 if (Index + NumRegs > MaxNumAGPRs)
733 Reserved.set(Reg);
734 }
735 }
736 }
737
738 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
739 // VGPR available at all times.
740 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
741 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
742 }
743
744 // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The
745 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
746 // wwm-regalloc and it would be empty otherwise.
747 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
748 if (!NonWWMRegMask.empty()) {
749 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
750 RegI < RegE; ++RegI) {
751 if (NonWWMRegMask.test(RegI))
752 reserveRegisterTuples(Reserved, RegI);
753 }
754 }
755
756 for (Register Reg : MFI->getWWMReservedRegs())
757 reserveRegisterTuples(Reserved, Reg);
758
759 // FIXME: Stop using reserved registers for this.
760 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
761 reserveRegisterTuples(Reserved, Reg);
762
763 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
764 reserveRegisterTuples(Reserved, Reg);
765
766 return Reserved;
767}
768
769bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
770 MCRegister PhysReg) const {
771 return !MF.getRegInfo().isReserved(PhysReg);
772}
773
774bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
775 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
776 // On entry or in chain functions, the base address is 0, so it can't possibly
777 // need any more alignment.
778
779 // FIXME: Should be able to specify the entry frame alignment per calling
780 // convention instead.
781 if (Info->isBottomOfStack())
782 return false;
783
784 return TargetRegisterInfo::shouldRealignStack(MF);
785}
786
787bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
788 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
789 if (Info->isEntryFunction()) {
790 const MachineFrameInfo &MFI = Fn.getFrameInfo();
791 return MFI.hasStackObjects() || MFI.hasCalls();
792 }
793
794 // May need scavenger for dealing with callee saved registers.
795 return true;
796}
797
798bool SIRegisterInfo::requiresFrameIndexScavenging(
799 const MachineFunction &MF) const {
800 // Do not use frame virtual registers. They used to be used for SGPRs, but
801 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
802 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
803 // spill.
804 return false;
805}
806
807bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
808 const MachineFunction &MF) const {
809 const MachineFrameInfo &MFI = MF.getFrameInfo();
810 return MFI.hasStackObjects();
811}
812
813bool SIRegisterInfo::requiresVirtualBaseRegisters(
814 const MachineFunction &) const {
815 // There are no special dedicated stack or frame pointers.
816 return true;
817}
818
818
819int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
820 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
821
822 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
823 AMDGPU::OpName::offset);
824 return MI->getOperand(OffIdx).getImm();
825}
826
827int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
828 int Idx) const {
829 switch (MI->getOpcode()) {
830 case AMDGPU::V_ADD_U32_e32:
831 case AMDGPU::V_ADD_U32_e64:
832 case AMDGPU::V_ADD_CO_U32_e32: {
833 int OtherIdx = Idx == 1 ? 2 : 1;
834 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
835 return OtherOp.isImm() ? OtherOp.getImm() : 0;
836 }
837 case AMDGPU::V_ADD_CO_U32_e64: {
838 int OtherIdx = Idx == 2 ? 3 : 2;
839 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
840 return OtherOp.isImm() ? OtherOp.getImm() : 0;
841 }
842 default:
843 break;
844 }
845
846 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
847 return 0;
848
849 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
850 AMDGPU::OpName::vaddr) ||
851 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
852 AMDGPU::OpName::saddr))) &&
853 "Should never see frame index on non-address operand");
854
855 return getScratchInstrOffset(MI);
856}
857
858static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
859 const MachineInstr &MI) {
860 assert(MI.getDesc().isAdd());
861 const MachineOperand &Src0 = MI.getOperand(1);
862 const MachineOperand &Src1 = MI.getOperand(2);
863
864 if (Src0.isFI()) {
865 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
866 Src1.getReg()));
867 }
868
869 if (Src1.isFI()) {
870 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
871 Src0.getReg()));
872 }
873
874 return false;
875}
876
877bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
878 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
879 switch (MI->getOpcode()) {
880 case AMDGPU::V_ADD_U32_e32: {
881 // TODO: We could handle this but it requires work to avoid violating
882 // operand restrictions.
883 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
884 !isFIPlusImmOrVGPR(*this, *MI))
885 return false;
886 [[fallthrough]];
887 }
888 case AMDGPU::V_ADD_U32_e64:
889 // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
890 //
891 // Much of the benefit with the MUBUF handling is we avoid duplicating the
892 // shift of the frame register, which isn't needed with scratch.
893 //
894 // materializeFrameBaseRegister doesn't know the register classes of the
895 // uses, and unconditionally uses an s_add_i32, which will end up using a
896 // copy for the vector uses.
897 return !ST.enableFlatScratch();
898 case AMDGPU::V_ADD_CO_U32_e32:
899 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
900 !isFIPlusImmOrVGPR(*this, *MI))
901 return false;
902 // We can't deal with the case where the carry out has a use (though this
903 // should never happen)
904 return MI->getOperand(3).isDead();
905 case AMDGPU::V_ADD_CO_U32_e64:
906 // TODO: Should we check use_empty instead?
907 return MI->getOperand(1).isDead();
908 default:
909 break;
910 }
911
912 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
913 return false;
914
915 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
916
917 const SIInstrInfo *TII = ST.getInstrInfo();
918 if (SIInstrInfo::isMUBUF(*MI))
919 return !TII->isLegalMUBUFImmOffset(FullOffset);
920
921 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
922 SIInstrFlags::FlatScratch);
923}
924
925Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
926 int FrameIdx,
927 int64_t Offset) const {
928 MachineBasicBlock::iterator Ins = MBB->begin();
929 DebugLoc DL; // Defaults to "unknown"
930
931 if (Ins != MBB->end())
932 DL = Ins->getDebugLoc();
933
934 MachineFunction *MF = MBB->getParent();
935 const SIInstrInfo *TII = ST.getInstrInfo();
936 MachineRegisterInfo &MRI = MF->getRegInfo();
937 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
938 : AMDGPU::V_MOV_B32_e32;
939
940 Register BaseReg = MRI.createVirtualRegister(
941 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
942 : &AMDGPU::VGPR_32RegClass);
943
944 if (Offset == 0) {
945 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
946 .addFrameIndex(FrameIdx);
947 return BaseReg;
948 }
949
950 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
951
952 Register FIReg = MRI.createVirtualRegister(
953 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
954 : &AMDGPU::VGPR_32RegClass);
955
956 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
957 .addImm(Offset);
958 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
959 .addFrameIndex(FrameIdx);
960
961 if (ST.enableFlatScratch() ) {
962 // FIXME: Make sure scc isn't live in.
963 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
964 .addReg(OffsetReg, RegState::Kill)
965 .addReg(FIReg)
966 .setOperandDead(3); // scc
967 return BaseReg;
968 }
969
970 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
971 .addReg(OffsetReg, RegState::Kill)
972 .addReg(FIReg)
973 .addImm(0); // clamp bit
974
975 return BaseReg;
976}
977
978void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
979 int64_t Offset) const {
980 const SIInstrInfo *TII = ST.getInstrInfo();
981
982 switch (MI.getOpcode()) {
983 case AMDGPU::V_ADD_U32_e32:
984 case AMDGPU::V_ADD_CO_U32_e32: {
985 MachineOperand *FIOp = &MI.getOperand(2);
986 MachineOperand *ImmOp = &MI.getOperand(1);
987 if (!FIOp->isFI())
988 std::swap(FIOp, ImmOp);
989
990 if (!ImmOp->isImm()) {
991 assert(Offset == 0);
992 FIOp->ChangeToRegister(BaseReg, false);
993 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
994 return;
995 }
996
997 int64_t TotalOffset = ImmOp->getImm() + Offset;
998 if (TotalOffset == 0) {
999 MI.setDesc(TII->get(AMDGPU::COPY));
1000 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1001 MI.removeOperand(I);
1002
1003 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1004 return;
1005 }
1006
1007 ImmOp->setImm(TotalOffset);
1008
1009 MachineBasicBlock *MBB = MI.getParent();
1010 MachineFunction *MF = MBB->getParent();
1011 MachineRegisterInfo &MRI = MF->getRegInfo();
1012
1013 // FIXME: materializeFrameBaseRegister does not know the register class of
1014 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
1015 // a copy so we have a legal operand and hope the register coalescer can
1016 // clean it up.
1017 if (isSGPRReg(MRI, BaseReg)) {
1018 Register BaseRegVGPR =
1019 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1020 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1021 .addReg(BaseReg);
1022 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1023 } else {
1024 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1025 }
1026 return;
1027 }
1028 case AMDGPU::V_ADD_U32_e64:
1029 case AMDGPU::V_ADD_CO_U32_e64: {
1030 int Src0Idx = MI.getNumExplicitDefs();
1031 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1032 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1033 if (!FIOp->isFI())
1034 std::swap(FIOp, ImmOp);
1035
1036 if (!ImmOp->isImm()) {
1037 FIOp->ChangeToRegister(BaseReg, false);
1038 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1039 return;
1040 }
1041
1042 int64_t TotalOffset = ImmOp->getImm() + Offset;
1043 if (TotalOffset == 0) {
1044 MI.setDesc(TII->get(AMDGPU::COPY));
1045
1046 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1047 MI.removeOperand(I);
1048
1049 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1050 } else {
1051 FIOp->ChangeToRegister(BaseReg, false);
1052 ImmOp->setImm(TotalOffset);
1053 }
1054
1055 return;
1056 }
1057 default:
1058 break;
1059 }
1060
1061 bool IsFlat = TII->isFLATScratch(MI);
1062
1063#ifndef NDEBUG
1064 // FIXME: Is it possible to be storing a frame index to itself?
1065 bool SeenFI = false;
1066 for (const MachineOperand &MO: MI.operands()) {
1067 if (MO.isFI()) {
1068 if (SeenFI)
1069 llvm_unreachable("should not see multiple frame indices");
1070
1071 SeenFI = true;
1072 }
1073 }
1074#endif
1075
1076 MachineOperand *FIOp =
1077 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1078 : AMDGPU::OpName::vaddr);
1079
1080 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1081 int64_t NewOffset = OffsetOp->getImm() + Offset;
1082
1083 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1084 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1085
1086 if (IsFlat) {
1087 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1088 SIInstrFlags::FlatScratch) &&
1089 "offset should be legal");
1090 FIOp->ChangeToRegister(BaseReg, false);
1091 OffsetOp->setImm(NewOffset);
1092 return;
1093 }
1094
1095#ifndef NDEBUG
1096 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1097 assert(SOffset->isImm() && SOffset->getImm() == 0);
1098#endif
1099
1100 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1101
1102 FIOp->ChangeToRegister(BaseReg, false);
1103 OffsetOp->setImm(NewOffset);
1104}
1105
1106bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1107 Register BaseReg,
1108 int64_t Offset) const {
1109
1110 switch (MI->getOpcode()) {
1111 case AMDGPU::V_ADD_U32_e32:
1112 case AMDGPU::V_ADD_CO_U32_e32:
1113 return true;
1114 case AMDGPU::V_ADD_U32_e64:
1115 case AMDGPU::V_ADD_CO_U32_e64:
1116 return true;
1117 default:
1118 break;
1119 }
1120
1121 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1122 return false;
1123
1124 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1125
1126 const SIInstrInfo *TII = ST.getInstrInfo();
1127 if (SIInstrInfo::isMUBUF(*MI))
1128 return TII->isLegalMUBUFImmOffset(NewOffset);
1129
1130 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1131 SIInstrFlags::FlatScratch);
1132}
1133
1134const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
1135 const MachineFunction &MF, unsigned Kind) const {
1136 // This is inaccurate. It depends on the instruction and address space. The
1137 // only place where we should hit this is for dealing with frame indexes /
1138 // private accesses, so this is correct in that case.
1139 return &AMDGPU::VGPR_32RegClass;
1140}
1141
1142const TargetRegisterClass *
1143SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1144 if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
1145 return getEquivalentVGPRClass(RC);
1146 if (RC == &AMDGPU::SCC_CLASSRegClass)
1147 return getWaveMaskRegClass();
1148
1149 return RC;
1150}
1151
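// Returns the spilled value's size in 32-bit subregisters; e.g.
// SI_SPILL_S256_SAVE covers a 256-bit SGPR tuple and yields 8.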
1152static unsigned getNumSubRegsForSpillOp(unsigned Op) {
1153
1154 switch (Op) {
1155 case AMDGPU::SI_SPILL_S1024_SAVE:
1156 case AMDGPU::SI_SPILL_S1024_RESTORE:
1157 case AMDGPU::SI_SPILL_V1024_SAVE:
1158 case AMDGPU::SI_SPILL_V1024_RESTORE:
1159 case AMDGPU::SI_SPILL_A1024_SAVE:
1160 case AMDGPU::SI_SPILL_A1024_RESTORE:
1161 case AMDGPU::SI_SPILL_AV1024_SAVE:
1162 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1163 return 32;
1164 case AMDGPU::SI_SPILL_S512_SAVE:
1165 case AMDGPU::SI_SPILL_S512_RESTORE:
1166 case AMDGPU::SI_SPILL_V512_SAVE:
1167 case AMDGPU::SI_SPILL_V512_RESTORE:
1168 case AMDGPU::SI_SPILL_A512_SAVE:
1169 case AMDGPU::SI_SPILL_A512_RESTORE:
1170 case AMDGPU::SI_SPILL_AV512_SAVE:
1171 case AMDGPU::SI_SPILL_AV512_RESTORE:
1172 return 16;
1173 case AMDGPU::SI_SPILL_S384_SAVE:
1174 case AMDGPU::SI_SPILL_S384_RESTORE:
1175 case AMDGPU::SI_SPILL_V384_SAVE:
1176 case AMDGPU::SI_SPILL_V384_RESTORE:
1177 case AMDGPU::SI_SPILL_A384_SAVE:
1178 case AMDGPU::SI_SPILL_A384_RESTORE:
1179 case AMDGPU::SI_SPILL_AV384_SAVE:
1180 case AMDGPU::SI_SPILL_AV384_RESTORE:
1181 return 12;
1182 case AMDGPU::SI_SPILL_S352_SAVE:
1183 case AMDGPU::SI_SPILL_S352_RESTORE:
1184 case AMDGPU::SI_SPILL_V352_SAVE:
1185 case AMDGPU::SI_SPILL_V352_RESTORE:
1186 case AMDGPU::SI_SPILL_A352_SAVE:
1187 case AMDGPU::SI_SPILL_A352_RESTORE:
1188 case AMDGPU::SI_SPILL_AV352_SAVE:
1189 case AMDGPU::SI_SPILL_AV352_RESTORE:
1190 return 11;
1191 case AMDGPU::SI_SPILL_S320_SAVE:
1192 case AMDGPU::SI_SPILL_S320_RESTORE:
1193 case AMDGPU::SI_SPILL_V320_SAVE:
1194 case AMDGPU::SI_SPILL_V320_RESTORE:
1195 case AMDGPU::SI_SPILL_A320_SAVE:
1196 case AMDGPU::SI_SPILL_A320_RESTORE:
1197 case AMDGPU::SI_SPILL_AV320_SAVE:
1198 case AMDGPU::SI_SPILL_AV320_RESTORE:
1199 return 10;
1200 case AMDGPU::SI_SPILL_S288_SAVE:
1201 case AMDGPU::SI_SPILL_S288_RESTORE:
1202 case AMDGPU::SI_SPILL_V288_SAVE:
1203 case AMDGPU::SI_SPILL_V288_RESTORE:
1204 case AMDGPU::SI_SPILL_A288_SAVE:
1205 case AMDGPU::SI_SPILL_A288_RESTORE:
1206 case AMDGPU::SI_SPILL_AV288_SAVE:
1207 case AMDGPU::SI_SPILL_AV288_RESTORE:
1208 return 9;
1209 case AMDGPU::SI_SPILL_S256_SAVE:
1210 case AMDGPU::SI_SPILL_S256_RESTORE:
1211 case AMDGPU::SI_SPILL_V256_SAVE:
1212 case AMDGPU::SI_SPILL_V256_RESTORE:
1213 case AMDGPU::SI_SPILL_A256_SAVE:
1214 case AMDGPU::SI_SPILL_A256_RESTORE:
1215 case AMDGPU::SI_SPILL_AV256_SAVE:
1216 case AMDGPU::SI_SPILL_AV256_RESTORE:
1217 return 8;
1218 case AMDGPU::SI_SPILL_S224_SAVE:
1219 case AMDGPU::SI_SPILL_S224_RESTORE:
1220 case AMDGPU::SI_SPILL_V224_SAVE:
1221 case AMDGPU::SI_SPILL_V224_RESTORE:
1222 case AMDGPU::SI_SPILL_A224_SAVE:
1223 case AMDGPU::SI_SPILL_A224_RESTORE:
1224 case AMDGPU::SI_SPILL_AV224_SAVE:
1225 case AMDGPU::SI_SPILL_AV224_RESTORE:
1226 return 7;
1227 case AMDGPU::SI_SPILL_S192_SAVE:
1228 case AMDGPU::SI_SPILL_S192_RESTORE:
1229 case AMDGPU::SI_SPILL_V192_SAVE:
1230 case AMDGPU::SI_SPILL_V192_RESTORE:
1231 case AMDGPU::SI_SPILL_A192_SAVE:
1232 case AMDGPU::SI_SPILL_A192_RESTORE:
1233 case AMDGPU::SI_SPILL_AV192_SAVE:
1234 case AMDGPU::SI_SPILL_AV192_RESTORE:
1235 return 6;
1236 case AMDGPU::SI_SPILL_S160_SAVE:
1237 case AMDGPU::SI_SPILL_S160_RESTORE:
1238 case AMDGPU::SI_SPILL_V160_SAVE:
1239 case AMDGPU::SI_SPILL_V160_RESTORE:
1240 case AMDGPU::SI_SPILL_A160_SAVE:
1241 case AMDGPU::SI_SPILL_A160_RESTORE:
1242 case AMDGPU::SI_SPILL_AV160_SAVE:
1243 case AMDGPU::SI_SPILL_AV160_RESTORE:
1244 return 5;
1245 case AMDGPU::SI_SPILL_S128_SAVE:
1246 case AMDGPU::SI_SPILL_S128_RESTORE:
1247 case AMDGPU::SI_SPILL_V128_SAVE:
1248 case AMDGPU::SI_SPILL_V128_RESTORE:
1249 case AMDGPU::SI_SPILL_A128_SAVE:
1250 case AMDGPU::SI_SPILL_A128_RESTORE:
1251 case AMDGPU::SI_SPILL_AV128_SAVE:
1252 case AMDGPU::SI_SPILL_AV128_RESTORE:
1253 return 4;
1254 case AMDGPU::SI_SPILL_S96_SAVE:
1255 case AMDGPU::SI_SPILL_S96_RESTORE:
1256 case AMDGPU::SI_SPILL_V96_SAVE:
1257 case AMDGPU::SI_SPILL_V96_RESTORE:
1258 case AMDGPU::SI_SPILL_A96_SAVE:
1259 case AMDGPU::SI_SPILL_A96_RESTORE:
1260 case AMDGPU::SI_SPILL_AV96_SAVE:
1261 case AMDGPU::SI_SPILL_AV96_RESTORE:
1262 return 3;
1263 case AMDGPU::SI_SPILL_S64_SAVE:
1264 case AMDGPU::SI_SPILL_S64_RESTORE:
1265 case AMDGPU::SI_SPILL_V64_SAVE:
1266 case AMDGPU::SI_SPILL_V64_RESTORE:
1267 case AMDGPU::SI_SPILL_A64_SAVE:
1268 case AMDGPU::SI_SPILL_A64_RESTORE:
1269 case AMDGPU::SI_SPILL_AV64_SAVE:
1270 case AMDGPU::SI_SPILL_AV64_RESTORE:
1271 return 2;
1272 case AMDGPU::SI_SPILL_S32_SAVE:
1273 case AMDGPU::SI_SPILL_S32_RESTORE:
1274 case AMDGPU::SI_SPILL_V32_SAVE:
1275 case AMDGPU::SI_SPILL_V32_RESTORE:
1276 case AMDGPU::SI_SPILL_A32_SAVE:
1277 case AMDGPU::SI_SPILL_A32_RESTORE:
1278 case AMDGPU::SI_SPILL_AV32_SAVE:
1279 case AMDGPU::SI_SPILL_AV32_RESTORE:
1280 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1281 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1282 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1283 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1284 return 1;
1285 default: llvm_unreachable("Invalid spill opcode");
1286 }
1287}
1288
1289static int getOffsetMUBUFStore(unsigned Opc) {
1290 switch (Opc) {
1291 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1292 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1293 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1294 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1295 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1296 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1297 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1298 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1299 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1300 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1301 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1302 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1303 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1304 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1305 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1306 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1307 default:
1308 return -1;
1309 }
1310}
1311
1312static int getOffsetMUBUFLoad(unsigned Opc) {
1313 switch (Opc) {
1314 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1315 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1316 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1317 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1318 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1319 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1320 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1321 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1322 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1323 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1324 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1325 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1326 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1327 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1328 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1329 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1330 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1331 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1332 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1333 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1334 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1335 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1336 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1337 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1338 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1339 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1340 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1341 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1342 default:
1343 return -1;
1344 }
1345}
1346
1347static int getOffenMUBUFStore(unsigned Opc) {
1348 switch (Opc) {
1349 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1350 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1351 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1352 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1353 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1354 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1355 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1356 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1357 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1358 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1359 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1360 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1361 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1362 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1363 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1364 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1365 default:
1366 return -1;
1367 }
1368}
1369
1370static int getOffenMUBUFLoad(unsigned Opc) {
1371 switch (Opc) {
1372 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1373 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1374 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1375 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1376 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1377 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1378 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1379 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1380 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1381 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1382 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1383 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1384 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1385 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1386 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1387 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1388 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1389 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1390 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1391 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1392 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1393 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1394 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1395 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1396 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1397 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1398 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1399 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1400 default:
1401 return -1;
1402 }
1403}
1404
1405static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1406 MachineBasicBlock &MBB,
1407 MachineBasicBlock::iterator MI,
1408 int Index, unsigned Lane,
1409 unsigned ValueReg, bool IsKill) {
1410 MachineFunction *MF = MBB.getParent();
1411 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1412 const SIInstrInfo *TII = ST.getInstrInfo();
1413
1414 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1415
1416 if (Reg == AMDGPU::NoRegister)
1417 return MachineInstrBuilder();
1418
1419 bool IsStore = MI->mayStore();
1420 MachineRegisterInfo &MRI = MF->getRegInfo();
1421 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1422
1423 unsigned Dst = IsStore ? Reg : ValueReg;
1424 unsigned Src = IsStore ? ValueReg : Reg;
1425 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1426 DebugLoc DL = MI->getDebugLoc();
1427 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1428 // Spiller during regalloc may restore a spilled register to its superclass.
1429 // It could result in AGPR spills restored to VGPRs or the other way around,
1430 // leaving the src and dst with identical regclasses at this point. It just
1431 // needs a copy in such cases.
1432 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1433 .addReg(Src, getKillRegState(IsKill));
1435 return CopyMIB;
1436 }
1437 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1438 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1439
1440 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1441 .addReg(Src, getKillRegState(IsKill));
1443 return MIB;
1444}
1445
1446// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1447// need to handle the case where an SGPR may need to be spilled while spilling.
1448static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1449 MachineFrameInfo &MFI,
1450 MachineBasicBlock::iterator MI,
1451 int Index,
1452 int64_t Offset) {
1453 const SIInstrInfo *TII = ST.getInstrInfo();
1454 MachineBasicBlock *MBB = MI->getParent();
1455 const DebugLoc &DL = MI->getDebugLoc();
1456 bool IsStore = MI->mayStore();
1457
1458 unsigned Opc = MI->getOpcode();
1459 int LoadStoreOp = IsStore ?
1460 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1461 if (LoadStoreOp == -1)
1462 return false;
1463
1464 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1465 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1466 return true;
1467
1468 MachineInstrBuilder NewMI =
1469 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1470 .add(*Reg)
1471 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1472 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1473 .addImm(Offset)
1474 .addImm(0) // cpol
1475 .addImm(0) // swz
1476 .cloneMemRefs(*MI);
1477
1478 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1479 AMDGPU::OpName::vdata_in);
1480 if (VDataIn)
1481 NewMI.add(*VDataIn);
1482 return true;
1483}
1484
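// Picks the flat-scratch opcode for one element of a spill. For illustration,
// an EltSize of 16 selects the DWORDX4 SADDR opcode, which is then rewritten
// to its SV form when a VGPR address is present, or to its ST form when
// neither a VGPR nor an SGPR address is used.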
1485static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1486 unsigned LoadStoreOp,
1487 unsigned EltSize) {
1488 bool IsStore = TII->get(LoadStoreOp).mayStore();
1489 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1490 bool UseST =
1491 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1492
1493 switch (EltSize) {
1494 case 4:
1495 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1496 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1497 break;
1498 case 8:
1499 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1500 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1501 break;
1502 case 12:
1503 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1504 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1505 break;
1506 case 16:
1507 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1508 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1509 break;
1510 default:
1511 llvm_unreachable("Unexpected spill load/store size!");
1512 }
1513
1514 if (HasVAddr)
1515 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1516 else if (UseST)
1517 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1518
1519 return LoadStoreOp;
1520}
1521
1522void SIRegisterInfo::buildSpillLoadStore(
1523 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1524 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1525 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1526 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1527 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1528
1529 MachineFunction *MF = MBB.getParent();
1530 const SIInstrInfo *TII = ST.getInstrInfo();
1531 const MachineFrameInfo &MFI = MF->getFrameInfo();
1532 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1533
1534 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1535 bool IsStore = Desc->mayStore();
1536 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1537
1538 bool CanClobberSCC = false;
1539 bool Scavenged = false;
1540 MCRegister SOffset = ScratchOffsetReg;
1541
1542 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1543 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1544 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1545 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1546
1547 // Always use 4 byte operations for AGPRs because we need to scavenge
1548 // a temporary VGPR.
1549 unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
1550 unsigned NumSubRegs = RegWidth / EltSize;
1551 unsigned Size = NumSubRegs * EltSize;
1552 unsigned RemSize = RegWidth - Size;
1553 unsigned NumRemSubRegs = RemSize ? 1 : 0;
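 // Worked example (flat scratch, non-AGPR, 20-byte value): EltSize = 16,
 // NumSubRegs = 1, Size = 16, RemSize = 4, NumRemSubRegs = 1, so one DWORDX4
 // access is emitted plus one DWORD access for the remaining tail.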
1554 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1555 int64_t MaterializedOffset = Offset;
1556
1557 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1558 int64_t ScratchOffsetRegDelta = 0;
1559
1560 if (IsFlat && EltSize > 4) {
1561 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1562 Desc = &TII->get(LoadStoreOp);
1563 }
1564
1565 Align Alignment = MFI.getObjectAlign(Index);
1566 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1567
1568 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1569 "unexpected VGPR spill offset");
1570
1571 // Track a VGPR to use for a constant offset we need to materialize.
1572 Register TmpOffsetVGPR;
1573
1574 // Track a VGPR to use as an intermediate value.
1575 Register TmpIntermediateVGPR;
1576 bool UseVGPROffset = false;
1577
1578 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1579 // combination.
1580 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1581 int64_t VOffset) {
1582 // We are using a VGPR offset
1583 if (IsFlat && SGPRBase) {
1584 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1585 // SGPR, so perform the add as vector.
1586 // We don't need a base SGPR in the kernel.
1587
1588 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1589 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1590 .addReg(SGPRBase)
1591 .addImm(VOffset)
1592 .addImm(0); // clamp
1593 } else {
1594 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1595 .addReg(SGPRBase);
1596 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1597 .addImm(VOffset)
1598 .addReg(TmpOffsetVGPR);
1599 }
1600 } else {
1601 assert(TmpOffsetVGPR);
1602 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1603 .addImm(VOffset);
1604 }
1605 };
1606
1607 bool IsOffsetLegal =
1608 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1609 SIInstrFlags::FlatScratch)
1610 : TII->isLegalMUBUFImmOffset(MaxOffset);
1611 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1612 SOffset = MCRegister();
1613
1614 // We don't have access to the register scavenger if this function is called
1615 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1616 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1617 // entry.
1618 if (RS) {
1619 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1620
1621 // Piggy back on the liveness scan we just did to see if SCC is dead.
1622 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1623 } else if (LiveUnits) {
1624 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1625 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1626 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1627 SOffset = Reg;
1628 break;
1629 }
1630 }
1631 }
1632
1633 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1634 SOffset = Register();
1635
1636 if (!SOffset) {
1637 UseVGPROffset = true;
1638
1639 if (RS) {
1640 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1641 } else {
1642 assert(LiveUnits);
1643 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1644 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1645 TmpOffsetVGPR = Reg;
1646 break;
1647 }
1648 }
1649 }
1650
1651 assert(TmpOffsetVGPR);
1652 } else if (!SOffset && CanClobberSCC) {
1653 // There are no free SGPRs, and we are in the process of spilling
1654 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1655 // on SI/CI and on VI it is true until we implement spilling using scalar
1656 // stores), we have no way to free up an SGPR. Our solution here is to
1657 // add the offset directly to the ScratchOffset or StackPtrOffset
1658 // register, and then subtract the offset after the spill to return the
1659 // register to its original value.
1660
1661 // TODO: If we don't have to do an emergency stack slot spill, converting
1662 // to use the VGPR offset is fewer instructions.
1663 if (!ScratchOffsetReg)
1664 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1665 SOffset = ScratchOffsetReg;
1666 ScratchOffsetRegDelta = Offset;
1667 } else {
1668 Scavenged = true;
1669 }
1670
1671 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1672 // we can simplify the adjustment of Offset here to just scale with
1673 // WavefrontSize.
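 // For example, in a wave64 function a per-lane frame offset of 4 bytes is
 // scaled to 4 * 64 = 256 before it is placed in the scratch offset register.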
1674 if (!IsFlat && !UseVGPROffset)
1675 Offset *= ST.getWavefrontSize();
1676
1677 if (!UseVGPROffset && !SOffset)
1678 report_fatal_error("could not scavenge SGPR to spill in entry function");
1679
1680 if (UseVGPROffset) {
1681 // We are using a VGPR offset
1682 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1683 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1684 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1685 } else {
1686 assert(Offset != 0);
1687 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1688 .addReg(ScratchOffsetReg)
1689 .addImm(Offset);
1690 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1691 }
1692
1693 Offset = 0;
1694 }
1695
1696 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1697 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1698 && "Unexpected vaddr for flat scratch with a FI operand");
1699
1700 if (UseVGPROffset) {
1701 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1702 } else {
1704 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1705 }
1706
1707 Desc = &TII->get(LoadStoreOp);
1708 }
1709
1710 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1711 ++i, RegOffset += EltSize) {
1712 if (i == NumSubRegs) {
1713 EltSize = RemSize;
1714 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1715 }
1716 Desc = &TII->get(LoadStoreOp);
1717
1718 if (!IsFlat && UseVGPROffset) {
1719 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1720 : getOffenMUBUFLoad(LoadStoreOp);
1721 Desc = &TII->get(NewLoadStoreOp);
1722 }
1723
1724 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1725 // If we are spilling an AGPR beyond the range of the memory instruction
1726 // offset and need to use a VGPR offset, we ideally have at least 2
1727 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1728 // recycle the VGPR used for the offset, which requires resetting it after
1729 // each subregister.
1730
1731 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1732 }
1733
1734 unsigned NumRegs = EltSize / 4;
1735 Register SubReg = e == 1
1736 ? ValueReg
1737 : Register(getSubReg(ValueReg,
1738 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1739
1740 unsigned SOffsetRegState = 0;
1741 unsigned SrcDstRegState = getDefRegState(!IsStore);
1742 const bool IsLastSubReg = i + 1 == e;
1743 const bool IsFirstSubReg = i == 0;
1744 if (IsLastSubReg) {
1745 SOffsetRegState |= getKillRegState(Scavenged);
1746 // The last implicit use carries the "Kill" flag.
1747 SrcDstRegState |= getKillRegState(IsKill);
1748 }
1749
1750 // Make sure the whole register is defined if there are undef components by
1751 // adding an implicit def of the super-reg on the first instruction.
1752 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1753 bool NeedSuperRegImpOperand = e > 1;
1754
1755 // Remaining element size to spill into memory after some parts of it
1756 // have been spilled into either AGPRs or VGPRs.
1757 unsigned RemEltSize = EltSize;
1758
1759 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1760 // order, starting from the last lane. If a register cannot be completely
1761 // spilled into another register, this ensures its alignment does not
1762 // change. For targets with a VGPR alignment requirement this is important
1763 // when flat scratch is used, as we might otherwise get a scratch_load or
1764 // scratch_store of an unaligned register.
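// For example, if only one AGPR is free while spilling a 64-bit VGPR pair,
// the high half (last lane) goes to the AGPR and the memory access covers
// the remaining low half, which still starts at the aligned base
// (illustrative note).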
1765 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1766 LaneE = RegOffset / 4;
1767 Lane >= LaneE; --Lane) {
1768 bool IsSubReg = e > 1 || EltSize > 4;
1769 Register Sub = IsSubReg
1770 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1771 : ValueReg;
1772 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1773 if (!MIB.getInstr())
1774 break;
1775 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1776 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1777 NeedSuperRegDef = false;
1778 }
1779 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1780 NeedSuperRegImpOperand = true;
1781 unsigned State = SrcDstRegState;
1782 if (!IsLastSubReg || (Lane != LaneE))
1783 State &= ~RegState::Kill;
1784 if (!IsFirstSubReg || (Lane != LaneS))
1785 State &= ~RegState::Define;
1786 MIB.addReg(ValueReg, RegState::Implicit | State);
1787 }
1788 RemEltSize -= 4;
1789 }
1790
1791 if (!RemEltSize) // Fully spilled into AGPRs.
1792 continue;
1793
1794 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1795 assert(IsFlat && EltSize > 4);
1796
1797 unsigned NumRegs = RemEltSize / 4;
1798 SubReg = Register(getSubReg(ValueReg,
1799 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1800 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1801 Desc = &TII->get(Opc);
1802 }
1803
1804 unsigned FinalReg = SubReg;
1805
1806 if (IsAGPR) {
1807 assert(EltSize == 4);
1808
1809 if (!TmpIntermediateVGPR) {
1810 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1811 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1812 }
1813 if (IsStore) {
1814 auto AccRead = BuildMI(MBB, MI, DL,
1815 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1816 TmpIntermediateVGPR)
1817 .addReg(SubReg, getKillRegState(IsKill));
1818 if (NeedSuperRegDef)
1819 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1820 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1821 AccRead.addReg(ValueReg, RegState::Implicit);
1823 }
1824 SubReg = TmpIntermediateVGPR;
1825 } else if (UseVGPROffset) {
1826 if (!TmpOffsetVGPR) {
1827 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1828 MI, false, 0);
1829 RS->setRegUsed(TmpOffsetVGPR);
1830 }
1831 }
1832
1833 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1834 MachineMemOperand *NewMMO =
1835 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1836 commonAlignment(Alignment, RegOffset));
1837
1838 auto MIB =
1839 BuildMI(MBB, MI, DL, *Desc)
1840 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1841
1842 if (UseVGPROffset) {
1843 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1844 // intermediate accvgpr_write.
1845 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1846 }
1847
1848 if (!IsFlat)
1849 MIB.addReg(FuncInfo->getScratchRSrcReg());
1850
1851 if (SOffset == AMDGPU::NoRegister) {
1852 if (!IsFlat) {
1853 if (UseVGPROffset && ScratchOffsetReg) {
1854 MIB.addReg(ScratchOffsetReg);
1855 } else {
1856 assert(FuncInfo->isBottomOfStack());
1857 MIB.addImm(0);
1858 }
1859 }
1860 } else {
1861 MIB.addReg(SOffset, SOffsetRegState);
1862 }
1863
1864 MIB.addImm(Offset + RegOffset);
1865
1866 bool LastUse = MMO->getFlags() & MOLastUse;
1867 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1868
1869 if (!IsFlat)
1870 MIB.addImm(0); // swz
1871 MIB.addMemOperand(NewMMO);
1872
1873 if (!IsAGPR && NeedSuperRegDef)
1874 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1875
1876 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1877 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1878 FinalReg)
1879 .addReg(TmpIntermediateVGPR, RegState::Kill);
1881 }
1882
1883 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1884 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1885
1886 // The epilog restore of a wwm-scratch register can cause undesired
1887 // optimization during machine-cp post PrologEpilogInserter if the same
1888 // register was assigned for return value ABI lowering with a COPY
1889 // instruction. As shown below, with the epilog reload, the earlier COPY
1890 // appears to be dead during machine-cp.
1891 // ...
1892 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1893 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1894 // ...
1895 // Epilog block:
1896 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1897 // ...
1898 // WWM spill restore to preserve the inactive lanes of v0.
1899 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1900 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1901 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1902 // ...
1903 // SI_RETURN implicit $vgpr0
1904 // ...
1905 // To fix it, mark the same reg as a tied op for such restore instructions
1906 // so that it marks a usage for the preceding COPY.
1907 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1908 MI->readsRegister(SubReg, this)) {
1909 MIB.addReg(SubReg, RegState::Implicit);
1910 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1911 }
1912 }
1913
1914 if (ScratchOffsetRegDelta != 0) {
1915 // Subtract the offset we added to the ScratchOffset register.
1916 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1917 .addReg(SOffset)
1918 .addImm(-ScratchOffsetRegDelta);
1919 }
1920}
1921
1922 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1923 int Offset, bool IsLoad,
1924 bool IsKill) const {
1925 // Load/store VGPR
1926 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1927 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1928
1929 Register FrameReg =
1930 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1931 ? getBaseRegister()
1932 : getFrameRegister(SB.MF);
1933
1934 Align Alignment = FrameInfo.getObjectAlign(Index);
1935 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1936 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1937 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1938 SB.EltSize, Alignment);
1939
1940 if (IsLoad) {
1941 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1942 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1943 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1944 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1945 } else {
1946 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1947 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1948 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1949 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1950 // This only ever adds one VGPR spill
1951 SB.MFI.addToSpilledVGPRs(1);
1952 }
1953}
1954
1955 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1956 RegScavenger *RS, SlotIndexes *Indexes,
1957 LiveIntervals *LIS, bool OnlyToVGPR,
1958 bool SpillToPhysVGPRLane) const {
1959 assert(!MI->getOperand(0).isUndef() &&
1960 "undef spill should have been deleted earlier");
1961
1962 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1963
1964 ArrayRef<SpilledReg> VGPRSpills =
1965 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1966 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1967 bool SpillToVGPR = !VGPRSpills.empty();
1968 if (OnlyToVGPR && !SpillToVGPR)
1969 return false;
1970
1971 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1972 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1973
1974 if (SpillToVGPR) {
1975
1976 // Since the stack slot coloring pass tries to optimize SGPR spills,
1977 // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
1978 // spills of different sizes. The number of VGPR lanes allotted therefore
1979 // matches the largest SGPR being spilled into them.
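// For example, if stack slot coloring merges a 64-bit and a 32-bit SGPR
// spill into one slot, the slot keeps two VGPR lanes, so a 32-bit spill may
// see more lanes than it needs.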
1980 assert(SB.NumSubRegs <= VGPRSpills.size() &&
1981 "Num of SGPRs spilled should be less than or equal to num of "
1982 "the VGPR lanes.");
1983
1984 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1985 Register SubReg =
1986 SB.NumSubRegs == 1
1987 ? SB.SuperReg
1988 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1989 SpilledReg Spill = VGPRSpills[i];
1990
1991 bool IsFirstSubreg = i == 0;
1992 bool IsLastSubreg = i == SB.NumSubRegs - 1;
1993 bool UseKill = SB.IsKill && IsLastSubreg;
1994
1995
1996 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1997 // spill to this specific vgpr in the first basic block.
1998 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1999 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2000 .addReg(SubReg, getKillRegState(UseKill))
2001 .addImm(Spill.Lane)
2002 .addReg(Spill.VGPR);
2003 if (Indexes) {
2004 if (IsFirstSubreg)
2005 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2006 else
2007 Indexes->insertMachineInstrInMaps(*MIB);
2008 }
2009
2010 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2011 // We may be spilling a super-register which is only partially defined,
2012 // and need to ensure later spills think the value is defined.
2013 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2014 }
2015
2016 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2017 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2018
2019 // FIXME: Since this spills to another register instead of an actual
2020 // frame index, we should delete the frame index when all references to
2021 // it are fixed.
2022 }
2023 } else {
2024 SB.prepare();
2025
2026 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2027 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2028
2029 // Per VGPR helper data
2030 auto PVD = SB.getPerVGPRData();
2031
2032 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2033 unsigned TmpVGPRFlags = RegState::Undef;
2034
2035 // Write sub registers into the VGPR
2036 for (unsigned i = Offset * PVD.PerVGPR,
2037 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2038 i < e; ++i) {
2039 Register SubReg =
2040 SB.NumSubRegs == 1
2041 ? SB.SuperReg
2042 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2043
2044 MachineInstrBuilder WriteLane =
2045 BuildMI(*SB.MBB, MI, SB.DL,
2046 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2047 .addReg(SubReg, SubKillState)
2048 .addImm(i % PVD.PerVGPR)
2049 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2050 TmpVGPRFlags = 0;
2051
2052 if (Indexes) {
2053 if (i == 0)
2054 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2055 else
2056 Indexes->insertMachineInstrInMaps(*WriteLane);
2057 }
2058
2059 // There could be undef components of a spilled super register.
2060 // TODO: Can we detect this and skip the spill?
2061 if (SB.NumSubRegs > 1) {
2062 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2063 unsigned SuperKillState = 0;
2064 if (i + 1 == SB.NumSubRegs)
2065 SuperKillState |= getKillRegState(SB.IsKill);
2066 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2067 }
2068 }
2069
2070 // Write out VGPR
2071 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2072 }
2073
2074 SB.restore();
2075 }
2076
2077 MI->eraseFromParent();
2078 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2079
2080 if (LIS)
2081 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2082
2083 return true;
2084}
2085
2086 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2087 RegScavenger *RS, SlotIndexes *Indexes,
2088 LiveIntervals *LIS, bool OnlyToVGPR,
2089 bool SpillToPhysVGPRLane) const {
2090 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2091
2092 ArrayRef<SpilledReg> VGPRSpills =
2093 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2094 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2095 bool SpillToVGPR = !VGPRSpills.empty();
2096 if (OnlyToVGPR && !SpillToVGPR)
2097 return false;
2098
2099 if (SpillToVGPR) {
2100 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2101 Register SubReg =
2102 SB.NumSubRegs == 1
2103 ? SB.SuperReg
2104 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2105
2106 SpilledReg Spill = VGPRSpills[i];
2107 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2108 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2109 .addReg(Spill.VGPR)
2110 .addImm(Spill.Lane);
2111 if (SB.NumSubRegs > 1 && i == 0)
2112 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2113 if (Indexes) {
2114 if (i == e - 1)
2115 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2116 else
2117 Indexes->insertMachineInstrInMaps(*MIB);
2118 }
2119 }
2120 } else {
2121 SB.prepare();
2122
2123 // Per VGPR helper data
2124 auto PVD = SB.getPerVGPRData();
2125
2126 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2127 // Load in VGPR data
2128 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2129
2130 // Unpack lanes
2131 for (unsigned i = Offset * PVD.PerVGPR,
2132 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2133 i < e; ++i) {
2134 Register SubReg =
2135 SB.NumSubRegs == 1
2136 ? SB.SuperReg
2137 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2138
2139 bool LastSubReg = (i + 1 == e);
2140 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2141 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2142 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2143 .addImm(i);
2144 if (SB.NumSubRegs > 1 && i == 0)
2145 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2146 if (Indexes) {
2147 if (i == e - 1)
2148 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2149 else
2150 Indexes->insertMachineInstrInMaps(*MIB);
2151 }
2152 }
2153 }
2154
2155 SB.restore();
2156 }
2157
2158 MI->eraseFromParent();
2159
2160 if (LIS)
2161 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2162
2163 return true;
2164}
2165
2166 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2167 MachineBasicBlock &RestoreMBB,
2168 Register SGPR, RegScavenger *RS) const {
2169 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2170 RS);
2171 SB.prepare();
2172 // Generate the spill of SGPR to SB.TmpVGPR.
2173 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2174 auto PVD = SB.getPerVGPRData();
2175 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2176 unsigned TmpVGPRFlags = RegState::Undef;
2177 // Write sub registers into the VGPR
2178 for (unsigned i = Offset * PVD.PerVGPR,
2179 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2180 i < e; ++i) {
2181 Register SubReg =
2182 SB.NumSubRegs == 1
2183 ? SB.SuperReg
2184 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2185
2186 MachineInstrBuilder WriteLane =
2187 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2188 SB.TmpVGPR)
2189 .addReg(SubReg, SubKillState)
2190 .addImm(i % PVD.PerVGPR)
2191 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2192 TmpVGPRFlags = 0;
2193 // There could be undef components of a spilled super register.
2194 // TODO: Can we detect this and skip the spill?
2195 if (SB.NumSubRegs > 1) {
2196 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2197 unsigned SuperKillState = 0;
2198 if (i + 1 == SB.NumSubRegs)
2199 SuperKillState |= getKillRegState(SB.IsKill);
2200 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2201 }
2202 }
2203 // Don't need to write VGPR out.
2204 }
2205
2206 // Restore clobbered registers in the specified restore block.
2207 MI = RestoreMBB.end();
2208 SB.setMI(&RestoreMBB, MI);
2209 // Generate the restore of SGPR from SB.TmpVGPR.
2210 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2211 // Don't need to load VGPR in.
2212 // Unpack lanes
2213 for (unsigned i = Offset * PVD.PerVGPR,
2214 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2215 i < e; ++i) {
2216 Register SubReg =
2217 SB.NumSubRegs == 1
2218 ? SB.SuperReg
2219 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2220 bool LastSubReg = (i + 1 == e);
2221 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2222 SubReg)
2223 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2224 .addImm(i);
2225 if (SB.NumSubRegs > 1 && i == 0)
2226 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2227 }
2228 }
2229 SB.restore();
2230
2232 return false;
2233}
2234
2235/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2236/// a VGPR and the stack slot can be safely eliminated when all other users are
2237/// handled.
2238 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2239 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2240 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2241 switch (MI->getOpcode()) {
2242 case AMDGPU::SI_SPILL_S1024_SAVE:
2243 case AMDGPU::SI_SPILL_S512_SAVE:
2244 case AMDGPU::SI_SPILL_S384_SAVE:
2245 case AMDGPU::SI_SPILL_S352_SAVE:
2246 case AMDGPU::SI_SPILL_S320_SAVE:
2247 case AMDGPU::SI_SPILL_S288_SAVE:
2248 case AMDGPU::SI_SPILL_S256_SAVE:
2249 case AMDGPU::SI_SPILL_S224_SAVE:
2250 case AMDGPU::SI_SPILL_S192_SAVE:
2251 case AMDGPU::SI_SPILL_S160_SAVE:
2252 case AMDGPU::SI_SPILL_S128_SAVE:
2253 case AMDGPU::SI_SPILL_S96_SAVE:
2254 case AMDGPU::SI_SPILL_S64_SAVE:
2255 case AMDGPU::SI_SPILL_S32_SAVE:
2256 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2257 case AMDGPU::SI_SPILL_S1024_RESTORE:
2258 case AMDGPU::SI_SPILL_S512_RESTORE:
2259 case AMDGPU::SI_SPILL_S384_RESTORE:
2260 case AMDGPU::SI_SPILL_S352_RESTORE:
2261 case AMDGPU::SI_SPILL_S320_RESTORE:
2262 case AMDGPU::SI_SPILL_S288_RESTORE:
2263 case AMDGPU::SI_SPILL_S256_RESTORE:
2264 case AMDGPU::SI_SPILL_S224_RESTORE:
2265 case AMDGPU::SI_SPILL_S192_RESTORE:
2266 case AMDGPU::SI_SPILL_S160_RESTORE:
2267 case AMDGPU::SI_SPILL_S128_RESTORE:
2268 case AMDGPU::SI_SPILL_S96_RESTORE:
2269 case AMDGPU::SI_SPILL_S64_RESTORE:
2270 case AMDGPU::SI_SPILL_S32_RESTORE:
2271 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2272 default:
2273 llvm_unreachable("not an SGPR spill instruction");
2274 }
2275}
2276
2277 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2278 int SPAdj, unsigned FIOperandNum,
2279 RegScavenger *RS) const {
2280 MachineFunction *MF = MI->getParent()->getParent();
2281 MachineBasicBlock *MBB = MI->getParent();
2282 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2283 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2284 const SIInstrInfo *TII = ST.getInstrInfo();
2285 const DebugLoc &DL = MI->getDebugLoc();
2286
2287 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2288
2289 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2290 "unreserved scratch RSRC register");
2290 "unreserved scratch RSRC register");
2291
2292 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2293 int Index = MI->getOperand(FIOperandNum).getIndex();
2294
2295 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2296 ? getBaseRegister()
2297 : getFrameRegister(*MF);
2298
2299 switch (MI->getOpcode()) {
2300 // SGPR register spill
2301 case AMDGPU::SI_SPILL_S1024_SAVE:
2302 case AMDGPU::SI_SPILL_S512_SAVE:
2303 case AMDGPU::SI_SPILL_S384_SAVE:
2304 case AMDGPU::SI_SPILL_S352_SAVE:
2305 case AMDGPU::SI_SPILL_S320_SAVE:
2306 case AMDGPU::SI_SPILL_S288_SAVE:
2307 case AMDGPU::SI_SPILL_S256_SAVE:
2308 case AMDGPU::SI_SPILL_S224_SAVE:
2309 case AMDGPU::SI_SPILL_S192_SAVE:
2310 case AMDGPU::SI_SPILL_S160_SAVE:
2311 case AMDGPU::SI_SPILL_S128_SAVE:
2312 case AMDGPU::SI_SPILL_S96_SAVE:
2313 case AMDGPU::SI_SPILL_S64_SAVE:
2314 case AMDGPU::SI_SPILL_S32_SAVE: {
2315 return spillSGPR(MI, Index, RS);
2316 }
2317
2318 // SGPR register restore
2319 case AMDGPU::SI_SPILL_S1024_RESTORE:
2320 case AMDGPU::SI_SPILL_S512_RESTORE:
2321 case AMDGPU::SI_SPILL_S384_RESTORE:
2322 case AMDGPU::SI_SPILL_S352_RESTORE:
2323 case AMDGPU::SI_SPILL_S320_RESTORE:
2324 case AMDGPU::SI_SPILL_S288_RESTORE:
2325 case AMDGPU::SI_SPILL_S256_RESTORE:
2326 case AMDGPU::SI_SPILL_S224_RESTORE:
2327 case AMDGPU::SI_SPILL_S192_RESTORE:
2328 case AMDGPU::SI_SPILL_S160_RESTORE:
2329 case AMDGPU::SI_SPILL_S128_RESTORE:
2330 case AMDGPU::SI_SPILL_S96_RESTORE:
2331 case AMDGPU::SI_SPILL_S64_RESTORE:
2332 case AMDGPU::SI_SPILL_S32_RESTORE: {
2333 return restoreSGPR(MI, Index, RS);
2334 }
2335
2336 // VGPR register spill
2337 case AMDGPU::SI_SPILL_V1024_SAVE:
2338 case AMDGPU::SI_SPILL_V512_SAVE:
2339 case AMDGPU::SI_SPILL_V384_SAVE:
2340 case AMDGPU::SI_SPILL_V352_SAVE:
2341 case AMDGPU::SI_SPILL_V320_SAVE:
2342 case AMDGPU::SI_SPILL_V288_SAVE:
2343 case AMDGPU::SI_SPILL_V256_SAVE:
2344 case AMDGPU::SI_SPILL_V224_SAVE:
2345 case AMDGPU::SI_SPILL_V192_SAVE:
2346 case AMDGPU::SI_SPILL_V160_SAVE:
2347 case AMDGPU::SI_SPILL_V128_SAVE:
2348 case AMDGPU::SI_SPILL_V96_SAVE:
2349 case AMDGPU::SI_SPILL_V64_SAVE:
2350 case AMDGPU::SI_SPILL_V32_SAVE:
2351 case AMDGPU::SI_SPILL_A1024_SAVE:
2352 case AMDGPU::SI_SPILL_A512_SAVE:
2353 case AMDGPU::SI_SPILL_A384_SAVE:
2354 case AMDGPU::SI_SPILL_A352_SAVE:
2355 case AMDGPU::SI_SPILL_A320_SAVE:
2356 case AMDGPU::SI_SPILL_A288_SAVE:
2357 case AMDGPU::SI_SPILL_A256_SAVE:
2358 case AMDGPU::SI_SPILL_A224_SAVE:
2359 case AMDGPU::SI_SPILL_A192_SAVE:
2360 case AMDGPU::SI_SPILL_A160_SAVE:
2361 case AMDGPU::SI_SPILL_A128_SAVE:
2362 case AMDGPU::SI_SPILL_A96_SAVE:
2363 case AMDGPU::SI_SPILL_A64_SAVE:
2364 case AMDGPU::SI_SPILL_A32_SAVE:
2365 case AMDGPU::SI_SPILL_AV1024_SAVE:
2366 case AMDGPU::SI_SPILL_AV512_SAVE:
2367 case AMDGPU::SI_SPILL_AV384_SAVE:
2368 case AMDGPU::SI_SPILL_AV352_SAVE:
2369 case AMDGPU::SI_SPILL_AV320_SAVE:
2370 case AMDGPU::SI_SPILL_AV288_SAVE:
2371 case AMDGPU::SI_SPILL_AV256_SAVE:
2372 case AMDGPU::SI_SPILL_AV224_SAVE:
2373 case AMDGPU::SI_SPILL_AV192_SAVE:
2374 case AMDGPU::SI_SPILL_AV160_SAVE:
2375 case AMDGPU::SI_SPILL_AV128_SAVE:
2376 case AMDGPU::SI_SPILL_AV96_SAVE:
2377 case AMDGPU::SI_SPILL_AV64_SAVE:
2378 case AMDGPU::SI_SPILL_AV32_SAVE:
2379 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2380 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2381 const MachineOperand *VData = TII->getNamedOperand(*MI,
2382 AMDGPU::OpName::vdata);
2383 if (VData->isUndef()) {
2384 MI->eraseFromParent();
2385 return true;
2386 }
2387
2388 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2389 MFI->getStackPtrOffsetReg());
2390
2391 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2392 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2393 auto *MBB = MI->getParent();
2394 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2395 if (IsWWMRegSpill) {
2396 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2397 RS->isRegUsed(AMDGPU::SCC));
2398 }
2399 buildSpillLoadStore(
2400 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2401 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2402 *MI->memoperands_begin(), RS);
2403 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2404 if (IsWWMRegSpill)
2405 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2406
2407 MI->eraseFromParent();
2408 return true;
2409 }
2410 case AMDGPU::SI_SPILL_V32_RESTORE:
2411 case AMDGPU::SI_SPILL_V64_RESTORE:
2412 case AMDGPU::SI_SPILL_V96_RESTORE:
2413 case AMDGPU::SI_SPILL_V128_RESTORE:
2414 case AMDGPU::SI_SPILL_V160_RESTORE:
2415 case AMDGPU::SI_SPILL_V192_RESTORE:
2416 case AMDGPU::SI_SPILL_V224_RESTORE:
2417 case AMDGPU::SI_SPILL_V256_RESTORE:
2418 case AMDGPU::SI_SPILL_V288_RESTORE:
2419 case AMDGPU::SI_SPILL_V320_RESTORE:
2420 case AMDGPU::SI_SPILL_V352_RESTORE:
2421 case AMDGPU::SI_SPILL_V384_RESTORE:
2422 case AMDGPU::SI_SPILL_V512_RESTORE:
2423 case AMDGPU::SI_SPILL_V1024_RESTORE:
2424 case AMDGPU::SI_SPILL_A32_RESTORE:
2425 case AMDGPU::SI_SPILL_A64_RESTORE:
2426 case AMDGPU::SI_SPILL_A96_RESTORE:
2427 case AMDGPU::SI_SPILL_A128_RESTORE:
2428 case AMDGPU::SI_SPILL_A160_RESTORE:
2429 case AMDGPU::SI_SPILL_A192_RESTORE:
2430 case AMDGPU::SI_SPILL_A224_RESTORE:
2431 case AMDGPU::SI_SPILL_A256_RESTORE:
2432 case AMDGPU::SI_SPILL_A288_RESTORE:
2433 case AMDGPU::SI_SPILL_A320_RESTORE:
2434 case AMDGPU::SI_SPILL_A352_RESTORE:
2435 case AMDGPU::SI_SPILL_A384_RESTORE:
2436 case AMDGPU::SI_SPILL_A512_RESTORE:
2437 case AMDGPU::SI_SPILL_A1024_RESTORE:
2438 case AMDGPU::SI_SPILL_AV32_RESTORE:
2439 case AMDGPU::SI_SPILL_AV64_RESTORE:
2440 case AMDGPU::SI_SPILL_AV96_RESTORE:
2441 case AMDGPU::SI_SPILL_AV128_RESTORE:
2442 case AMDGPU::SI_SPILL_AV160_RESTORE:
2443 case AMDGPU::SI_SPILL_AV192_RESTORE:
2444 case AMDGPU::SI_SPILL_AV224_RESTORE:
2445 case AMDGPU::SI_SPILL_AV256_RESTORE:
2446 case AMDGPU::SI_SPILL_AV288_RESTORE:
2447 case AMDGPU::SI_SPILL_AV320_RESTORE:
2448 case AMDGPU::SI_SPILL_AV352_RESTORE:
2449 case AMDGPU::SI_SPILL_AV384_RESTORE:
2450 case AMDGPU::SI_SPILL_AV512_RESTORE:
2451 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2452 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2453 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2454 const MachineOperand *VData = TII->getNamedOperand(*MI,
2455 AMDGPU::OpName::vdata);
2456 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2457 MFI->getStackPtrOffsetReg());
2458
2459 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2460 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2461 auto *MBB = MI->getParent();
2462 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2463 if (IsWWMRegSpill) {
2464 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2465 RS->isRegUsed(AMDGPU::SCC));
2466 }
2467
2468 buildSpillLoadStore(
2469 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2470 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2471 *MI->memoperands_begin(), RS);
2472
2473 if (IsWWMRegSpill)
2474 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2475
2476 MI->eraseFromParent();
2477 return true;
2478 }
2479 case AMDGPU::V_ADD_U32_e32:
2480 case AMDGPU::V_ADD_U32_e64:
2481 case AMDGPU::V_ADD_CO_U32_e32:
2482 case AMDGPU::V_ADD_CO_U32_e64: {
2483 // TODO: Handle sub, and, or.
2484 unsigned NumDefs = MI->getNumExplicitDefs();
2485 unsigned Src0Idx = NumDefs;
2486
2487 bool HasClamp = false;
2488 MachineOperand *VCCOp = nullptr;
2489
2490 switch (MI->getOpcode()) {
2491 case AMDGPU::V_ADD_U32_e32:
2492 break;
2493 case AMDGPU::V_ADD_U32_e64:
2494 HasClamp = MI->getOperand(3).getImm();
2495 break;
2496 case AMDGPU::V_ADD_CO_U32_e32:
2497 VCCOp = &MI->getOperand(3);
2498 break;
2499 case AMDGPU::V_ADD_CO_U32_e64:
2500 VCCOp = &MI->getOperand(1);
2501 HasClamp = MI->getOperand(4).getImm();
2502 break;
2503 default:
2504 break;
2505 }
2506 bool DeadVCC = !VCCOp || VCCOp->isDead();
2507 MachineOperand &DstOp = MI->getOperand(0);
2508 Register DstReg = DstOp.getReg();
2509
2510 unsigned OtherOpIdx =
2511 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2512 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2513
2514 unsigned Src1Idx = Src0Idx + 1;
2515 Register MaterializedReg = FrameReg;
2516 Register ScavengedVGPR;
2517
2518 int64_t Offset = FrameInfo.getObjectOffset(Index);
2519 // For the non-immediate case, we could fall through to the default
2520 // handling, but we do an in-place update of the result register here to
2521 // avoid scavenging another register.
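// E.g. for "%dst = V_ADD_U32 16, %fi", the 16 and the frame object offset
// can be folded into a single immediate and the base added in place
// (a sketch; the exact folding depends on the operands handled below).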
2522 if (OtherOp->isImm()) {
2523 int64_t TotalOffset = OtherOp->getImm() + Offset;
2524
2525 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2526 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2527 // If we can't support a VOP3 literal in the VALU instruction, we
2528 // can't specially fold into the add.
2529 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2530 break;
2531 }
2532
2533 OtherOp->setImm(TotalOffset);
2534 Offset = 0;
2535 }
2536
2537 if (FrameReg && !ST.enableFlatScratch()) {
2538 // We should just do an in-place update of the result register. However,
2539 // the value there may also be used by the add, in which case we need a
2540 // temporary register.
2541 //
2542 // FIXME: The scavenger is not finding the result register in the
2543 // common case where the add does not read the register.
2544
2545 ScavengedVGPR = RS->scavengeRegisterBackwards(
2546 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2547
2548 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2549 // shift.
2550 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2551 .addDef(ScavengedVGPR, RegState::Renamable)
2552 .addImm(ST.getWavefrontSizeLog2())
2553 .addReg(FrameReg);
2554 MaterializedReg = ScavengedVGPR;
2555 }
2556
2557 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2558 if (ST.enableFlatScratch() &&
2559 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2560 // We didn't need the shift above, so we have an SGPR for the frame
2561 // register, but may have a VGPR only operand.
2562 //
2563 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2564 // and use the higher constant bus restriction to avoid this copy.
2565
2566 if (!ScavengedVGPR) {
2567 ScavengedVGPR = RS->scavengeRegisterBackwards(
2568 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2569 /*SPAdj=*/0);
2570 }
2571
2572 assert(ScavengedVGPR != DstReg);
2573
2574 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2575 .addReg(MaterializedReg,
2576 MaterializedReg != FrameReg ? RegState::Kill : 0);
2577 MaterializedReg = ScavengedVGPR;
2578 }
2579
2580 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2581 // is not live, we could use a scalar add + vector add instead of 2
2582 // vector adds.
2583 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2584 .addDef(DstReg, RegState::Renamable);
2585 if (NumDefs == 2)
2586 AddI32.add(MI->getOperand(1));
2587
2588 unsigned MaterializedRegFlags =
2589 MaterializedReg != FrameReg ? RegState::Kill : 0;
2590
2591 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2592 // If we know we have a VGPR already, it's more likely the other
2593 // operand is a legal vsrc0.
2594 AddI32
2595 .add(*OtherOp)
2596 .addReg(MaterializedReg, MaterializedRegFlags);
2597 } else {
2598 // Commute operands to avoid violating VOP2 restrictions. This will
2599 // typically happen when using scratch.
2600 AddI32
2601 .addReg(MaterializedReg, MaterializedRegFlags)
2602 .add(*OtherOp);
2603 }
2604
2605 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2606 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2607 AddI32.addImm(0); // clamp
2608
2609 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2610 AddI32.setOperandDead(3); // Dead vcc
2611
2612 MaterializedReg = DstReg;
2613
2614 OtherOp->ChangeToRegister(MaterializedReg, false);
2615 OtherOp->setIsKill(true);
2616 FIOp->ChangeToImmediate(Offset);
2617 Offset = 0;
2618 } else if (Offset != 0) {
2619 assert(!MaterializedReg);
2620 FIOp->ChangeToImmediate(Offset);
2621 Offset = 0;
2622 } else {
2623 if (DeadVCC && !HasClamp) {
2624 assert(Offset == 0);
2625
2626 // TODO: Losing kills and implicit operands. Just mutate to copy and
2627 // let lowerCopy deal with it?
2628 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2629 // Folded to an identity copy.
2630 MI->eraseFromParent();
2631 return true;
2632 }
2633
2634 // The immediate value should be in OtherOp
2635 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2636 MI->removeOperand(FIOperandNum);
2637
2638 unsigned NumOps = MI->getNumOperands();
2639 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2640 MI->removeOperand(I);
2641
2642 if (NumDefs == 2)
2643 MI->removeOperand(1);
2644
2645 // The code below can't deal with a mov.
2646 return true;
2647 }
2648
2649 // This folded to a constant, but we have to keep the add around for
2650 // pointless implicit defs or clamp modifier.
2651 FIOp->ChangeToImmediate(0);
2652 }
2653
2654 // Try to improve legality by commuting.
2655 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2656 std::swap(FIOp, OtherOp);
2657 std::swap(FIOperandNum, OtherOpIdx);
2658 }
2659
2660 // We need at most one mov to satisfy the operand constraints. Prefer to
2661 // move the FI operand first, as it may be a literal in a VOP3
2662 // instruction.
2663 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2664 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2665 // If commuting didn't make the operands legal, we need to materialize
2666 // in a register.
2667 // TODO: Can use SGPR on gfx10+ in some cases.
2668 if (!ScavengedVGPR) {
2669 ScavengedVGPR = RS->scavengeRegisterBackwards(
2670 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2671 /*SPAdj=*/0);
2672 }
2673
2674 assert(ScavengedVGPR != DstReg);
2675
2676 MachineOperand &Src = MI->getOperand(SrcIdx);
2677 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2678 .add(Src);
2679
2680 Src.ChangeToRegister(ScavengedVGPR, false);
2681 Src.setIsKill(true);
2682 break;
2683 }
2684 }
2685
2686 // Fold out add of 0 case that can appear in kernels.
2687 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2688 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2689 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2690 }
2691
2692 MI->eraseFromParent();
2693 }
2694
2695 return true;
2696 }
2697 case AMDGPU::S_ADD_I32: {
2698 // TODO: Handle s_or_b32, s_and_b32.
2699 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2700 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2701
2702 assert(FrameReg || MFI->isBottomOfStack());
2703
2704 MachineOperand &DstOp = MI->getOperand(0);
2705 const DebugLoc &DL = MI->getDebugLoc();
2706 Register MaterializedReg = FrameReg;
2707
2708 // Defend against live scc, which should never happen in practice.
2709 bool DeadSCC = MI->getOperand(3).isDead();
2710
2711 Register TmpReg;
2712
2713 // FIXME: Scavenger should figure out that the result register is
2714 // available. Also should do this for the v_add case.
2715 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2716 TmpReg = DstOp.getReg();
2717
2718 if (FrameReg && !ST.enableFlatScratch()) {
2719 // FIXME: In the common case where the add does not also read its result
2720 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2721 // available.
2722 if (!TmpReg)
2723 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2724 MI, false, 0);
2725 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2726 .addDef(TmpReg, RegState::Renamable)
2727 .addReg(FrameReg)
2728 .addImm(ST.getWavefrontSizeLog2())
2729 .setOperandDead(3); // Set SCC dead
2730 MaterializedReg = TmpReg;
2731 }
2732
2733 int64_t Offset = FrameInfo.getObjectOffset(Index);
2734
2735 // For the non-immediate case, we could fall through to the default
2736 // handling, but we do an in-place update of the result register here to
2737 // avoid scavenging another register.
2738 if (OtherOp.isImm()) {
2739 OtherOp.setImm(OtherOp.getImm() + Offset);
2740 Offset = 0;
2741
2742 if (MaterializedReg)
2743 FIOp->ChangeToRegister(MaterializedReg, false);
2744 else
2745 FIOp->ChangeToImmediate(0);
2746 } else if (MaterializedReg) {
2747 // If we can't fold the other operand, do another increment.
2748 Register DstReg = DstOp.getReg();
2749
2750 if (!TmpReg && MaterializedReg == FrameReg) {
2751 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2752 MI, /*RestoreAfter=*/false, 0,
2753 /*AllowSpill=*/false);
2754 DstReg = TmpReg;
2755 }
2756
2757 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_ADD_I32))
2758 .addDef(DstReg, RegState::Renamable)
2759 .addReg(MaterializedReg, RegState::Kill)
2760 .add(OtherOp);
2761 if (DeadSCC)
2762 AddI32.setOperandDead(3);
2763
2764 MaterializedReg = DstReg;
2765
2766 OtherOp.ChangeToRegister(MaterializedReg, false);
2767 OtherOp.setIsKill(true);
2768 OtherOp.setIsRenamable(true);
2769 FIOp->ChangeToImmediate(Offset);
2770 } else {
2771 // If we don't have any other offset to apply, we can just directly
2772 // interpret the frame index as the offset.
2773 FIOp->ChangeToImmediate(Offset);
2774 }
2775
2776 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2777 assert(Offset == 0);
2778 MI->removeOperand(3);
2779 MI->removeOperand(OtherOpIdx);
2780 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2781 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2782 assert(Offset == 0);
2783 MI->removeOperand(3);
2784 MI->removeOperand(FIOperandNum);
2785 MI->setDesc(
2786 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2787 }
2788
2789 assert(!FIOp->isFI());
2790 return true;
2791 }
2792 default: {
2793 break;
2794 }
2795 }
2796
2797 int64_t Offset = FrameInfo.getObjectOffset(Index);
2798 if (ST.enableFlatScratch()) {
2799 if (TII->isFLATScratch(*MI)) {
2800 assert(
2801 (int16_t)FIOperandNum ==
2802 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2803
2804 // The offset is always swizzled, just replace it
2805 if (FrameReg)
2806 FIOp->ChangeToRegister(FrameReg, false);
2807
2808 MachineOperand *OffsetOp =
2809 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2810 int64_t NewOffset = Offset + OffsetOp->getImm();
2811 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2812 SIInstrFlags::FlatScratch)) {
2813 OffsetOp->setImm(NewOffset);
2814 if (FrameReg)
2815 return false;
2816 Offset = 0;
2817 }
2818
2819 if (!Offset) {
2820 unsigned Opc = MI->getOpcode();
2821 int NewOpc = -1;
2822 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2823 NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
2824 } else if (ST.hasFlatScratchSTMode()) {
2825 // On GFX10 we have ST mode to use no registers for an address.
2826 // Otherwise we need to materialize 0 into an SGPR.
2827 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2828 }
2829
2830 if (NewOpc != -1) {
2831 // removeOperand doesn't fixup tied operand indexes as it goes, so
2832 // it asserts. Untie vdst_in for now and retie them afterwards.
2833 int VDstIn =
2834 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2835 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2836 MI->getOperand(VDstIn).isTied();
2837 if (TiedVDst)
2838 MI->untieRegOperand(VDstIn);
2839
2840 MI->removeOperand(
2841 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2842
2843 if (TiedVDst) {
2844 int NewVDst =
2845 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2846 int NewVDstIn =
2847 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2848 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2849 MI->tieOperands(NewVDst, NewVDstIn);
2850 }
2851 MI->setDesc(TII->get(NewOpc));
2852 return false;
2853 }
2854 }
2855 }
2856
2857 if (!FrameReg) {
2858 FIOp->ChangeToImmediate(Offset);
2859 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2860 return false;
2861 }
2862
2863 // We need to use register here. Check if we can use an SGPR or need
2864 // a VGPR.
2865 FIOp->ChangeToRegister(AMDGPU::M0, false);
2866 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2867
2868 if (!Offset && FrameReg && UseSGPR) {
2869 FIOp->setReg(FrameReg);
2870 return false;
2871 }
2872
2873 const TargetRegisterClass *RC =
2874 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2875
2876 Register TmpReg =
2877 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2878 FIOp->setReg(TmpReg);
2879 FIOp->setIsKill();
2880
2881 if ((!FrameReg || !Offset) && TmpReg) {
2882 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2883 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2884 if (FrameReg)
2885 MIB.addReg(FrameReg);
2886 else
2887 MIB.addImm(Offset);
2888
2889 return false;
2890 }
2891
2892 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2893 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2894
2895 Register TmpSReg =
2896 UseSGPR ? TmpReg
2897 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2898 MI, false, 0, !UseSGPR);
2899
2900 // TODO: for flat scratch another attempt can be made with a VGPR index
2901 // if no SGPRs can be scavenged.
2902 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2903 report_fatal_error("Cannot scavenge register in FI elimination!");
2904
2905 if (!TmpSReg) {
2906 // Use frame register and restore it after.
2907 TmpSReg = FrameReg;
2908 FIOp->setReg(FrameReg);
2909 FIOp->setIsKill(false);
2910 }
2911
2912 if (NeedSaveSCC) {
2913 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
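// SCC is live here, so avoid clobbering it: s_addc_u32 folds the incoming
// SCC into bit 0 of the (even-aligned) sum, s_bitcmp1_b32 then
// re-materializes SCC from that bit, and s_bitset0_b32 clears it again,
// leaving TmpSReg = FrameReg + Offset.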
2914 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2915 .addReg(FrameReg)
2916 .addImm(Offset);
2917 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2918 .addReg(TmpSReg)
2919 .addImm(0);
2920 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2921 .addImm(0)
2922 .addReg(TmpSReg);
2923 } else {
2924 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2925 .addReg(FrameReg)
2926 .addImm(Offset);
2927 }
2928
2929 if (!UseSGPR)
2930 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2931 .addReg(TmpSReg, RegState::Kill);
2932
2933 if (TmpSReg == FrameReg) {
2934 // Undo frame register modification.
2935 if (NeedSaveSCC &&
2936 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2937 MachineBasicBlock::iterator I =
2938 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2939 TmpSReg)
2940 .addReg(FrameReg)
2941 .addImm(-Offset);
2942 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2943 .addReg(TmpSReg)
2944 .addImm(0);
2945 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2946 TmpSReg)
2947 .addImm(0)
2948 .addReg(TmpSReg);
2949 } else {
2950 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2951 FrameReg)
2952 .addReg(FrameReg)
2953 .addImm(-Offset);
2954 }
2955 }
2956
2957 return false;
2958 }
2959
2960 bool IsMUBUF = TII->isMUBUF(*MI);
2961
2962 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2963 // Convert to a swizzled stack address by scaling by the wave size.
2964 // In an entry function/kernel the offset is already swizzled.
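// i.e. materialize (FrameReg >> log2(wavefront size)) plus the object
// offset into a register (SALU if SCC can be clobbered, otherwise VALU).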
2965 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2966 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2967 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2968 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2969 ? &AMDGPU::SReg_32RegClass
2970 : &AMDGPU::VGPR_32RegClass;
2971 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2972 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
2973 MI->getOpcode() == AMDGPU::S_MOV_B32;
2974 Register ResultReg =
2975 IsCopy ? MI->getOperand(0).getReg()
2976 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
2977
2978 int64_t Offset = FrameInfo.getObjectOffset(Index);
2979 if (Offset == 0) {
2980 unsigned OpCode =
2981 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
2982 Register TmpResultReg = ResultReg;
2983 if (IsSALU && LiveSCC) {
2984 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2985 MI, false, 0);
2986 }
2987
2988 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
2989 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2990 // For V_LSHRREV, the operands are reversed (the shift count goes
2991 // first).
2992 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2993 else
2994 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2995 if (IsSALU && !LiveSCC)
2996 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2997 if (IsSALU && LiveSCC) {
2998 Register NewDest =
2999 IsCopy ? ResultReg
3000 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32RegClass,
3001 Shift, false, 0);
3002 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3003 .addReg(TmpResultReg);
3004 ResultReg = NewDest;
3005 }
3006 } else {
3007 MachineInstrBuilder MIB;
3008 if (!IsSALU) {
3009 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3010 nullptr) {
3011 // Reuse ResultReg in intermediate step.
3012 Register ScaledReg = ResultReg;
3013
3014 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3015 ScaledReg)
3016 .addImm(ST.getWavefrontSizeLog2())
3017 .addReg(FrameReg);
3018
3019 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3020
3021 // TODO: Fold if use instruction is another add of a constant.
3022 if (IsVOP2 ||
3023 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3024 // FIXME: This can fail
3025 MIB.addImm(Offset);
3026 MIB.addReg(ScaledReg, RegState::Kill);
3027 if (!IsVOP2)
3028 MIB.addImm(0); // clamp bit
3029 } else {
3030 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3031 "Need to reuse carry out register");
3032
3033 // Use scavenged unused carry out as offset register.
3034 Register ConstOffsetReg;
3035 if (!isWave32)
3036 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3037 else
3038 ConstOffsetReg = MIB.getReg(1);
3039
3040 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3041 ConstOffsetReg)
3042 .addImm(Offset);
3043 MIB.addReg(ConstOffsetReg, RegState::Kill);
3044 MIB.addReg(ScaledReg, RegState::Kill);
3045 MIB.addImm(0); // clamp bit
3046 }
3047 }
3048 }
3049 if (!MIB || IsSALU) {
3050 // We have to produce a carry out, and there isn't a free SGPR pair
3051 // for it. We can keep the whole computation on the SALU to avoid
3052 // clobbering an additional register at the cost of an extra mov.
3053
3054 // We may have 1 free scratch SGPR even though a carry out is
3055 // unavailable. Only one additional mov is needed.
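// Sketch of the SALU path (SCC dead case; registers depend on what was
// scavenged):
//   s_lshr_b32 sTmp, sFP, log2(wavesize)
//   s_add_i32  sTmp, sTmp, <offset>
//   ; undone with the inverse add/shift below if sTmp had to alias the FP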
3056 Register TmpScaledReg = IsCopy && IsSALU
3057 ? ResultReg
3058 : RS->scavengeRegisterBackwards(
3059 AMDGPU::SReg_32_XM0RegClass, MI,
3060 false, 0, /*AllowSpill=*/false);
3061 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3062 Register TmpResultReg = ScaledReg;
3063
3064 if (!LiveSCC) {
3065 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3066 .addReg(FrameReg)
3067 .addImm(ST.getWavefrontSizeLog2());
3068 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3069 .addReg(TmpResultReg, RegState::Kill)
3070 .addImm(Offset);
3071 } else {
3072 TmpResultReg = RS->scavengeRegisterBackwards(
3073 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3074
3075 MachineInstrBuilder Add;
3076 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3077 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3078 TmpResultReg)
3079 .addImm(ST.getWavefrontSizeLog2())
3080 .addReg(FrameReg);
3081 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3082 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3083 .addImm(Offset);
3084 Add.addReg(ResultReg, RegState::Kill)
3085 .addReg(TmpResultReg, RegState::Kill)
3086 .addImm(0);
3087 } else
3088 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3089 } else {
3090 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3091 "offset is unsafe for v_mad_u32_u24");
3092
3093 // We start with a frame pointer with a wave space value, and
3094 // an offset in lane-space. We are materializing a lane space
3095 // value. We can either do a right shift of the frame pointer
3096 // to get to lane space, or a left shift of the offset to get
3097 // to wavespace. We can right shift after the computation to
3098 // get back to the desired per-lane value. We are using the
3099 // mad_u32_u24 primarily as an add with no carry out clobber.
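// Roughly (a sketch; the offset is folded directly into the mad when it is
// an inlinable literal):
//   v_mov_b32     vTmp, <offset>
//   v_mad_u32_u24 vTmp, vTmp, wavesize, sFP
//   v_lshrrev_b32 vTmp, log2(wavesize), vTmp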
3100 bool IsInlinableLiteral =
3101 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3102 if (!IsInlinableLiteral) {
3103 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3104 TmpResultReg)
3105 .addImm(Offset);
3106 }
3107
3108 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3109 TmpResultReg);
3110
3111 if (!IsInlinableLiteral) {
3112 Add.addReg(TmpResultReg, RegState::Kill);
3113 } else {
3114 // We fold the offset into mad itself if it's inlinable.
3115 Add.addImm(Offset);
3116 }
3117 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3118 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3119 TmpResultReg)
3120 .addImm(ST.getWavefrontSizeLog2())
3121 .addReg(TmpResultReg);
3122 }
3123
3124 Register NewDest = IsCopy ? ResultReg
3125 : RS->scavengeRegisterBackwards(
3126 AMDGPU::SReg_32RegClass, *Add,
3127 false, 0, /*AllowSpill=*/true);
3128 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3129 NewDest)
3130 .addReg(TmpResultReg);
3131 ResultReg = NewDest;
3132 }
3133 if (!IsSALU)
3134 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3135 .addReg(TmpResultReg, RegState::Kill);
3136 else
3137 ResultReg = TmpResultReg;
3138 // If there were truly no free SGPRs, we need to undo everything.
3139 if (!TmpScaledReg.isValid()) {
3140 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3141 .addReg(ScaledReg, RegState::Kill)
3142 .addImm(-Offset);
3143 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3144 .addReg(FrameReg)
3145 .addImm(ST.getWavefrontSizeLog2());
3146 }
3147 }
3148 }
3149
3150 // Don't introduce an extra copy if we're just materializing in a mov.
3151 if (IsCopy) {
3152 MI->eraseFromParent();
3153 return true;
3154 }
3155 FIOp->ChangeToRegister(ResultReg, false, false, true);
3156 return false;
3157 }
3158
3159 if (IsMUBUF) {
3160 // Disable offen so we don't need a 0 vgpr base.
3161 assert(
3162 static_cast<int>(FIOperandNum) ==
3163 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3164
3165 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3166 assert((SOffset.isImm() && SOffset.getImm() == 0));
3167
3168 if (FrameReg != AMDGPU::NoRegister)
3169 SOffset.ChangeToRegister(FrameReg, false);
3170
3171 int64_t Offset = FrameInfo.getObjectOffset(Index);
3172 int64_t OldImm =
3173 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3174 int64_t NewOffset = OldImm + Offset;
3175
3176 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3177 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3178 MI->eraseFromParent();
3179 return true;
3180 }
3181 }
3182
3183 // If the offset is simply too big, don't convert to a scratch wave offset
3184 // relative index.
3185
3186 FIOp->ChangeToImmediate(Offset);
3187 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3188 Register TmpReg =
3189 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3190 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3191 .addImm(Offset);
3192 FIOp->ChangeToRegister(TmpReg, false, false, true);
3193 }
3194
3195 return false;
3196}
3197
3200}
3201
3202 unsigned SIRegisterInfo::getRegSizeInBits(const TargetRegisterClass &RC) const {
3203 return getRegBitWidth(RC.getID());
3204}
3205
3206static const TargetRegisterClass *
3207 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3208 if (BitWidth == 64)
3209 return &AMDGPU::VReg_64RegClass;
3210 if (BitWidth == 96)
3211 return &AMDGPU::VReg_96RegClass;
3212 if (BitWidth == 128)
3213 return &AMDGPU::VReg_128RegClass;
3214 if (BitWidth == 160)
3215 return &AMDGPU::VReg_160RegClass;
3216 if (BitWidth == 192)
3217 return &AMDGPU::VReg_192RegClass;
3218 if (BitWidth == 224)
3219 return &AMDGPU::VReg_224RegClass;
3220 if (BitWidth == 256)
3221 return &AMDGPU::VReg_256RegClass;
3222 if (BitWidth == 288)
3223 return &AMDGPU::VReg_288RegClass;
3224 if (BitWidth == 320)
3225 return &AMDGPU::VReg_320RegClass;
3226 if (BitWidth == 352)
3227 return &AMDGPU::VReg_352RegClass;
3228 if (BitWidth == 384)
3229 return &AMDGPU::VReg_384RegClass;
3230 if (BitWidth == 512)
3231 return &AMDGPU::VReg_512RegClass;
3232 if (BitWidth == 1024)
3233 return &AMDGPU::VReg_1024RegClass;
3234
3235 return nullptr;
3236}
3237
3238static const TargetRegisterClass *
3239 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3240 if (BitWidth == 64)
3241 return &AMDGPU::VReg_64_Align2RegClass;
3242 if (BitWidth == 96)
3243 return &AMDGPU::VReg_96_Align2RegClass;
3244 if (BitWidth == 128)
3245 return &AMDGPU::VReg_128_Align2RegClass;
3246 if (BitWidth == 160)
3247 return &AMDGPU::VReg_160_Align2RegClass;
3248 if (BitWidth == 192)
3249 return &AMDGPU::VReg_192_Align2RegClass;
3250 if (BitWidth == 224)
3251 return &AMDGPU::VReg_224_Align2RegClass;
3252 if (BitWidth == 256)
3253 return &AMDGPU::VReg_256_Align2RegClass;
3254 if (BitWidth == 288)
3255 return &AMDGPU::VReg_288_Align2RegClass;
3256 if (BitWidth == 320)
3257 return &AMDGPU::VReg_320_Align2RegClass;
3258 if (BitWidth == 352)
3259 return &AMDGPU::VReg_352_Align2RegClass;
3260 if (BitWidth == 384)
3261 return &AMDGPU::VReg_384_Align2RegClass;
3262 if (BitWidth == 512)
3263 return &AMDGPU::VReg_512_Align2RegClass;
3264 if (BitWidth == 1024)
3265 return &AMDGPU::VReg_1024_Align2RegClass;
3266
3267 return nullptr;
3268}
3269
3270const TargetRegisterClass *
3271 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3272 if (BitWidth == 1)
3273 return &AMDGPU::VReg_1RegClass;
3274 if (BitWidth == 16)
3275 return &AMDGPU::VGPR_16RegClass;
3276 if (BitWidth == 32)
3277 return &AMDGPU::VGPR_32RegClass;
3280}
3281
3282static const TargetRegisterClass *
3283 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3284 if (BitWidth == 64)
3285 return &AMDGPU::AReg_64RegClass;
3286 if (BitWidth == 96)
3287 return &AMDGPU::AReg_96RegClass;
3288 if (BitWidth == 128)
3289 return &AMDGPU::AReg_128RegClass;
3290 if (BitWidth == 160)
3291 return &AMDGPU::AReg_160RegClass;
3292 if (BitWidth == 192)
3293 return &AMDGPU::AReg_192RegClass;
3294 if (BitWidth == 224)
3295 return &AMDGPU::AReg_224RegClass;
3296 if (BitWidth == 256)
3297 return &AMDGPU::AReg_256RegClass;
3298 if (BitWidth == 288)
3299 return &AMDGPU::AReg_288RegClass;
3300 if (BitWidth == 320)
3301 return &AMDGPU::AReg_320RegClass;
3302 if (BitWidth == 352)
3303 return &AMDGPU::AReg_352RegClass;
3304 if (BitWidth == 384)
3305 return &AMDGPU::AReg_384RegClass;
3306 if (BitWidth == 512)
3307 return &AMDGPU::AReg_512RegClass;
3308 if (BitWidth == 1024)
3309 return &AMDGPU::AReg_1024RegClass;
3310
3311 return nullptr;
3312}
3313
3314static const TargetRegisterClass *
3315 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3316 if (BitWidth == 64)
3317 return &AMDGPU::AReg_64_Align2RegClass;
3318 if (BitWidth == 96)
3319 return &AMDGPU::AReg_96_Align2RegClass;
3320 if (BitWidth == 128)
3321 return &AMDGPU::AReg_128_Align2RegClass;
3322 if (BitWidth == 160)
3323 return &AMDGPU::AReg_160_Align2RegClass;
3324 if (BitWidth == 192)
3325 return &AMDGPU::AReg_192_Align2RegClass;
3326 if (BitWidth == 224)
3327 return &AMDGPU::AReg_224_Align2RegClass;
3328 if (BitWidth == 256)
3329 return &AMDGPU::AReg_256_Align2RegClass;
3330 if (BitWidth == 288)
3331 return &AMDGPU::AReg_288_Align2RegClass;
3332 if (BitWidth == 320)
3333 return &AMDGPU::AReg_320_Align2RegClass;
3334 if (BitWidth == 352)
3335 return &AMDGPU::AReg_352_Align2RegClass;
3336 if (BitWidth == 384)
3337 return &AMDGPU::AReg_384_Align2RegClass;
3338 if (BitWidth == 512)
3339 return &AMDGPU::AReg_512_Align2RegClass;
3340 if (BitWidth == 1024)
3341 return &AMDGPU::AReg_1024_Align2RegClass;
3342
3343 return nullptr;
3344}
3345
3346const TargetRegisterClass *
3347 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3348 if (BitWidth == 16)
3349 return &AMDGPU::AGPR_LO16RegClass;
3350 if (BitWidth == 32)
3351 return &AMDGPU::AGPR_32RegClass;
3354}
3355
3356static const TargetRegisterClass *
3357 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3358 if (BitWidth == 64)
3359 return &AMDGPU::AV_64RegClass;
3360 if (BitWidth == 96)
3361 return &AMDGPU::AV_96RegClass;
3362 if (BitWidth == 128)
3363 return &AMDGPU::AV_128RegClass;
3364 if (BitWidth == 160)
3365 return &AMDGPU::AV_160RegClass;
3366 if (BitWidth == 192)
3367 return &AMDGPU::AV_192RegClass;
3368 if (BitWidth == 224)
3369 return &AMDGPU::AV_224RegClass;
3370 if (BitWidth == 256)
3371 return &AMDGPU::AV_256RegClass;
3372 if (BitWidth == 288)
3373 return &AMDGPU::AV_288RegClass;
3374 if (BitWidth == 320)
3375 return &AMDGPU::AV_320RegClass;
3376 if (BitWidth == 352)
3377 return &AMDGPU::AV_352RegClass;
3378 if (BitWidth == 384)
3379 return &AMDGPU::AV_384RegClass;
3380 if (BitWidth == 512)
3381 return &AMDGPU::AV_512RegClass;
3382 if (BitWidth == 1024)
3383 return &AMDGPU::AV_1024RegClass;
3384
3385 return nullptr;
3386}
3387
3388static const TargetRegisterClass *
3389 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3390 if (BitWidth == 64)
3391 return &AMDGPU::AV_64_Align2RegClass;
3392 if (BitWidth == 96)
3393 return &AMDGPU::AV_96_Align2RegClass;
3394 if (BitWidth == 128)
3395 return &AMDGPU::AV_128_Align2RegClass;
3396 if (BitWidth == 160)
3397 return &AMDGPU::AV_160_Align2RegClass;
3398 if (BitWidth == 192)
3399 return &AMDGPU::AV_192_Align2RegClass;
3400 if (BitWidth == 224)
3401 return &AMDGPU::AV_224_Align2RegClass;
3402 if (BitWidth == 256)
3403 return &AMDGPU::AV_256_Align2RegClass;
3404 if (BitWidth == 288)
3405 return &AMDGPU::AV_288_Align2RegClass;
3406 if (BitWidth == 320)
3407 return &AMDGPU::AV_320_Align2RegClass;
3408 if (BitWidth == 352)
3409 return &AMDGPU::AV_352_Align2RegClass;
3410 if (BitWidth == 384)
3411 return &AMDGPU::AV_384_Align2RegClass;
3412 if (BitWidth == 512)
3413 return &AMDGPU::AV_512_Align2RegClass;
3414 if (BitWidth == 1024)
3415 return &AMDGPU::AV_1024_Align2RegClass;
3416
3417 return nullptr;
3418}
3419
3420const TargetRegisterClass *
3421 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3422 if (BitWidth == 32)
3423 return &AMDGPU::AV_32RegClass;
3424 return ST.needsAlignedVGPRs()
3427}
3428
3429const TargetRegisterClass *
3430 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3431 if (BitWidth == 16)
3432 return &AMDGPU::SGPR_LO16RegClass;
3433 if (BitWidth == 32)
3434 return &AMDGPU::SReg_32RegClass;
3435 if (BitWidth == 64)
3436 return &AMDGPU::SReg_64RegClass;
3437 if (BitWidth == 96)
3438 return &AMDGPU::SGPR_96RegClass;
3439 if (BitWidth == 128)
3440 return &AMDGPU::SGPR_128RegClass;
3441 if (BitWidth == 160)
3442 return &AMDGPU::SGPR_160RegClass;
3443 if (BitWidth == 192)
3444 return &AMDGPU::SGPR_192RegClass;
3445 if (BitWidth == 224)
3446 return &AMDGPU::SGPR_224RegClass;
3447 if (BitWidth == 256)
3448 return &AMDGPU::SGPR_256RegClass;
3449 if (BitWidth == 288)
3450 return &AMDGPU::SGPR_288RegClass;
3451 if (BitWidth == 320)
3452 return &AMDGPU::SGPR_320RegClass;
3453 if (BitWidth == 352)
3454 return &AMDGPU::SGPR_352RegClass;
3455 if (BitWidth == 384)
3456 return &AMDGPU::SGPR_384RegClass;
3457 if (BitWidth == 512)
3458 return &AMDGPU::SGPR_512RegClass;
3459 if (BitWidth == 1024)
3460 return &AMDGPU::SGPR_1024RegClass;
3461
3462 return nullptr;
3463}
3464
3465 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3466 Register Reg) const {
3467 const TargetRegisterClass *RC;
3468 if (Reg.isVirtual())
3469 RC = MRI.getRegClass(Reg);
3470 else
3471 RC = getPhysRegBaseClass(Reg);
3472 return RC ? isSGPRClass(RC) : false;
3473}
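// A minimal usage sketch, assuming TRI is a const SIRegisterInfo * and MRI the
// function's MachineRegisterInfo: the query accepts both physical and virtual
// registers; virtual registers are classified by their assigned register class.
//   bool A = TRI->isSGPRReg(MRI, AMDGPU::SGPR4);  // expected: true
//   bool B = TRI->isSGPRReg(MRI, AMDGPU::VGPR0);  // expected: false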
3474
3475 const TargetRegisterClass *
3476 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3477 unsigned Size = getRegSizeInBits(*SRC);
3478 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
3479 assert(VRC && "Invalid register class size");
3480 return VRC;
3481}
3482
3483 const TargetRegisterClass *
3484 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3485 unsigned Size = getRegSizeInBits(*SRC);
3486 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3487 assert(ARC && "Invalid register class size");
3488 return ARC;
3489}
3490
3491 const TargetRegisterClass *
3492 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3493 unsigned Size = getRegSizeInBits(*VRC);
3494 if (Size == 32)
3495 return &AMDGPU::SGPR_32RegClass;
3496 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3497 assert(SRC && "Invalid register class size");
3498 return SRC;
3499}
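// Sketch of the equivalent-class mapping, assuming TRI is a
// const SIRegisterInfo *: each helper keeps the bit width and swaps the
// register file.
//   TRI->getEquivalentSGPRClass(&AMDGPU::VReg_64RegClass);
//   // expected: &AMDGPU::SReg_64RegClass
//   TRI->getEquivalentVGPRClass(&AMDGPU::SReg_64RegClass);
//   // expected: the subtarget's 64-bit VGPR class (VReg_64 or VReg_64_Align2)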
3500
3501 const TargetRegisterClass *
3502 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3503 const TargetRegisterClass *SubRC,
3504 unsigned SubIdx) const {
3505 // Ensure this subregister index is aligned in the super register.
3506 const TargetRegisterClass *MatchRC =
3507 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3508 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3509}
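// A minimal sketch, assuming TRI is a const SIRegisterInfo *: the helper yields
// a constrained class only when SubIdx is legal inside SuperRC, else nullptr.
//   const TargetRegisterClass *RC = TRI->getCompatibleSubRegClass(
//       &AMDGPU::VReg_128RegClass, &AMDGPU::VGPR_32RegClass, AMDGPU::sub1);
//   // If non-null, RC is a subclass of VReg_128 whose sub1 is a VGPR_32.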
3510
3511 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3512 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3513 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3514 return !ST.hasMFMAInlineLiteralBug();
3515
3516 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3517 OpType <= AMDGPU::OPERAND_SRC_LAST;
3518}
3519
3520 bool SIRegisterInfo::shouldRewriteCopySrc(
3521 const TargetRegisterClass *DefRC,
3522 unsigned DefSubReg,
3523 const TargetRegisterClass *SrcRC,
3524 unsigned SrcSubReg) const {
3525 // We want to prefer the smallest register class possible, so we don't want to
3526 // stop and rewrite on anything that looks like a subregister
3527 // extract. Operations mostly don't care about the super register class, so we
3528 // only want to stop on the most basic of copies between the same register
3529 // class.
3530 //
3531 // e.g. if we have something like
3532 // %0 = ...
3533 // %1 = ...
3534 // %2 = REG_SEQUENCE %0, sub0, %1, sub1
3535 // %3 = COPY %2, sub0
3536 //
3537 // We want to look through the COPY to find:
3538 // => %3 = COPY %0
3539
3540 // Plain copy.
3541 return getCommonSubClass(DefRC, SrcRC) != nullptr;
3542}
3543
3544bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3545 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3546 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3547 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3548 }
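// Worked example for the two operand-type predicates, assuming TRI is a
// const SIRegisterInfo * and using operand types from SIDefines.h:
//   TRI->opCanUseLiteralConstant(AMDGPU::OPERAND_REG_IMM_INT32); // expected: true
//   TRI->opCanUseInlineConstant(AMDGPU::OPERAND_REG_IMM_INT32);  // expected: true
//   // For OPERAND_REG_INLINE_AC_* (MAI) operand types the inline-constant
//   // answer additionally depends on ST.hasMFMAInlineLiteralBug().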
3549
3550 /// Returns the lowest register that is not used at any point in the function.
3551 /// If all registers are used, then this function will return
3552 /// AMDGPU::NoRegister. If \p ReserveHighestRegister is true, the highest
3553 /// unused register is returned instead.
3554 MCRegister SIRegisterInfo::findUnusedRegister(
3555 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3556 const MachineFunction &MF, bool ReserveHighestRegister) const {
3557 if (ReserveHighestRegister) {
3558 for (MCRegister Reg : reverse(*RC))
3559 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3560 return Reg;
3561 } else {
3562 for (MCRegister Reg : *RC)
3563 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3564 return Reg;
3565 }
3566 return MCRegister();
3567}
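// A usage sketch, assuming MRI, MF and TRI name the current function's
// MachineRegisterInfo, MachineFunction and a const SIRegisterInfo *:
//   MCRegister Unused =
//       TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass, MF);
//   if (Unused.isValid()) {
//     // Unused is allocatable and never used anywhere in MF.
//   }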
3568
3569 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3570 const RegisterBankInfo &RBI,
3571 Register Reg) const {
3572 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3573 if (!RB)
3574 return false;
3575
3576 return !RBI.isDivergentRegBank(RB);
3577}
3578
3579 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3580 unsigned EltSize) const {
3581 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3582 assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
3583
3584 const unsigned RegDWORDs = RegBitWidth / 32;
3585 const unsigned EltDWORDs = EltSize / 4;
3586 assert(RegSplitParts.size() + 1 >= EltDWORDs);
3587
3588 const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
3589 const unsigned NumParts = RegDWORDs / EltDWORDs;
3590
3591 return ArrayRef(Parts.data(), NumParts);
3592}
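// Worked example for getRegSplitParts, assuming TRI is a const SIRegisterInfo *:
// splitting a 256-bit class into 64-bit pieces (EltSize == 8 bytes) yields the
// four 64-bit subregister indices sub0_sub1 .. sub6_sub7.
//   ArrayRef<int16_t> Parts =
//       TRI->getRegSplitParts(&AMDGPU::VReg_256RegClass, /*EltSize=*/8);
//   // Parts.size() == (256 / 32) / 2 == 4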
3593
3594 const TargetRegisterClass *
3595 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3596 Register Reg) const {
3597 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3598}
3599
3600 const TargetRegisterClass *
3601 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3602 const MachineOperand &MO) const {
3603 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3604 return getSubRegisterClass(SrcRC, MO.getSubReg());
3605}
3606
3607 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3608 Register Reg) const {
3609 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3610 // Registers without classes are unaddressable, SGPR-like registers.
3611 return RC && isVGPRClass(RC);
3612}
3613
3614 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3615 Register Reg) const {
3616 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3617
3618 // Registers without classes are unaddressable, SGPR-like registers.
3619 return RC && isAGPRClass(RC);
3620}
3621
3622 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3623 const TargetRegisterClass *SrcRC,
3624 unsigned SubReg,
3625 const TargetRegisterClass *DstRC,
3626 unsigned DstSubReg,
3627 const TargetRegisterClass *NewRC,
3628 LiveIntervals &LIS) const {
3629 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3630 unsigned DstSize = getRegSizeInBits(*DstRC);
3631 unsigned NewSize = getRegSizeInBits(*NewRC);
3632
3633 // Do not increase the size of registers beyond a dword; we would need to
3634 // allocate adjacent registers and constrain regalloc more than needed.
3635
3636 // Always allow dword coalescing.
3637 if (SrcSize <= 32 || DstSize <= 32)
3638 return true;
3639
3640 return NewSize <= DstSize || NewSize <= SrcSize;
3641}
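// Size arithmetic behind shouldCoalesce, as a sketch with hypothetical sizes:
// coalescing is always allowed when either side is at most a dword, and
// otherwise only when the merged register does not outgrow both inputs.
//   SrcSize == 32, DstSize == 128, NewSize == 128  -> allowed (dword source)
//   SrcSize == 64, DstSize == 128, NewSize == 128  -> allowed (128 <= 128)
//   SrcSize == 96, DstSize == 128, NewSize == 224  -> rejected (grows both)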
3642
3643 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3644 MachineFunction &MF) const {
3645 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3646 
3647 unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
3648 MF.getFunction());
3649 switch (RC->getID()) {
3650 default:
3651 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3652 case AMDGPU::VGPR_32RegClassID:
3653 return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
3654 case AMDGPU::SGPR_32RegClassID:
3655 case AMDGPU::SGPR_LO16RegClassID:
3656 return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
3657 }
3658}
3659
3660 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3661 unsigned Idx) const {
3662 if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
3663 Idx == AMDGPU::RegisterPressureSets::AGPR_32)
3664 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3665 const_cast<MachineFunction &>(MF));
3666
3667 if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
3668 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3669 const_cast<MachineFunction &>(MF));
3670
3671 llvm_unreachable("Unexpected register pressure set!");
3672}
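// A minimal sketch, assuming MF is the current MachineFunction and TRI a
// const SIRegisterInfo *: the limits reflect the occupancy still reachable
// given the kernel's LDS usage.
//   unsigned VGPRLimit =
//       TRI->getRegPressureSetLimit(MF, AMDGPU::RegisterPressureSets::VGPR_32);
//   unsigned SGPRLimit =
//       TRI->getRegPressureSetLimit(MF, AMDGPU::RegisterPressureSets::SReg_32);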
3673
3674const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3675 static const int Empty[] = { -1 };
3676
3677 if (RegPressureIgnoredUnits[RegUnit])
3678 return Empty;
3679
3680 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3681}
3682
3683 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3684 // Not a callee saved register.
3685 return AMDGPU::SGPR30_SGPR31;
3686}
3687
3688 const TargetRegisterClass *
3689 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3690 const RegisterBank &RB) const {
3691 switch (RB.getID()) {
3692 case AMDGPU::VGPRRegBankID:
3693 return getVGPRClassForBitWidth(
3694 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3695 case AMDGPU::VCCRegBankID:
3696 assert(Size == 1);
3697 return getWaveMaskRegClass();
3698 case AMDGPU::SGPRRegBankID:
3699 return getSGPRClassForBitWidth(std::max(32u, Size));
3700 case AMDGPU::AGPRRegBankID:
3701 return getAGPRClassForBitWidth(std::max(32u, Size));
3702 default:
3703 llvm_unreachable("unknown register bank");
3704 }
3705}
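// A minimal sketch of the bank-to-class mapping, assuming TRI is a
// const SIRegisterInfo * and VGPRBank/SGPRBank/VCCBank are RegisterBank
// references obtained from the RegisterBankInfo (illustrative local names):
//   TRI->getRegClassForSizeOnBank(32, VGPRBank); // a 32-bit VGPR class
//   TRI->getRegClassForSizeOnBank(64, SGPRBank); // expected: &AMDGPU::SReg_64RegClass
//   TRI->getRegClassForSizeOnBank(1, VCCBank);   // the wave-mask class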
3706
3707 const TargetRegisterClass *
3708 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3709 const MachineRegisterInfo &MRI) const {
3710 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3711 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3712 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3713
3714 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3715 return getAllocatableClass(RC);
3716
3717 return nullptr;
3718}
3719
3720 MCRegister SIRegisterInfo::getVCC() const {
3721 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3722}
3723
3724 MCRegister SIRegisterInfo::getExec() const {
3725 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3726}
3727
3728 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3729 // VGPR tuples have an alignment requirement on gfx90a variants.
3730 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3731 : &AMDGPU::VReg_64RegClass;
3732}
3733
3734const TargetRegisterClass *
3735SIRegisterInfo::getRegClass(unsigned RCID) const {
3736 switch ((int)RCID) {
3737 case AMDGPU::SReg_1RegClassID:
3738 return getBoolRC();
3739 case AMDGPU::SReg_1_XEXECRegClassID:
3740 return getWaveMaskRegClass();
3741 case -1:
3742 return nullptr;
3743 default:
3744 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3745 }
3746}
3747
3748// Find reaching register definition
3749 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3750 MachineInstr &Use,
3751 MachineRegisterInfo &MRI,
3752 LiveIntervals *LIS) const {
3753 auto &MDT = LIS->getDomTree();
3754 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3755 SlotIndex DefIdx;
3756
3757 if (Reg.isVirtual()) {
3758 if (!LIS->hasInterval(Reg))
3759 return nullptr;
3760 LiveInterval &LI = LIS->getInterval(Reg);
3761 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3762 : MRI.getMaxLaneMaskForVReg(Reg);
3763 VNInfo *V = nullptr;
3764 if (LI.hasSubRanges()) {
3765 for (auto &S : LI.subranges()) {
3766 if ((S.LaneMask & SubLanes) == SubLanes) {
3767 V = S.getVNInfoAt(UseIdx);
3768 break;
3769 }
3770 }
3771 } else {
3772 V = LI.getVNInfoAt(UseIdx);
3773 }
3774 if (!V)
3775 return nullptr;
3776 DefIdx = V->def;
3777 } else {
3778 // Find last def.
3779 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3780 LiveRange &LR = LIS->getRegUnit(Unit);
3781 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3782 if (!DefIdx.isValid() ||
3783 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3784 LIS->getInstructionFromIndex(V->def)))
3785 DefIdx = V->def;
3786 } else {
3787 return nullptr;
3788 }
3789 }
3790 }
3791
3792 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3793
3794 if (!Def || !MDT.dominates(Def, &Use))
3795 return nullptr;
3796
3797 assert(Def->modifiesRegister(Reg, this));
3798
3799 return Def;
3800}
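// A usage sketch, assuming UseMI is a MachineInstr reading Reg, MRI the
// MachineRegisterInfo, LIS a valid LiveIntervals analysis and TRI a
// const SIRegisterInfo *:
//   if (MachineInstr *DefMI =
//           TRI->findReachingDef(Reg, /*SubReg=*/0, UseMI, MRI, LIS)) {
//     // DefMI defines (part of) Reg and dominates UseMI.
//   }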
3801
3802 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3803 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3804
3805 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3806 AMDGPU::SReg_32RegClass,
3807 AMDGPU::AGPR_32RegClass } ) {
3808 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3809 return Super;
3810 }
3811 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3812 &AMDGPU::VGPR_32RegClass)) {
3813 return Super;
3814 }
3815
3816 return AMDGPU::NoRegister;
3817}
3818
3819 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3820 if (!ST.needsAlignedVGPRs())
3821 return true;
3822
3823 if (isVGPRClass(&RC))
3824 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3825 if (isAGPRClass(&RC))
3826 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3827 if (isVectorSuperClass(&RC))
3828 return RC.hasSuperClassEq(
3829 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3830
3831 return true;
3832}
3833
3834 const TargetRegisterClass *
3835 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3836 if (!RC || !ST.needsAlignedVGPRs())
3837 return RC;
3838
3839 unsigned Size = getRegSizeInBits(*RC);
3840 if (Size <= 32)
3841 return RC;
3842
3843 if (isVGPRClass(RC))
3844 return getAlignedVGPRClassForBitWidth(Size);
3845 if (isAGPRClass(RC))
3846 return getAlignedAGPRClassForBitWidth(Size);
3847 if (isVectorSuperClass(RC))
3848 return getAlignedVectorSuperClassForBitWidth(Size);
3849 
3850 return RC;
3851}
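// Illustration, assuming TRI is a const SIRegisterInfo * on a subtarget with
// ST.needsAlignedVGPRs(): wide VGPR/AGPR/AV classes are tightened to their
// Align2 variants, while 32-bit and SGPR classes pass through unchanged.
//   TRI->getProperlyAlignedRC(&AMDGPU::VReg_96RegClass); // -> VReg_96_Align2
//   TRI->getProperlyAlignedRC(&AMDGPU::VGPR_32RegClass); // -> VGPR_32
//   TRI->getProperlyAlignedRC(&AMDGPU::SReg_64RegClass); // -> SReg_64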
3852
3853 ArrayRef<MCPhysReg>
3854 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
3855 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
3856}
3857
3858 ArrayRef<MCPhysReg>
3859 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
3860 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
3861}
3862
3863 ArrayRef<MCPhysReg>
3864 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
3865 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
3866}
3867
3868unsigned
3869 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
3870 unsigned SubReg) const {
3871 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
3872 case SIRCFlags::HasSGPR:
3873 return std::min(128u, getSubRegIdxSize(SubReg));
3874 case SIRCFlags::HasAGPR:
3875 case SIRCFlags::HasVGPR:
3876 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
3877 return std::min(32u, getSubRegIdxSize(SubReg));
3878 default:
3879 break;
3880 }
3881 return 0;
3882}
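// Worked example for getSubRegAlignmentNumBits, assuming TRI is a
// const SIRegisterInfo *: the result is the subregister size clamped to 128
// bits for SGPR classes and to 32 bits for VGPR/AGPR classes.
//   TRI->getSubRegAlignmentNumBits(&AMDGPU::SGPR_256RegClass,
//                                  AMDGPU::sub0_sub1);         // 64
//   TRI->getSubRegAlignmentNumBits(&AMDGPU::VReg_128RegClass,
//                                  AMDGPU::sub2);               // 32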
3883
3884 unsigned
3885 SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
3886 const TargetRegisterClass &RC) const {
3887 for (MCPhysReg Reg : reverse(RC.getRegisters()))
3888 if (MRI.isPhysRegUsed(Reg))
3889 return getHWRegIndex(Reg) + 1;
3890 return 0;
3891}
3892
3893 SmallVector<StringLiteral>
3894 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
3895 const MachineFunction &MF) const {
3896 SmallVector<StringLiteral> RegFlags;
3897 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3898 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
3899 RegFlags.push_back("WWM_REG");
3900 return RegFlags;
3901}
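// A minimal sketch, assuming VReg is a virtual register of the current
// MachineFunction MF and TRI a const SIRegisterInfo *: MIR serialization uses
// this hook to annotate registers that were flagged as WWM.
//   SmallVector<StringLiteral> Flags = TRI->getVRegFlagsOfReg(VReg, MF);
//   // Flags contains "WWM_REG" iff SIMachineFunctionInfo flagged VReg, and is
//   // empty otherwise.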