LLVM 20.0.0git
GCNNSAReassign.cpp
Go to the documentation of this file.
1//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12/// with sequential versions where possible.
13///
14//===----------------------------------------------------------------------===//
15
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
#include <algorithm>
#include <utility>
26
27using namespace llvm;
28
29#define DEBUG_TYPE "amdgpu-nsa-reassign"
30
31STATISTIC(NumNSAInstructions,
32 "Number of NSA instructions with non-sequential address found");
33STATISTIC(NumNSAConverted,
34 "Number of NSA instructions changed to sequential");
35
36namespace {
37
38class GCNNSAReassign : public MachineFunctionPass {
39public:
40 static char ID;
41
42 GCNNSAReassign() : MachineFunctionPass(ID) {
44 }
45
46 bool runOnMachineFunction(MachineFunction &MF) override;
47
48 StringRef getPassName() const override { return "GCN NSA Reassign"; }
49
50 void getAnalysisUsage(AnalysisUsage &AU) const override {
54 AU.setPreservesAll();
56 }
57
58private:
59 using NSA_Status = enum {
60 NOT_NSA, // Not an NSA instruction
61 FIXED, // NSA which we cannot modify
62 NON_CONTIGUOUS, // NSA with non-sequential address which we can try
63 // to optimize.
64 CONTIGUOUS // NSA with all sequential address registers
65 };
66
67 const GCNSubtarget *ST;
68
70
71 const SIRegisterInfo *TRI;
72
73 VirtRegMap *VRM;
74
75 LiveRegMatrix *LRM;
76
77 LiveIntervals *LIS;
78
79 unsigned MaxNumVGPRs;
80
81 const MCPhysReg *CSRegs;
82
83 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
84
85 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
86 unsigned StartReg) const;
87
88 bool canAssign(unsigned StartReg, unsigned NumRegs) const;
89
90 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
91};
92
93} // End anonymous namespace.
94
95INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
96 false, false)
100INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
102
103
104char GCNNSAReassign::ID = 0;
105
106char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
107
108bool
109GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
110 unsigned StartReg) const {
111 unsigned NumRegs = Intervals.size();
112
113 for (unsigned N = 0; N < NumRegs; ++N)
114 if (VRM->hasPhys(Intervals[N]->reg()))
115 LRM->unassign(*Intervals[N]);
116
117 for (unsigned N = 0; N < NumRegs; ++N)
118 if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
119 return false;
120
121 for (unsigned N = 0; N < NumRegs; ++N)
122 LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
123
124 return true;
125}
126
127bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
128 for (unsigned N = 0; N < NumRegs; ++N) {
129 unsigned Reg = StartReg + N;
130 if (!MRI->isAllocatable(Reg))
131 return false;
132
133 for (unsigned I = 0; CSRegs[I]; ++I)
134 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
135 !LRM->isPhysRegUsed(CSRegs[I]))
136 return false;
137 }
138
139 return true;
140}
141
142bool
143GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
144 unsigned NumRegs = Intervals.size();
145
146 if (NumRegs > MaxNumVGPRs)
147 return false;
148 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
149
150 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
151 if (!canAssign(Reg, NumRegs))
152 continue;
153
154 if (tryAssignRegisters(Intervals, Reg))
155 return true;
156 }
157
158 return false;
159}
160
161GCNNSAReassign::NSA_Status
162GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
163 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
164 if (!Info)
165 return NSA_Status::NOT_NSA;
166
167 switch (Info->MIMGEncoding) {
168 case AMDGPU::MIMGEncGfx10NSA:
169 case AMDGPU::MIMGEncGfx11NSA:
170 break;
171 default:
172 return NSA_Status::NOT_NSA;
173 }
174
175 int VAddr0Idx =
176 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
177
178 unsigned VgprBase = 0;
179 bool NSA = false;
180 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
181 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
182 Register Reg = Op.getReg();
183 if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
184 return NSA_Status::FIXED;
185
186 Register PhysReg = VRM->getPhys(Reg);
187
188 if (!Fast) {
189 if (!PhysReg)
190 return NSA_Status::FIXED;
191
192 // TODO: address the below limitation to handle GFX11 BVH instructions
193 // Bail if address is not a VGPR32. That should be possible to extend the
194 // optimization to work with subregs of a wider register tuples, but the
195 // logic to find free registers will be much more complicated with much
196 // less chances for success. That seems reasonable to assume that in most
197 // cases a tuple is used because a vector variable contains different
198 // parts of an address and it is either already consecutive or cannot
199 // be reassigned if not. If needed it is better to rely on register
200 // coalescer to process such address tuples.
201 if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
202 return NSA_Status::FIXED;
203
204 // InlineSpiller does not call LRM::assign() after an LI split leaving
205 // it in an inconsistent state, so we cannot call LRM::unassign().
206 // See llvm bug #48911.
207 // Skip reassign if a register has originated from such split.
208 // FIXME: Remove the workaround when bug #48911 is fixed.
209 if (VRM->getPreSplitReg(Reg))
210 return NSA_Status::FIXED;
211
212 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
213
214 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
215 return NSA_Status::FIXED;
216
217 for (auto U : MRI->use_nodbg_operands(Reg)) {
218 if (U.isImplicit())
219 return NSA_Status::FIXED;
220 const MachineInstr *UseInst = U.getParent();
221 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
222 return NSA_Status::FIXED;
223 }
224
225 if (!LIS->hasInterval(Reg))
226 return NSA_Status::FIXED;
227 }
228
229 if (I == 0)
230 VgprBase = PhysReg;
231 else if (VgprBase + I != PhysReg)
232 NSA = true;
233 }
234
235 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
236}
237
238bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
240 if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
241 return false;
242
243 MRI = &MF.getRegInfo();
244 TRI = ST->getRegisterInfo();
245 VRM = &getAnalysis<VirtRegMapWrapperLegacy>().getVRM();
246 LRM = &getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
247 LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
248
250 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
251 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
252 CSRegs = MRI->getCalleeSavedRegs();
253
254 using Candidate = std::pair<const MachineInstr*, bool>;
256 for (const MachineBasicBlock &MBB : MF) {
257 for (const MachineInstr &MI : MBB) {
258 switch (CheckNSA(MI)) {
259 default:
260 continue;
261 case NSA_Status::CONTIGUOUS:
262 Candidates.push_back(std::pair(&MI, true));
263 break;
264 case NSA_Status::NON_CONTIGUOUS:
265 Candidates.push_back(std::pair(&MI, false));
266 ++NumNSAInstructions;
267 break;
268 }
269 }
270 }
271
272 bool Changed = false;
273 for (auto &C : Candidates) {
274 if (C.second)
275 continue;
276
277 const MachineInstr *MI = C.first;
278 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
279 // Already happen to be fixed.
280 C.second = true;
281 ++NumNSAConverted;
282 continue;
283 }
284
285 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
286 int VAddr0Idx =
287 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
288
291 SlotIndex MinInd, MaxInd;
292 for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
293 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
294 Register Reg = Op.getReg();
295 LiveInterval *LI = &LIS->getInterval(Reg);
296 if (llvm::is_contained(Intervals, LI)) {
297 // Same register used, unable to make sequential
298 Intervals.clear();
299 break;
300 }
301 Intervals.push_back(LI);
302 OrigRegs.push_back(VRM->getPhys(Reg));
303 if (LI->empty()) {
304 // The address input is undef, so it doesn't contribute to the relevant
305 // range. Seed a reasonable index range if required.
306 if (I == 0)
307 MinInd = MaxInd = LIS->getInstructionIndex(*MI);
308 continue;
309 }
310 MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
311 MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
312 }
313
314 if (Intervals.empty())
315 continue;
316
317 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
318 << "\tOriginal allocation:\t";
319 for (auto *LI
320 : Intervals) dbgs()
321 << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
322 dbgs() << '\n');
323
324 bool Success = scavengeRegs(Intervals);
325 if (!Success) {
326 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
327 if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
328 continue;
329 } else {
330 // Check we did not make it worse for other instructions.
331 auto *I =
332 std::lower_bound(Candidates.begin(), &C, MinInd,
333 [this](const Candidate &C, SlotIndex I) {
334 return LIS->getInstructionIndex(*C.first) < I;
335 });
336 for (auto *E = Candidates.end();
337 Success && I != E && LIS->getInstructionIndex(*I->first) < MaxInd;
338 ++I) {
339 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
340 Success = false;
341 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
342 }
343 }
344 }
345
346 if (!Success) {
347 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
348 if (VRM->hasPhys(Intervals[I]->reg()))
349 LRM->unassign(*Intervals[I]);
350
351 for (unsigned I = 0; I < Info->VAddrOperands; ++I)
352 LRM->assign(*Intervals[I], OrigRegs[I]);
353
354 continue;
355 }
356
357 C.second = true;
358 ++NumNSAConverted;
360 dbgs() << "\tNew allocation:\t\t ["
361 << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
362 << " : "
363 << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
364 << "]\n");
365 Changed = true;
366 }
367
368 return Changed;
369}
unsigned const MachineRegisterInfo * MRI
#define Success
MachineBasicBlock & MBB
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
#define LLVM_DEBUG(...)
Definition: Debug.h:106
GCN NSA Reassign
#define DEBUG_TYPE
AMD GCN specific subclass of TargetSubtarget.
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
This class represents an Operation in the Expression.
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:687
Register reg() const
Definition: LiveInterval.h:718
bool empty() const
Definition: LiveInterval.h:382
SlotIndex beginIndex() const
beginIndex - Return the lowest numbered slot covered.
Definition: LiveInterval.h:385
SlotIndex endIndex() const
endNumber - return the maximum point of the range of the whole, exclusive.
Definition: LiveInterval.h:392
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition: MCRegister.h:74
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool isCopy() const
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
MachineOperand class - Representation of each machine instruction operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
Definition: RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void initializeGCNNSAReassignPass(PassRegistry &)
char & GCNNSAReassignID
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
#define N