LLVM  10.0.0svn
GCNNSAReassign.cpp
Go to the documentation of this file.
1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include <algorithm>
28 
29 using namespace llvm;
30 
#define DEBUG_TYPE "amdgpu-nsa-reassign"

// Pass-wide counters reported with -stats.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
37 
38 namespace {
39 
40 class GCNNSAReassign : public MachineFunctionPass {
41 public:
42  static char ID;
43 
44  GCNNSAReassign() : MachineFunctionPass(ID) {
46  }
47 
48  bool runOnMachineFunction(MachineFunction &MF) override;
49 
50  StringRef getPassName() const override { return "GCN NSA Reassign"; }
51 
52  void getAnalysisUsage(AnalysisUsage &AU) const override {
54  AU.addRequired<VirtRegMap>();
56  AU.setPreservesAll();
58  }
59 
60 private:
61  typedef enum {
62  NOT_NSA, // Not an NSA instruction
63  FIXED, // NSA which we cannot modify
64  NON_CONTIGUOUS, // NSA with non-sequential address which we can try
65  // to optimize.
66  CONTIGUOUS // NSA with all sequential address registers
67  } NSA_Status;
68 
69  const GCNSubtarget *ST;
70 
71  const MachineRegisterInfo *MRI;
72 
73  const SIRegisterInfo *TRI;
74 
75  VirtRegMap *VRM;
76 
77  LiveRegMatrix *LRM;
78 
79  LiveIntervals *LIS;
80 
81  unsigned MaxNumVGPRs;
82 
83  const MCPhysReg *CSRegs;
84 
85  NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
86 
87  bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
88  unsigned StartReg) const;
89 
90  bool canAssign(unsigned StartReg, unsigned NumRegs) const;
91 
92  bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
93 };
94 
95 } // End anonymous namespace.
96 
97 INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
98  false, false)
102 INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
103  false, false)
104 
105 
106 char GCNNSAReassign::ID = 0;
107 
108 char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
109 
110 bool
111 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
112  unsigned StartReg) const {
113  unsigned NumRegs = Intervals.size();
114 
115  for (unsigned N = 0; N < NumRegs; ++N)
116  if (VRM->hasPhys(Intervals[N]->reg))
117  LRM->unassign(*Intervals[N]);
118 
119  for (unsigned N = 0; N < NumRegs; ++N)
120  if (LRM->checkInterference(*Intervals[N], StartReg + N))
121  return false;
122 
123  for (unsigned N = 0; N < NumRegs; ++N)
124  LRM->assign(*Intervals[N], StartReg + N);
125 
126  return true;
127 }
128 
129 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
130  for (unsigned N = 0; N < NumRegs; ++N) {
131  unsigned Reg = StartReg + N;
132  if (!MRI->isAllocatable(Reg))
133  return false;
134 
135  for (unsigned I = 0; CSRegs[I]; ++I)
136  if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
137  !LRM->isPhysRegUsed(CSRegs[I]))
138  return false;
139  }
140 
141  return true;
142 }
143 
144 bool
145 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
146  unsigned NumRegs = Intervals.size();
147 
148  if (NumRegs > MaxNumVGPRs)
149  return false;
150  unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
151 
152  for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
153  if (!canAssign(Reg, NumRegs))
154  continue;
155 
156  if (tryAssignRegisters(Intervals, Reg))
157  return true;
158  }
159 
160  return false;
161 }
162 
163 GCNNSAReassign::NSA_Status
164 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
166  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
167  return NSA_Status::NOT_NSA;
168 
169  int VAddr0Idx =
170  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
171 
172  unsigned VgprBase = 0;
173  bool NSA = false;
174  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
175  const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
176  Register Reg = Op.getReg();
177  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
178  return NSA_Status::FIXED;
179 
180  Register PhysReg = VRM->getPhys(Reg);
181 
182  if (!Fast) {
183  if (!PhysReg)
184  return NSA_Status::FIXED;
185 
186  // Bail if address is not a VGPR32. That should be possible to extend the
187  // optimization to work with subregs of a wider register tuples, but the
188  // logic to find free registers will be much more complicated with much
189  // less chances for success. That seems reasonable to assume that in most
190  // cases a tuple is used because a vector variable contains different
191  // parts of an address and it is either already consequitive or cannot
192  // be reassigned if not. If needed it is better to rely on register
193  // coalescer to process such address tuples.
194  if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
195  return NSA_Status::FIXED;
196 
197  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
198 
199  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
200  return NSA_Status::FIXED;
201 
202  for (auto U : MRI->use_nodbg_operands(Reg)) {
203  if (U.isImplicit())
204  return NSA_Status::FIXED;
205  const MachineInstr *UseInst = U.getParent();
206  if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
207  return NSA_Status::FIXED;
208  }
209 
210  if (!LIS->hasInterval(Reg))
211  return NSA_Status::FIXED;
212  }
213 
214  if (I == 0)
215  VgprBase = PhysReg;
216  else if (VgprBase + I != PhysReg)
217  NSA = true;
218  }
219 
220  return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
221 }
222 
223 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
224  ST = &MF.getSubtarget<GCNSubtarget>();
225  if (ST->getGeneration() < GCNSubtarget::GFX10)
226  return false;
227 
228  MRI = &MF.getRegInfo();
229  TRI = ST->getRegisterInfo();
230  VRM = &getAnalysis<VirtRegMap>();
231  LRM = &getAnalysis<LiveRegMatrix>();
232  LIS = &getAnalysis<LiveIntervals>();
233 
235  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
236  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
237  CSRegs = MRI->getCalleeSavedRegs();
238 
239  using Candidate = std::pair<const MachineInstr*, bool>;
240  SmallVector<Candidate, 32> Candidates;
241  for (const MachineBasicBlock &MBB : MF) {
242  for (const MachineInstr &MI : MBB) {
243  switch (CheckNSA(MI)) {
244  default:
245  continue;
246  case NSA_Status::CONTIGUOUS:
247  Candidates.push_back(std::make_pair(&MI, true));
248  break;
249  case NSA_Status::NON_CONTIGUOUS:
250  Candidates.push_back(std::make_pair(&MI, false));
251  ++NumNSAInstructions;
252  break;
253  }
254  }
255  }
256 
257  bool Changed = false;
258  for (auto &C : Candidates) {
259  if (C.second)
260  continue;
261 
262  const MachineInstr *MI = C.first;
263  if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
264  // Already happen to be fixed.
265  C.second = true;
266  ++NumNSAConverted;
267  continue;
268  }
269 
271  int VAddr0Idx =
272  AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
273 
275  SmallVector<unsigned, 16> OrigRegs;
276  SlotIndex MinInd, MaxInd;
277  for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
278  const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
279  Register Reg = Op.getReg();
280  LiveInterval *LI = &LIS->getInterval(Reg);
281  if (llvm::find(Intervals, LI) != Intervals.end()) {
282  // Same register used, unable to make sequential
283  Intervals.clear();
284  break;
285  }
286  Intervals.push_back(LI);
287  OrigRegs.push_back(VRM->getPhys(Reg));
288  MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
289  MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
290  }
291 
292  if (Intervals.empty())
293  continue;
294 
295  LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
296  << "\tOriginal allocation:\t";
297  for(auto *LI : Intervals)
298  dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI);
299  dbgs() << '\n');
300 
301  bool Success = scavengeRegs(Intervals);
302  if (!Success) {
303  LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
304  if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation.
305  continue;
306  } else {
307  // Check we did not make it worse for other instructions.
308  auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
309  [this](const Candidate &C, SlotIndex I) {
310  return LIS->getInstructionIndex(*C.first) < I;
311  });
312  for (auto E = Candidates.end(); Success && I != E &&
313  LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
314  if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
315  Success = false;
316  LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
317  }
318  }
319  }
320 
321  if (!Success) {
322  for (unsigned I = 0; I < Info->VAddrDwords; ++I)
323  if (VRM->hasPhys(Intervals[I]->reg))
324  LRM->unassign(*Intervals[I]);
325 
326  for (unsigned I = 0; I < Info->VAddrDwords; ++I)
327  LRM->assign(*Intervals[I], OrigRegs[I]);
328 
329  continue;
330  }
331 
332  C.second = true;
333  ++NumNSAConverted;
334  LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t ["
335  << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI)
336  << " : "
337  << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI)
338  << "]\n");
339  Changed = true;
340  }
341 
342  return Changed;
343 }
auto lower_bound(R &&Range, T &&Value) -> decltype(adl_begin(Range))
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1261
uint64_t CallInst * C
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
AMDGPU specific subclass of TargetSubtarget.
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
This class represents lattice values for constants.
Definition: AllocatorList.h:23
char & GCNNSAReassignID
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:63
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:679
unsigned Reg
unsigned getSubReg() const
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:41
void initializeGCNNSAReassignPass(PassRegistry &)
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:410
#define DEBUG_TYPE
Analysis containing CSE Info
Definition: CSEInfo.cpp:20
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition: MCRegister.h:19
SlotIndex endIndex() const
endNumber - return the maximum point of the range of the whole, exclusive.
Definition: LiveInterval.h:383
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Represent the analysis usage information of a pass.
Fast - This calling convention attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:42
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
bool isCopy() const
size_t size() const
Definition: SmallVector.h:52
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:390
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", false, false) INITIALIZE_PASS_END(GCNNSAReassign
void setPreservesAll()
Set by analyses that do not transform their input at all.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
#define Success
Representation of each machine instruction.
Definition: MachineInstr.h:63
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
GCN NSA Reassign
#define I(x, y, z)
Definition: MD5.cpp:58
#define N
SlotIndex beginIndex() const
beginIndex - Return the lowest numbered slot covered.
Definition: LiveInterval.h:376
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
Register getReg() const
getReg - Returns the register number.
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:83
Wrapper class representing virtual and physical registers.
Definition: Register.h:19