LLVM  12.0.0git
GCNRegBankReassign.cpp
Go to the documentation of this file.
1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
11 /// conflicts.
12 ///
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17 ///
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incurring 2 stall cycles.
24 ///
25 /// The pass tries to reassign registers to reduce bank conflicts.
26 ///
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
30 ///
31 //===----------------------------------------------------------------------===//
32 
33 #include "AMDGPU.h"
34 #include "AMDGPUSubtarget.h"
36 #include "SIInstrInfo.h"
37 #include "SIMachineFunctionInfo.h"
38 #include "llvm/ADT/SmallSet.h"
39 #include "llvm/ADT/Statistic.h"
46 #include "llvm/InitializePasses.h"
48 
49 using namespace llvm;
50 
51 static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
52  cl::desc("Verify stall cycles in the regbanks reassign pass"),
53  cl::value_desc("0|1|2"),
54  cl::init(0), cl::Hidden);
55 
#define DEBUG_TYPE "amdgpu-regbanks-reassign"

// Bank ids used by the pass: [0, NUM_VGPR_BANKS) are VGPR banks and the SGPR
// banks follow directly after them. A bank id is also the bit position of
// that bank in a bank mask.
#define NUM_VGPR_BANKS 4
#define NUM_SGPR_BANKS 8
#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)

// One bit per VGPR bank (0xf).
#define VGPR_BANK_MASK ((1 << NUM_VGPR_BANKS) - 1)
// One bit per SGPR bank, counted from SGPR bank 0 (0xff).
#define SGPR_BANK_SHIFTED_MASK ((1 << NUM_SGPR_BANKS) - 1)
// The SGPR bank bits at their position in a combined bank mask (0xff0).
#define SGPR_BANK_MASK (SGPR_BANK_SHIFTED_MASK << SGPR_BANK_OFFSET)
65 
66 STATISTIC(NumStallsDetected,
67  "Number of operand read stalls detected");
68 STATISTIC(NumStallsRecovered,
69  "Number of operand read stalls recovered");
70 
71 namespace {
72 
73 class GCNRegBankReassign : public MachineFunctionPass {
74 
75  class OperandMask {
76  public:
77  OperandMask(unsigned r, unsigned s, unsigned m)
78  : Reg(r), SubReg(s), Mask(m) {}
79  unsigned Reg;
80  unsigned SubReg;
81  unsigned Mask;
82  };
83 
84  class Candidate {
85  public:
86  Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
87  unsigned weight)
88  : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
89 
90  bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
91 
92 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
93  void dump(const GCNRegBankReassign *P) const {
94  MI->dump();
95  dbgs() << P->printReg(Reg) << " to banks ";
96  dumpFreeBanks(FreeBanks);
97  dbgs() << " weight " << Weight << '\n';
98  }
99 #endif
100 
101  MachineInstr *MI;
102  unsigned Reg;
103  unsigned FreeBanks;
104  unsigned Weight;
105  };
106 
107  class CandidateList : public std::list<Candidate> {
108  public:
109  // Speedup subsequent sort.
110  void push(const Candidate&& C) {
111  if (C.Weight) push_back(C);
112  else push_front(C);
113  }
114  };
115 
116 public:
117  static char ID;
118 
119 public:
120  GCNRegBankReassign() : MachineFunctionPass(ID) {
122  }
123 
124  bool runOnMachineFunction(MachineFunction &MF) override;
125 
126  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
127 
128  void getAnalysisUsage(AnalysisUsage &AU) const override {
131  AU.addRequired<VirtRegMap>();
133  AU.setPreservesAll();
135  }
136 
137 private:
138  const GCNSubtarget *ST;
139 
140  const MachineRegisterInfo *MRI;
141 
142  const SIRegisterInfo *TRI;
143 
144  MachineLoopInfo *MLI;
145 
146  VirtRegMap *VRM;
147 
148  LiveRegMatrix *LRM;
149 
150  LiveIntervals *LIS;
151 
152  unsigned MaxNumVGPRs;
153 
154  unsigned MaxNumSGPRs;
155 
156  BitVector RegsUsed;
157 
158  SmallVector<OperandMask, 8> OperandMasks;
159 
160  CandidateList Candidates;
161 
162  const MCPhysReg *CSRegs;
163 
164  // Returns bank for a phys reg.
165  unsigned getPhysRegBank(unsigned Reg) const;
166 
167  // Return a bit set for each register bank used. 4 banks for VGPRs and
168  // 8 banks for SGPRs.
169  // Registers already processed and recorded in RegsUsed are excluded.
170  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
171  uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
172 
173  // Analyze one instruction returning the number of stalls and a mask of the
174  // banks used by all operands.
175  // If Reg and Bank are provided, assume all uses of Reg will be replaced with
176  // a register chosen from Bank.
177  std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
178  unsigned Reg = AMDGPU::NoRegister,
179  int Bank = -1);
180 
181  // Return true if register is regular VGPR or SGPR or their tuples.
182  // Returns false for special registers like m0, vcc etc.
183  bool isReassignable(unsigned Reg) const;
184 
185  // Check if registers' defs are old and may be pre-loaded.
186  // Returns 0 if both registers are old enough, 1 or 2 if one or both
187  // registers will not likely be pre-loaded.
188  unsigned getOperandGatherWeight(const MachineInstr& MI,
189  unsigned Reg1,
190  unsigned Reg2,
191  unsigned StallCycles) const;
192 
193 
194  // Find all bank bits in UsedBanks where Mask can be relocated to.
195  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
196 
197  // Find all bank bits in UsedBanks where Mask can be relocated to.
198  // Bank is relative to the register and not its subregister component.
199  // Returns 0 is a register is not reassignable.
200  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
201  unsigned UsedBanks) const;
202 
203  // Add cadidate instruction to the work list.
204  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
205  unsigned StallCycles);
206 
207  // Collect cadidate instructions across function. Returns a number stall
208  // cycles detected. Only counts stalls if Collect is false.
209  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
210 
211  // Remove all candidates that read specified register.
212  void removeCandidates(unsigned Reg);
213 
214  // Compute stalls within the uses of SrcReg replaced by a register from
215  // Bank. If Bank is -1 does not perform substitution. If Collect is set
216  // candidates are collected and added to work list.
217  unsigned computeStallCycles(unsigned SrcReg,
218  unsigned Reg = AMDGPU::NoRegister,
219  int Bank = -1, bool Collect = false);
220 
221  // Search for a register in Bank unused within LI.
222  // Returns phys reg or NoRegister.
223  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
224 
225  // Try to reassign candidate. Returns number or stall cycles saved.
226  unsigned tryReassign(Candidate &C);
227 
228  bool verifyCycles(MachineFunction &MF,
229  unsigned OriginalCycles, unsigned CyclesSaved);
230 
231 
232 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
233 public:
234  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
235  return Printable([Reg, SubReg, this](raw_ostream &OS) {
236  if (Register::isPhysicalRegister(Reg)) {
237  OS << llvm::printReg(Reg, TRI);
238  return;
239  }
240  if (!VRM->isAssignedReg(Reg))
241  OS << "<unassigned> " << llvm::printReg(Reg, TRI);
242  else
243  OS << llvm::printReg(Reg, TRI) << '('
244  << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
245  if (SubReg)
246  OS << ':' << TRI->getSubRegIndexName(SubReg);
247  });
248  }
249 
250  static Printable printBank(unsigned Bank) {
251  return Printable([Bank](raw_ostream &OS) {
252  OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
253  });
254  }
255 
256  static void dumpFreeBanks(unsigned FreeBanks) {
257  for (unsigned L = 0; L < NUM_BANKS; ++L)
258  if (FreeBanks & (1 << L))
259  dbgs() << printBank(L) << ' ';
260  }
261 #endif
262 };
263 
264 } // End anonymous namespace.
265 
266 INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
267  false, false)
272 INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
273  false, false)
274 
275 
276 char GCNRegBankReassign::ID = 0;
277 
278 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
279 
280 unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
282 
283  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
284  unsigned Size = TRI->getRegSizeInBits(*RC);
285  if (Size == 16)
286  Reg = TRI->get32BitRegister(Reg);
287  else if (Size > 32)
288  Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
289 
290  if (TRI->hasVGPRs(RC)) {
291  Reg -= AMDGPU::VGPR0;
292  return Reg % NUM_VGPR_BANKS;
293  }
294 
295  Reg = TRI->getEncodingValue(Reg) / 2;
296  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
297 }
298 
299 uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
300  int Bank) {
301  if (Register::isVirtualRegister(Reg)) {
302  if (!VRM->isAssignedReg(Reg))
303  return 0;
304 
305  Reg = VRM->getPhys(Reg);
306  if (!Reg)
307  return 0;
308  if (SubReg)
309  Reg = TRI->getSubReg(Reg, SubReg);
310  }
311 
312  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
313  unsigned Size = TRI->getRegSizeInBits(*RC);
314 
315  if (Size == 16) {
316  Reg = TRI->get32BitRegister(Reg);
317  Size = 1;
318  } else {
319  Size /= 32;
320  if (Size > 1)
321  Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
322  }
323 
324  if (TRI->hasVGPRs(RC)) {
325  // VGPRs have 4 banks assigned in a round-robin fashion.
326  Reg -= AMDGPU::VGPR0;
327  uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
328  unsigned Used = 0;
329  // Bitmask lacks an extract method
330  for (unsigned I = 0; I < Size; ++I)
331  if (RegsUsed.test(Reg + I))
332  Used |= 1 << I;
333  RegsUsed.set(Reg, Reg + Size);
334  Mask &= ~Used;
335  Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank);
336  return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
337  }
338 
339  // SGPRs have 8 banks holding 2 consequitive registers each.
340  Reg = TRI->getEncodingValue(Reg) / 2;
341  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
342  if (Reg + StartBit >= RegsUsed.size())
343  return 0;
344 
345  if (Size > 1)
346  Size /= 2;
347  unsigned Mask = (1 << Size) - 1;
348  unsigned Used = 0;
349  for (unsigned I = 0; I < Size; ++I)
350  if (RegsUsed.test(StartBit + Reg + I))
351  Used |= 1 << I;
352  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
353  Mask &= ~Used;
354  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
355  : unsigned(Bank - SGPR_BANK_OFFSET);
356  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
357  // Reserve 4 bank ids for VGPRs.
358  return Mask << SGPR_BANK_OFFSET;
359 }
360 
361 std::pair<unsigned, unsigned>
362 GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
363  int Bank) {
364  unsigned StallCycles = 0;
365  unsigned UsedBanks = 0;
366 
367  if (MI.isDebugValue())
368  return std::make_pair(StallCycles, UsedBanks);
369 
370  RegsUsed.reset();
371  OperandMasks.clear();
372  for (const auto& Op : MI.explicit_uses()) {
373  // Undef can be assigned to any register, so two vregs can be assigned
374  // the same phys reg within the same instruction.
375  if (!Op.isReg() || Op.isUndef())
376  continue;
377 
378  Register R = Op.getReg();
379  if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
380  continue;
381 
382  unsigned ShiftedBank = Bank;
383 
384  if (Bank != -1 && R == Reg && Op.getSubReg()) {
385  unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
386  LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
387  if (Offset && Bank < NUM_VGPR_BANKS) {
388  // If a register spans all banks we cannot shift it to avoid conflict.
389  if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
390  continue;
391  ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
392  } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
393  // If a register spans all banks we cannot shift it to avoid conflict.
394  if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
395  continue;
396  ShiftedBank = SGPR_BANK_OFFSET +
397  (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
398  }
399  }
400 
401  uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
402  (Reg == R) ? ShiftedBank : -1);
403  StallCycles += countPopulation(UsedBanks & Mask);
404  UsedBanks |= Mask;
405  OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
406  }
407 
408  return std::make_pair(StallCycles, UsedBanks);
409 }
410 
411 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
412  unsigned Reg1,
413  unsigned Reg2,
414  unsigned StallCycles) const
415 {
416  unsigned Defs = 0;
419  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
420  if (MI.isDebugInstr())
421  continue;
422  --Def;
423  if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
424  continue;
425  if (Def->modifiesRegister(Reg1, TRI))
426  Defs |= 1;
427  if (Def->modifiesRegister(Reg2, TRI))
428  Defs |= 2;
429  }
430  return countPopulation(Defs);
431 }
432 
433 bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
434  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
435  return false;
436 
437  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
438 
439  Register PhysReg = VRM->getPhys(Reg);
440 
441  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
442  return false;
443 
444  for (auto U : MRI->use_nodbg_operands(Reg)) {
445  if (U.isImplicit())
446  return false;
447  const MachineInstr *UseInst = U.getParent();
448  if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
449  return false;
450  }
451 
452  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
453  unsigned Size = TRI->getRegSizeInBits(*RC);
454 
455  // TODO: Support 16 bit registers. Those needs to be moved with their
456  // parent VGPR_32 and potentially a sibling 16 bit sub-register.
457  if (Size < 32)
458  return false;
459 
460  if (TRI->hasVGPRs(RC))
461  return true;
462 
463  if (Size == 16)
464  return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
465 
466  if (Size > 32)
467  PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
468 
469  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
470 }
471 
472 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
473  unsigned UsedBanks) const {
474  unsigned Size = countPopulation(Mask);
475  unsigned FreeBanks = 0;
476  unsigned Bank = findFirstSet(Mask);
477 
478  UsedBanks &= ~Mask;
479 
480  // Find free VGPR banks
481  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
482  for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
483  if (Bank == I)
484  continue;
485  unsigned NewMask = ((1 << Size) - 1) << I;
486  NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
487  if (!(UsedBanks & NewMask))
488  FreeBanks |= 1 << I;
489  }
490  return FreeBanks;
491  }
492 
493  // Find free SGPR banks
494  // SGPR tuples must be aligned, so step is size in banks it
495  // crosses.
496  Bank -= SGPR_BANK_OFFSET;
497  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
498  if (Bank == I)
499  continue;
500  unsigned NewMask = ((1 << Size) - 1) << I;
501  NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
502  if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
503  FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
504  }
505 
506  return FreeBanks;
507 }
508 
509 unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
510  unsigned SubReg,
511  unsigned Mask,
512  unsigned UsedBanks) const {
513  if (!isReassignable(Reg))
514  return 0;
515 
516  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
517 
518  unsigned Offset = TRI->getChannelFromSubReg(SubReg);
519  if (Offset && (Mask & VGPR_BANK_MASK)) {
520  unsigned Shift = Offset;
521  if (Shift >= NUM_VGPR_BANKS)
522  return 0;
523  unsigned VB = FreeBanks & VGPR_BANK_MASK;
524  FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
526  } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
527  unsigned Shift = Offset >> 1;
528  if (Shift >= NUM_SGPR_BANKS)
529  return 0;
530  unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
531  FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
533  FreeBanks <<= SGPR_BANK_OFFSET;
534  }
535 
536  LLVM_DEBUG(if (FreeBanks) {
537  dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
538  << " to banks: "; dumpFreeBanks(FreeBanks);
539  dbgs() << '\n'; });
540 
541  return FreeBanks;
542 }
543 
544 void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
545  unsigned UsedBanks,
546  unsigned StallCycles) {
547  LLVM_DEBUG(MI.dump());
548 
549  if (!StallCycles)
550  return;
551 
552  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
553 
554  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
555  for (unsigned J = I + 1; J != E; ++J) {
556  if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
557  continue;
558 
559  unsigned Reg1 = OperandMasks[I].Reg;
560  unsigned Reg2 = OperandMasks[J].Reg;
561  unsigned SubReg1 = OperandMasks[I].SubReg;
562  unsigned SubReg2 = OperandMasks[J].SubReg;
563  unsigned Mask1 = OperandMasks[I].Mask;
564  unsigned Mask2 = OperandMasks[J].Mask;
565  unsigned Size1 = countPopulation(Mask1);
566  unsigned Size2 = countPopulation(Mask2);
567 
568  LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
569  " and " << printReg(Reg2, SubReg2) << '\n');
570 
571  unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
572  Weight += MLI->getLoopDepth(MI.getParent()) * 10;
573 
574  LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
575 
576  unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
577  unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
578  if (FreeBanks1)
579  Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
580  + ((Size2 > Size1) ? 1 : 0)));
581  if (FreeBanks2)
582  Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
583  + ((Size1 > Size2) ? 1 : 0)));
584  }
585  }
586 }
587 
588 unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
589  unsigned Reg, int Bank,
590  bool Collect) {
591  unsigned TotalStallCycles = 0;
593 
594  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
595  if (MI.isBundle())
596  continue;
597  if (!Visited.insert(&MI).second)
598  continue;
599  unsigned StallCycles;
600  unsigned UsedBanks;
601  std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
602  TotalStallCycles += StallCycles;
603  if (Collect)
604  collectCandidates(MI, UsedBanks, StallCycles);
605  }
606 
607  return TotalStallCycles;
608 }
609 
610 unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
611  unsigned Bank) const {
612  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
613  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
614  : MaxNumSGPRs;
615  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
616  : AMDGPU::SGPR0);
617 
618  for (unsigned Reg : RC->getRegisters()) {
619  // Check occupancy limit.
620  if (TRI->isSubRegisterEq(Reg, MaxReg))
621  break;
622 
623  if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
624  continue;
625 
626  for (unsigned I = 0; CSRegs[I]; ++I)
627  if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
628  !LRM->isPhysRegUsed(CSRegs[I]))
629  return AMDGPU::NoRegister;
630 
631  LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
632 
633  if (!LRM->checkInterference(LI, Reg))
634  return Reg;
635  }
636 
637  return AMDGPU::NoRegister;
638 }
639 
640 unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
641  if (!LIS->hasInterval(C.Reg))
642  return 0;
643 
644  LiveInterval &LI = LIS->getInterval(C.Reg);
645  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
646  LI.dump());
647 
648  // For each candidate bank walk all instructions in the range of live
649  // interval and check if replacing the register with one belonging to
650  // the candidate bank reduces conflicts.
651 
652  unsigned OrigStalls = computeStallCycles(C.Reg);
653  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
654  if (!OrigStalls)
655  return 0;
656 
657  struct BankStall {
658  BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
659  bool operator<(const BankStall &RHS) const {
660  if (Stalls == RHS.Stalls)
661  return Bank < RHS.Bank;
662  return Stalls > RHS.Stalls;
663  }
664  unsigned Bank;
665  unsigned Stalls;
666  };
667  SmallVector<BankStall, 8> BankStalls;
668 
669  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
670  if (C.FreeBanks & (1 << Bank)) {
671  LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
672  unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
673  if (Stalls < OrigStalls) {
674  LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
675  << Stalls << '\n');
676  BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
677  }
678  }
679  }
680  llvm::sort(BankStalls);
681 
682  Register OrigReg = VRM->getPhys(C.Reg);
683  LRM->unassign(LI);
684  while (!BankStalls.empty()) {
685  BankStall BS = BankStalls.pop_back_val();
686  unsigned Reg = scavengeReg(LI, BS.Bank);
687  if (Reg == AMDGPU::NoRegister) {
688  LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
689  << '\n');
690  continue;
691  }
692  LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
693  << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
694  << " in bank " << printBank(BS.Bank) << '\n');
695 
696  LRM->assign(LI, Reg);
697 
698  LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
699 
700  return OrigStalls - BS.Stalls;
701  }
702  LRM->assign(LI, OrigReg);
703 
704  return 0;
705 }
706 
707 unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
708  bool Collect) {
709  unsigned TotalStallCycles = 0;
710 
711  for (MachineBasicBlock &MBB : MF) {
712 
713  LLVM_DEBUG(if (Collect) {
714  if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
715  else dbgs() << MBB.getName(); dbgs() << ":\n";
716  });
717 
718  for (MachineInstr &MI : MBB.instrs()) {
719  if (MI.isBundle())
720  continue; // we analyze the instructions inside the bundle individually
721 
722  unsigned StallCycles;
723  unsigned UsedBanks;
724  std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
725 
726  if (Collect)
727  collectCandidates(MI, UsedBanks, StallCycles);
728 
729  TotalStallCycles += StallCycles;
730  }
731 
732  LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
733  }
734 
735  return TotalStallCycles;
736 }
737 
738 void GCNRegBankReassign::removeCandidates(unsigned Reg) {
739  Candidates.remove_if([Reg, this](const Candidate& C) {
740  return C.MI->readsRegister(Reg, TRI);
741  });
742 }
743 
744 bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
745  unsigned OriginalCycles,
746  unsigned CyclesSaved) {
747  unsigned StallCycles = collectCandidates(MF, false);
748  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
749  << " stall cycles left\n");
750  return StallCycles + CyclesSaved == OriginalCycles;
751 }
752 
753 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
754  ST = &MF.getSubtarget<GCNSubtarget>();
755  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
756  return false;
757 
758  MRI = &MF.getRegInfo();
759  TRI = ST->getRegisterInfo();
760  MLI = &getAnalysis<MachineLoopInfo>();
761  VRM = &getAnalysis<VirtRegMap>();
762  LRM = &getAnalysis<LiveRegMatrix>();
763  LIS = &getAnalysis<LiveIntervals>();
764 
766  unsigned Occupancy = MFI->getOccupancy();
767  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
768  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
769  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
770  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
771 
772  CSRegs = MRI->getCalleeSavedRegs();
773 
774  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
775  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
776 
777  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
778  << '\n');
779 
780  unsigned StallCycles = collectCandidates(MF);
781  NumStallsDetected += StallCycles;
782 
783  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
784  "function " << MF.getName() << '\n');
785 
786  Candidates.sort();
787 
788  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
789  for (auto C : Candidates) C.dump(this);
790  dbgs() << "\n\n");
791 
792  unsigned CyclesSaved = 0;
793  while (!Candidates.empty()) {
794  Candidate C = Candidates.back();
795  unsigned LocalCyclesSaved = tryReassign(C);
796  CyclesSaved += LocalCyclesSaved;
797 
798  if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
799  report_fatal_error("RegBank reassign stall cycles verification failed.");
800 
801  Candidates.pop_back();
802  if (LocalCyclesSaved) {
803  removeCandidates(C.Reg);
804  computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
805  Candidates.sort();
806 
807  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
808  for (auto C : Candidates)
809  C.dump(this);
810  dbgs() << "\n\n");
811  }
812  }
813  NumStallsRecovered += CyclesSaved;
814 
815  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
816  << " cycles saved in function " << MF.getName() << '\n');
817 
818  Candidates.clear();
819 
820  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
821  report_fatal_error("RegBank reassign stall cycles verification failed.");
822 
823  RegsUsed.clear();
824 
825  return CyclesSaved > 0;
826 }
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
Definition: BitVector.h:372
uint64_t CallInst * C
BitVector & set()
Definition: BitVector.h:398
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
instr_iterator instr_begin()
const unsigned reg
Definition: LiveInterval.h:709
AMDGPU specific subclass of TargetSubtarget.
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
This class represents lattice values for constants.
Definition: AllocatorList.h:23
InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg)
Check for interference before assigning VirtReg to PhysReg.
INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) INITIALIZE_PASS_END(GCNRegBankReassign
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:65
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:680
unsigned Reg
bool test(unsigned Idx) const
Definition: BitVector.h:502
unsigned getLoopDepth(const MachineBasicBlock *BB) const
Return the loop nesting level of the specified block.
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
Function & getFunction()
Return the LLVM function that this machine code represents.
MachineBasicBlock & MBB
AnalysisUsage & addRequired()
void clear()
clear - Removes all bits from the bitvector. Does not change capacity.
Definition: BitVector.h:367
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
#define SGPR_BANK_OFFSET
unsigned SubReg
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E&#39;s largest value.
Definition: BitmaskEnum.h:80
#define VGPR_BANK_MASK
LLVM_NODISCARD bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:156
bool isPhysRegUsed(unsigned PhysReg) const
Returns true if the given PhysReg has any live intervals assigned.
unsigned getChannelFromSubReg(unsigned SubReg) const
static unsigned getNumCoveredRegs(LaneBitmask LM)
bool hasInterval(Register Reg) const
bool isBundle() const
void assign(LiveInterval &VirtReg, unsigned PhysReg)
Assign VirtReg to PhysReg.
#define SGPR_BANK_SHIFTED_MASK
void dump() const
Definition: Pass.cpp:131
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they&#39;re not in a MachineFuncti...
bool isAssignedReg(Register virtReg) const
returns true if the specified virtual register is not mapped to a stack slot or rematerialized.
Definition: VirtRegMap.h:154
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn&#39;t been...
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition: MCRegister.h:19
#define NUM_VGPR_BANKS
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:434
iterator_range< SmallVectorImpl< MCPhysReg >::const_iterator > getRegisters() const
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define NUM_BANKS
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
static void push(SmallVectorImpl< uint64_t > &R, StringRef Str)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static cl::opt< unsigned > VerifyStallCycles("amdgpu-verify-regbanks-reassign", cl::desc("Verify stall cycles in the regbanks reassign pass"), cl::value_desc("0|1|2"), cl::init(0), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
bool hasVGPRs(const TargetRegisterClass *RC) const
Represent the analysis usage information of a pass.
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:305
BitVector & reset()
Definition: BitVector.h:439
bool hasRegisterBanking() const
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
self_iterator getIterator()
Definition: ilist_node.h:81
LiveInterval & getInterval(Register Reg)
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:180
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1433
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(Register Reg) const
bool isCopy() const
void unassign(LiveInterval &VirtReg)
Unassign VirtReg from its PhysReg.
T findFirstSet(T Val, ZeroBehavior ZB=ZB_Max)
Get the index of the first set bit starting from the least significant bit.
Definition: MathExtras.h:240
bool isDebugInstr() const
MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
iterator_range< mop_iterator > explicit_uses()
Definition: MachineInstr.h:610
#define DEBUG_TYPE
void initializeGCNRegBankReassignPass(PassRegistry &)
Iterator for intrusive lists based on ilist_node.
unsigned countPopulation(T Value)
Count the number of set bits in a value.
Definition: MathExtras.h:568
bool hasAGPRs(const TargetRegisterClass *RC) const
bool isDebugValue() const
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:883
GCN RegBank Reassign
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:420
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void setPreservesAll()
Set by analyses that do not transform their input at all.
#define NUM_SGPR_BANKS
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:280
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:62
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Register getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition: VirtRegMap.h:101
#define I(x, y, z)
Definition: MD5.cpp:59
MCPhysReg get32BitRegister(MCPhysReg Reg) const
size_type size() const
size - Returns the number of bits in this bitvector.
Definition: BitVector.h:170
uint32_t Size
Definition: Profile.cpp:46
char & GCNRegBankReassignID
iterator_range< use_nodbg_iterator > use_nodbg_operands(Register Reg) const
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool operator<(int64_t V1, const APSInt &V2)
Definition: APSInt.h:343
#define SGPR_BANK_MASK
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:71
const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
Register getReg() const
getReg - Returns the register number.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition: Printable.h:37
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:466
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
const SIRegisterInfo * getRegisterInfo() const override