LLVM  10.0.0svn
GCNRegBankReassign.cpp
Go to the documentation of this file.
1 //===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ to reduce register bank
11 /// conflicts.
12 ///
13 /// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
14 /// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
15 /// bank 1, etc. SGPRs have 8 banks and allocated in pairs, so that s0:s1,
16 /// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
17 ///
18 /// The shader can read one dword from each of these banks once per cycle.
19 /// If an instruction has to read more register operands from the same bank
20 /// an additional cycle is needed. HW attempts to pre-load registers through
21 /// input operand gathering, but a stall cycle may occur if that fails. For
22 /// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
23 /// potentially incuring 2 stall cycles.
24 ///
25 /// The pass tries to reassign registers to reduce bank conflicts.
26 ///
27 /// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
28 /// that 4 has to be subtracted from an SGPR bank number to get the real value.
29 /// This also corresponds to bit numbers in bank masks used in the pass.
30 ///
31 //===----------------------------------------------------------------------===//
32 
33 #include "AMDGPU.h"
34 #include "AMDGPUSubtarget.h"
35 #include "SIInstrInfo.h"
36 #include "SIMachineFunctionInfo.h"
38 #include "llvm/ADT/SmallSet.h"
39 #include "llvm/ADT/Statistic.h"
47 
48 using namespace llvm;
49 
50 static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
51  cl::desc("Verify stall cycles in the regbanks reassign pass"),
52  cl::value_desc("0|1|2"),
53  cl::init(0), cl::Hidden);
54 
55 #define DEBUG_TYPE "amdgpu-regbanks-reassign"
56 
57 #define NUM_VGPR_BANKS 4
58 #define NUM_SGPR_BANKS 8
59 #define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
60 #define SGPR_BANK_OFFSET NUM_VGPR_BANKS
61 #define VGPR_BANK_MASK 0xf
62 #define SGPR_BANK_MASK 0xff0
63 #define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
64 
65 STATISTIC(NumStallsDetected,
66  "Number of operand read stalls detected");
67 STATISTIC(NumStallsRecovered,
68  "Number of operand read stalls recovered");
69 
70 namespace {
71 
72 class GCNRegBankReassign : public MachineFunctionPass {
73 
74  class OperandMask {
75  public:
76  OperandMask(unsigned r, unsigned s, unsigned m)
77  : Reg(r), SubReg(s), Mask(m) {}
78  unsigned Reg;
79  unsigned SubReg;
80  unsigned Mask;
81  };
82 
83  class Candidate {
84  public:
85  Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
86  unsigned weight)
87  : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
88 
89  bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
90 
91 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
92  void dump(const GCNRegBankReassign *P) const {
93  MI->dump();
94  dbgs() << P->printReg(Reg) << " to banks ";
95  dumpFreeBanks(FreeBanks);
96  dbgs() << " weight " << Weight << '\n';
97  }
98 #endif
99 
100  MachineInstr *MI;
101  unsigned Reg;
102  unsigned FreeBanks;
103  unsigned Weight;
104  };
105 
106  class CandidateList : public std::list<Candidate> {
107  public:
108  // Speedup subsequent sort.
109  void push(const Candidate&& C) {
110  if (C.Weight) push_back(C);
111  else push_front(C);
112  }
113  };
114 
115 public:
116  static char ID;
117 
118 public:
119  GCNRegBankReassign() : MachineFunctionPass(ID) {
121  }
122 
123  bool runOnMachineFunction(MachineFunction &MF) override;
124 
125  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
126 
127  void getAnalysisUsage(AnalysisUsage &AU) const override {
130  AU.addRequired<VirtRegMap>();
132  AU.setPreservesAll();
134  }
135 
136 private:
137  const GCNSubtarget *ST;
138 
139  const MachineRegisterInfo *MRI;
140 
141  const SIRegisterInfo *TRI;
142 
143  MachineLoopInfo *MLI;
144 
145  VirtRegMap *VRM;
146 
147  LiveRegMatrix *LRM;
148 
149  LiveIntervals *LIS;
150 
151  unsigned MaxNumVGPRs;
152 
153  unsigned MaxNumSGPRs;
154 
155  BitVector RegsUsed;
156 
157  SmallVector<OperandMask, 8> OperandMasks;
158 
159  CandidateList Candidates;
160 
161  const MCPhysReg *CSRegs;
162 
163  // Returns bank for a phys reg.
164  unsigned getPhysRegBank(unsigned Reg) const;
165 
166  // Return a bit set for each register bank used. 4 banks for VGPRs and
167  // 8 banks for SGPRs.
168  // Registers already processed and recorded in RegsUsed are excluded.
169  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
170  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
171 
172  // Return number of stalls in the instructions.
173  // UsedBanks has bits set for the banks used by all operands.
174  // If Reg and Bank provided substitute the Reg with the Bank.
175  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
176  unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
177 
178  // Return true if register is regular VGPR or SGPR or their tuples.
179  // Returns false for special registers like m0, vcc etc.
180  bool isReassignable(unsigned Reg) const;
181 
182  // Check if registers' defs are old and may be pre-loaded.
183  // Returns 0 if both registers are old enough, 1 or 2 if one or both
184  // registers will not likely be pre-loaded.
185  unsigned getOperandGatherWeight(const MachineInstr& MI,
186  unsigned Reg1,
187  unsigned Reg2,
188  unsigned StallCycles) const;
189 
190 
191  // Find all bank bits in UsedBanks where Mask can be relocated to.
192  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
193 
194  // Find all bank bits in UsedBanks where Mask can be relocated to.
195  // Bank is relative to the register and not its subregister component.
196  // Returns 0 is a register is not reassignable.
197  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
198  unsigned UsedBanks) const;
199 
200  // Add cadidate instruction to the work list.
201  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
202  unsigned StallCycles);
203 
204  // Collect cadidate instructions across function. Returns a number stall
205  // cycles detected. Only counts stalls if Collect is false.
206  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
207 
208  // Remove all candidates that read specified register.
209  void removeCandidates(unsigned Reg);
210 
211  // Compute stalls within the uses of SrcReg replaced by a register from
212  // Bank. If Bank is -1 does not perform substitution. If Collect is set
213  // candidates are collected and added to work list.
214  unsigned computeStallCycles(unsigned SrcReg,
215  unsigned Reg = AMDGPU::NoRegister,
216  int Bank = -1, bool Collect = false);
217 
218  // Search for a register in Bank unused within LI.
219  // Returns phys reg or NoRegister.
220  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
221 
222  // Try to reassign candidate. Returns number or stall cycles saved.
223  unsigned tryReassign(Candidate &C);
224 
225  bool verifyCycles(MachineFunction &MF,
226  unsigned OriginalCycles, unsigned CyclesSaved);
227 
228 
229 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
230 public:
231  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
232  return Printable([Reg, SubReg, this](raw_ostream &OS) {
233  if (Register::isPhysicalRegister(Reg)) {
234  OS << llvm::printReg(Reg, TRI);
235  return;
236  }
237  if (!VRM->isAssignedReg(Reg))
238  OS << "<unassigned> " << llvm::printReg(Reg, TRI);
239  else
240  OS << llvm::printReg(Reg, TRI) << '('
241  << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
242  if (SubReg)
243  OS << ':' << TRI->getSubRegIndexName(SubReg);
244  });
245  }
246 
247  static Printable printBank(unsigned Bank) {
248  return Printable([Bank](raw_ostream &OS) {
249  OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
250  });
251  }
252 
253  static void dumpFreeBanks(unsigned FreeBanks) {
254  for (unsigned L = 0; L < NUM_BANKS; ++L)
255  if (FreeBanks & (1 << L))
256  dbgs() << printBank(L) << ' ';
257  }
258 #endif
259 };
260 
261 } // End anonymous namespace.
262 
263 INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
264  false, false)
269 INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
270  false, false)
271 
272 
273 char GCNRegBankReassign::ID = 0;
274 
275 char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
276 
277 unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
279 
280  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
281  unsigned Size = TRI->getRegSizeInBits(*RC);
282  if (Size > 32)
283  Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
284 
285  if (TRI->hasVGPRs(RC)) {
286  Reg -= AMDGPU::VGPR0;
287  return Reg % NUM_VGPR_BANKS;
288  }
289 
290  Reg = TRI->getEncodingValue(Reg) / 2;
291  return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
292 }
293 
294 unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
295  int Bank) {
296  if (Register::isVirtualRegister(Reg)) {
297  if (!VRM->isAssignedReg(Reg))
298  return 0;
299 
300  Reg = VRM->getPhys(Reg);
301  if (!Reg)
302  return 0;
303  if (SubReg)
304  Reg = TRI->getSubReg(Reg, SubReg);
305  }
306 
307  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
308  unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
309  if (Size > 1)
310  Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
311 
312  if (TRI->hasVGPRs(RC)) {
313  // VGPRs have 4 banks assigned in a round-robin fashion.
314  Reg -= AMDGPU::VGPR0;
315  unsigned Mask = (1 << Size) - 1;
316  unsigned Used = 0;
317  // Bitmask lacks an extract method
318  for (unsigned I = 0; I < Size; ++I)
319  if (RegsUsed.test(Reg + I))
320  Used |= 1 << I;
321  RegsUsed.set(Reg, Reg + Size);
322  Mask &= ~Used;
323  Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
324  return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
325  }
326 
327  // SGPRs have 8 banks holding 2 consequitive registers each.
328  Reg = TRI->getEncodingValue(Reg) / 2;
329  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
330  if (Reg + StartBit >= RegsUsed.size())
331  return 0;
332 
333  if (Size > 1)
334  Size /= 2;
335  unsigned Mask = (1 << Size) - 1;
336  unsigned Used = 0;
337  for (unsigned I = 0; I < Size; ++I)
338  if (RegsUsed.test(StartBit + Reg + I))
339  Used |= 1 << I;
340  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
341  Mask &= ~Used;
342  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
343  : unsigned(Bank - SGPR_BANK_OFFSET);
344  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
345  // Reserve 4 bank ids for VGPRs.
346  return Mask << SGPR_BANK_OFFSET;
347 }
348 
349 unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
350  unsigned& UsedBanks,
351  unsigned Reg,
352  int Bank) {
353  unsigned StallCycles = 0;
354  UsedBanks = 0;
355 
356  if (MI.isDebugValue())
357  return 0;
358 
359  RegsUsed.reset();
360  OperandMasks.clear();
361  for (const auto& Op : MI.explicit_uses()) {
362  // Undef can be assigned to any register, so two vregs can be assigned
363  // the same phys reg within the same instruction.
364  if (!Op.isReg() || Op.isUndef())
365  continue;
366 
367  Register R = Op.getReg();
368  if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
369  continue;
370 
371  unsigned ShiftedBank = Bank;
372 
373  if (Bank != -1 && R == Reg && Op.getSubReg()) {
374  unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
375  if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
376  // If a register spans all banks we cannot shift it to avoid conflict.
377  if (countPopulation(LM) >= NUM_VGPR_BANKS)
378  continue;
379  ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
380  } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
381  // If a register spans all banks we cannot shift it to avoid conflict.
382  if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
383  continue;
384  ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
385  (countTrailingZeros(LM) >> 1)) %
387  }
388  }
389 
390  unsigned Mask = getRegBankMask(R, Op.getSubReg(),
391  (Reg == R) ? ShiftedBank : -1);
392  StallCycles += countPopulation(UsedBanks & Mask);
393  UsedBanks |= Mask;
394  OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
395  }
396 
397  return StallCycles;
398 }
399 
400 unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
401  unsigned Reg1,
402  unsigned Reg2,
403  unsigned StallCycles) const
404 {
405  unsigned Defs = 0;
408  for (unsigned S = StallCycles; S && Def != B && Defs != 3; --S) {
409  if (MI.isDebugInstr())
410  continue;
411  --Def;
412  if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
413  continue;
414  if (Def->modifiesRegister(Reg1, TRI))
415  Defs |= 1;
416  if (Def->modifiesRegister(Reg2, TRI))
417  Defs |= 2;
418  }
419  return countPopulation(Defs);
420 }
421 
422 bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
423  if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
424  return false;
425 
426  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
427 
428  Register PhysReg = VRM->getPhys(Reg);
429 
430  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
431  return false;
432 
433  for (auto U : MRI->use_nodbg_operands(Reg)) {
434  if (U.isImplicit())
435  return false;
436  const MachineInstr *UseInst = U.getParent();
437  if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
438  return false;
439  }
440 
441  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
442  if (TRI->hasVGPRs(RC))
443  return true;
444 
445  unsigned Size = TRI->getRegSizeInBits(*RC);
446  if (Size > 32)
447  PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
448 
449  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
450 }
451 
452 unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
453  unsigned UsedBanks) const {
454  unsigned Size = countPopulation(Mask);
455  unsigned FreeBanks = 0;
456  unsigned Bank = findFirstSet(Mask);
457 
458  UsedBanks &= ~Mask;
459 
460  // Find free VGPR banks
461  if ((Mask & VGPR_BANK_MASK) && (Size < NUM_VGPR_BANKS)) {
462  for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
463  if (Bank == I)
464  continue;
465  unsigned NewMask = ((1 << Size) - 1) << I;
466  NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
467  if (!(UsedBanks & NewMask))
468  FreeBanks |= 1 << I;
469  }
470  return FreeBanks;
471  }
472 
473  // Find free SGPR banks
474  // SGPR tuples must be aligned, so step is size in banks it
475  // crosses.
476  Bank -= SGPR_BANK_OFFSET;
477  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
478  if (Bank == I)
479  continue;
480  unsigned NewMask = ((1 << Size) - 1) << I;
481  NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
482  if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
483  FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
484  }
485 
486  return FreeBanks;
487 }
488 
489 unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
490  unsigned SubReg,
491  unsigned Mask,
492  unsigned UsedBanks) const {
493  if (!isReassignable(Reg))
494  return 0;
495 
496  unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
497 
498  unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
499  if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
500  unsigned Shift = countTrailingZeros(LM);
501  if (Shift >= NUM_VGPR_BANKS)
502  return 0;
503  unsigned VB = FreeBanks & VGPR_BANK_MASK;
504  FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
506  } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
507  unsigned Shift = countTrailingZeros(LM) >> 1;
508  if (Shift >= NUM_SGPR_BANKS)
509  return 0;
510  unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
511  FreeBanks = ((SB >> Shift) | (SB << (NUM_SGPR_BANKS - Shift))) &
513  FreeBanks <<= SGPR_BANK_OFFSET;
514  }
515 
516  LLVM_DEBUG(if (FreeBanks) {
517  dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
518  << " to banks: "; dumpFreeBanks(FreeBanks);
519  dbgs() << '\n'; });
520 
521  return FreeBanks;
522 }
523 
524 void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
525  unsigned UsedBanks,
526  unsigned StallCycles) {
527  LLVM_DEBUG(MI.dump());
528 
529  if (!StallCycles)
530  return;
531 
532  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
533 
534  for (unsigned I = 0, E = OperandMasks.size(); I + 1 < E; ++I) {
535  for (unsigned J = I + 1; J != E; ++J) {
536  if (!(OperandMasks[I].Mask & OperandMasks[J].Mask))
537  continue;
538 
539  unsigned Reg1 = OperandMasks[I].Reg;
540  unsigned Reg2 = OperandMasks[J].Reg;
541  unsigned SubReg1 = OperandMasks[I].SubReg;
542  unsigned SubReg2 = OperandMasks[J].SubReg;
543  unsigned Mask1 = OperandMasks[I].Mask;
544  unsigned Mask2 = OperandMasks[J].Mask;
545  unsigned Size1 = countPopulation(Mask1);
546  unsigned Size2 = countPopulation(Mask2);
547 
548  LLVM_DEBUG(dbgs() << "Conflicting operands: " << printReg(Reg1, SubReg1) <<
549  " and " << printReg(Reg2, SubReg2) << '\n');
550 
551  unsigned Weight = getOperandGatherWeight(MI, Reg1, Reg2, StallCycles);
552  Weight += MLI->getLoopDepth(MI.getParent()) * 10;
553 
554  LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
555 
556  unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
557  unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
558  if (FreeBanks1)
559  Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight
560  + ((Size2 > Size1) ? 1 : 0)));
561  if (FreeBanks2)
562  Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight
563  + ((Size1 > Size2) ? 1 : 0)));
564  }
565  }
566 }
567 
568 unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
569  unsigned Reg, int Bank,
570  bool Collect) {
571  unsigned TotalStallCycles = 0;
572  unsigned UsedBanks = 0;
574 
575  for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
576  if (MI.isBundle())
577  continue;
578  if (!Visited.insert(&MI).second)
579  continue;
580  unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
581  TotalStallCycles += StallCycles;
582  if (Collect)
583  collectCandidates(MI, UsedBanks, StallCycles);
584  }
585 
586  return TotalStallCycles;
587 }
588 
589 unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
590  unsigned Bank) const {
591  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
592  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
593  : MaxNumSGPRs;
594  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
595  : AMDGPU::SGPR0);
596 
597  for (unsigned Reg : RC->getRegisters()) {
598  // Check occupancy limit.
599  if (TRI->isSubRegisterEq(Reg, MaxReg))
600  break;
601 
602  if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
603  continue;
604 
605  for (unsigned I = 0; CSRegs[I]; ++I)
606  if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
607  !LRM->isPhysRegUsed(CSRegs[I]))
608  return AMDGPU::NoRegister;
609 
610  LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
611 
612  if (!LRM->checkInterference(LI, Reg))
613  return Reg;
614  }
615 
616  return AMDGPU::NoRegister;
617 }
618 
619 unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
620  if (!LIS->hasInterval(C.Reg))
621  return 0;
622 
623  LiveInterval &LI = LIS->getInterval(C.Reg);
624  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
625  LI.dump());
626 
627  // For each candidate bank walk all instructions in the range of live
628  // interval and check if replacing the register with one belonging to
629  // the candidate bank reduces conflicts.
630 
631  unsigned OrigStalls = computeStallCycles(C.Reg);
632  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
633  if (!OrigStalls)
634  return 0;
635 
636  struct BankStall {
637  BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
638  bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
639  unsigned Bank;
640  unsigned Stalls;
641  };
642  SmallVector<BankStall, 8> BankStalls;
643 
644  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
645  if (C.FreeBanks & (1 << Bank)) {
646  LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
647  unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
648  if (Stalls < OrigStalls) {
649  LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
650  << Stalls << '\n');
651  BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
652  }
653  }
654  }
655  std::sort(BankStalls.begin(), BankStalls.end());
656 
657  Register OrigReg = VRM->getPhys(C.Reg);
658  LRM->unassign(LI);
659  while (!BankStalls.empty()) {
660  BankStall BS = BankStalls.pop_back_val();
661  unsigned Reg = scavengeReg(LI, BS.Bank);
662  if (Reg == AMDGPU::NoRegister) {
663  LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
664  << '\n');
665  continue;
666  }
667  LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
668  << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
669  << " in bank " << printBank(BS.Bank) << '\n');
670 
671  LRM->assign(LI, Reg);
672 
673  LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
674 
675  return OrigStalls - BS.Stalls;
676  }
677  LRM->assign(LI, OrigReg);
678 
679  return 0;
680 }
681 
682 unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
683  bool Collect) {
684  unsigned TotalStallCycles = 0;
685 
686  for (MachineBasicBlock &MBB : MF) {
687 
688  LLVM_DEBUG(if (Collect) {
689  if (MBB.getName().empty()) dbgs() << "bb." << MBB.getNumber();
690  else dbgs() << MBB.getName(); dbgs() << ":\n";
691  });
692 
693  for (MachineInstr &MI : MBB.instrs()) {
694  if (MI.isBundle())
695  continue; // we analyze the instructions inside the bundle individually
696 
697  unsigned UsedBanks = 0;
698  unsigned StallCycles = analyzeInst(MI, UsedBanks);
699 
700  if (Collect)
701  collectCandidates(MI, UsedBanks, StallCycles);
702 
703  TotalStallCycles += StallCycles;
704  }
705 
706  LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
707  }
708 
709  return TotalStallCycles;
710 }
711 
712 void GCNRegBankReassign::removeCandidates(unsigned Reg) {
713  Candidates.remove_if([Reg, this](const Candidate& C) {
714  return C.MI->readsRegister(Reg, TRI);
715  });
716 }
717 
718 bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
719  unsigned OriginalCycles,
720  unsigned CyclesSaved) {
721  unsigned StallCycles = collectCandidates(MF, false);
722  LLVM_DEBUG(dbgs() << "=== After the pass " << StallCycles
723  << " stall cycles left\n");
724  return StallCycles + CyclesSaved == OriginalCycles;
725 }
726 
727 bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
728  ST = &MF.getSubtarget<GCNSubtarget>();
729  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
730  return false;
731 
732  MRI = &MF.getRegInfo();
733  TRI = ST->getRegisterInfo();
734  MLI = &getAnalysis<MachineLoopInfo>();
735  VRM = &getAnalysis<VirtRegMap>();
736  LRM = &getAnalysis<LiveRegMatrix>();
737  LIS = &getAnalysis<LiveIntervals>();
738 
740  unsigned Occupancy = MFI->getOccupancy();
741  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
742  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
743  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
744  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
745 
746  CSRegs = MRI->getCalleeSavedRegs();
747 
748  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
749  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
750 
751  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
752  << '\n');
753 
754  unsigned StallCycles = collectCandidates(MF);
755  NumStallsDetected += StallCycles;
756 
757  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
758  "function " << MF.getName() << '\n');
759 
760  Candidates.sort();
761 
762  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
763  for (auto C : Candidates) C.dump(this);
764  dbgs() << "\n\n");
765 
766  unsigned CyclesSaved = 0;
767  while (!Candidates.empty()) {
768  Candidate C = Candidates.back();
769  unsigned LocalCyclesSaved = tryReassign(C);
770  CyclesSaved += LocalCyclesSaved;
771 
772  if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
773  report_fatal_error("RegBank reassign stall cycles verification failed.");
774 
775  Candidates.pop_back();
776  if (LocalCyclesSaved) {
777  removeCandidates(C.Reg);
778  computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
779  Candidates.sort();
780 
781  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
782  for (auto C : Candidates)
783  C.dump(this);
784  dbgs() << "\n\n");
785  }
786  }
787  NumStallsRecovered += CyclesSaved;
788 
789  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
790  << " cycles saved in function " << MF.getName() << '\n');
791 
792  Candidates.clear();
793 
794  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
795  report_fatal_error("RegBank reassign stall cycles verification failed.");
796 
797  RegsUsed.clear();
798 
799  return CyclesSaved > 0;
800 }
void resize(unsigned N, bool t=false)
resize - Grow or shrink the bitvector.
Definition: BitVector.h:371
uint64_t CallInst * C
BitVector & set()
Definition: BitVector.h:397
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
instr_iterator instr_begin()
const unsigned reg
Definition: LiveInterval.h:708
AMDGPU specific subclass of TargetSubtarget.
bool isAllocatable(unsigned PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn&#39;t been...
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:139
This class represents lattice values for constants.
Definition: AllocatorList.h:23
InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg)
Check for interference before assigning VirtReg to PhysReg.
INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign", false, false) INITIALIZE_PASS_END(GCNRegBankReassign
static bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:63
LiveInterval - This class represents the liveness of a register, or stack slot.
Definition: LiveInterval.h:679
iterator_range< use_nodbg_iterator > use_nodbg_operands(unsigned Reg) const
unsigned Reg
bool test(unsigned Idx) const
Definition: BitVector.h:501
unsigned getLoopDepth(const MachineBasicBlock *BB) const
Return the loop nesting level of the specified block.
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
unsigned countTrailingZeros(T Val, ZeroBehavior ZB=ZB_Width)
Count number of 0&#39;s from the least significant bit to the most stopping at the first 1...
Definition: MathExtras.h:156
AnalysisUsage & addRequired()
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:50
void clear()
clear - Removes all bits from the bitvector. Does not change capacity.
Definition: BitVector.h:366
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
#define SGPR_BANK_OFFSET
unsigned SubReg
#define VGPR_BANK_MASK
bool isPhysRegUsed(unsigned PhysReg) const
Returns true if the given PhysReg has any live intervals assigned.
bool hasInterval(Register Reg) const
bool isBundle() const
void assign(LiveInterval &VirtReg, unsigned PhysReg)
Assign VirtReg to PhysReg.
#define SGPR_BANK_SHIFTED_MASK
void dump() const
Definition: Pass.cpp:134
bool isAssignedReg(Register virtReg) const
returns true if the specified virtual register is not mapped to a stack slot or rematerialized.
Definition: VirtRegMap.h:155
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition: MCRegister.h:19
#define NUM_VGPR_BANKS
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
#define P(N)
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:432
iterator_range< SmallVectorImpl< MCPhysReg >::const_iterator > getRegisters() const
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define NUM_BANKS
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
static void push(SmallVectorImpl< uint64_t > &R, StringRef Str)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static cl::opt< unsigned > VerifyStallCycles("amdgpu-verify-regbanks-reassign", cl::desc("Verify stall cycles in the regbanks reassign pass"), cl::value_desc("0|1|2"), cl::init(0), cl::Hidden)
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:134
bool hasVGPRs(const TargetRegisterClass *RC) const
Represent the analysis usage information of a pass.
BitVector & reset()
Definition: BitVector.h:438
bool hasRegisterBanking() const
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
self_iterator getIterator()
Definition: ilist_node.h:81
LiveInterval & getInterval(Register Reg)
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn&#39;t already there.
Definition: SmallSet.h:180
bool isCopy() const
void unassign(LiveInterval &VirtReg)
Unassign VirtReg from its PhysReg.
size_t size() const
Definition: SmallVector.h:52
T findFirstSet(T Val, ZeroBehavior ZB=ZB_Max)
Get the index of the first set bit starting from the least significant bit.
Definition: MathExtras.h:239
bool isDebugInstr() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
iterator_range< mop_iterator > explicit_uses()
Definition: MachineInstr.h:516
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1095
#define DEBUG_TYPE
void initializeGCNRegBankReassignPass(PassRegistry &)
Iterator for intrusive lists based on ilist_node.
unsigned countPopulation(T Value)
Count the number of set bits in a value.
Definition: MathExtras.h:556
bool hasAGPRs(const TargetRegisterClass *RC) const
bool isDebugValue() const
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
GCN RegBank Reassign
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:374
MachineInstr * getUniqueVRegDef(unsigned Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, unsigned Reg) const
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
void setPreservesAll()
Set by analyses that do not transform their input at all.
#define NUM_SGPR_BANKS
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:63
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Register getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition: VirtRegMap.h:102
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:55
#define I(x, y, z)
Definition: MD5.cpp:58
size_type size() const
size - Returns the number of bits in this bitvector.
Definition: BitVector.h:169
uint32_t Size
Definition: Profile.cpp:46
char & GCNRegBankReassignID
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool operator<(int64_t V1, const APSInt &V2)
Definition: APSInt.h:343
#define SGPR_BANK_MASK
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:69
const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
std::underlying_type< E >::type Mask()
Get a bitmask with 1s in all places up to the high-order bit of E&#39;s largest value.
Definition: BitmaskEnum.h:80
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
Register getReg() const
getReg - Returns the register number.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition: Printable.h:37
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(unsigned Reg) const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
const SIRegisterInfo * getRegisterInfo() const override