R600MachineScheduler.cpp (LLVM 7.0.0svn)
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
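  // Note: starting with all five slot bits set makes the first pickAlu() call
  // fall through to PrepareNextSlot(), which loads the pending ALU
  // instructions and satisfies its "Slot wasn't filled" assertion.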
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert (GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}
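
// Illustrative arithmetic for the helper above: with its fixed budget of 248
// GPRs, a clause needing a hypothetical 16 GPRs per wavefront leaves room for
// 248 / 16 = 15 concurrently resident wavefronts.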

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approx. number of WF that allows TEX inst to hide ALU inst is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW
      // (need one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
      // (TODO: use RegisterPressure)
      // If we are about to use too many GPRs, we flush Fetch instructions to
      // lower register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }
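  // Walk-through of the heuristic above, with hypothetical counts: 100 ALU
  // instructions against 4 fetches gives a ratio of 25, so NeededWF =
  // 62.5 / 25 = 2 wavefronts suffice to hide TEX latency. The 4 pending
  // fetches imply 2 * 4 = 8 GPRs, which still permits 248 / 8 = 31
  // wavefronts; since 2 <= 31, the ALU clause keeps going.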

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    SU->dump(DAG);
  } else {
    dbgs() << "NO NODE \n";
    for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
      const SUnit &S = DAG->SUnits[i];
      if (!S.isScheduled)
        S.dump(DAG);
    }
  });

  return SU;
}

void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
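      // Literal constants take up slots of their own in an ALU clause, so the
      // loop below also counts ALU_LITERAL_X operands toward CurEmitted.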
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != R600::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, so we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case R600::PRED_X:
    return AluPredX;
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return AluT_XYZW;
  case R600::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == R600::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case R600::PRED_X:
  case R600::COPY:
  case R600::CONST_COPY:
  case R600::INTERP_PAIR_XY:
  case R600::INTERP_PAIR_ZW:
  case R600::INTERP_VEC_LOAD:
  case R600::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
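      // (It + 1).base() converts the reverse_iterator into the forward
      // iterator that refers to the same element, which vector::erase needs.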
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  LLVM_DEBUG(dbgs() << "New Slot\n");
  assert (OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

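// Note on AssignSlot below: a slot-agnostic (AluAny) instruction picked for a
// given slot gets its destination register class narrowed to the matching
// X/Y/Z/W bank; for example, an instruction placed in slot 1 has its dest
// constrained to R600_TReg32_Y.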
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg so it is assigned to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

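// Note for pickAlu below: OccupedSlotsMask uses bits 0-3 for the X/Y/Z/W
// vector slots and bit 4 for the Trans slot, so 15 marks all vector slots
// occupied, 16 the Trans slot, and 31 the whole instruction group.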
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.pop_back();
  }
  return SU;
}