R600MachineScheduler.cpp (LLVM 6.0.0svn)
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/IR/LegacyPassManager.h"

using namespace llvm;

#define DEBUG_TYPE "machine-scheduler"

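// Note: initialize() resets the per-region scheduling state. OccupedSlotsMask
// starts at 31 (all four channel bits plus the Trans bit set), so the first
// call to pickAlu() begins by opening a fresh instruction group, and the
// per-kind InstKindLimit values bound how many instructions a clause may emit
// before a clause switch is considered.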
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

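// The 248 below appears to approximate the pool of GPRs shared by all
// wavefronts resident on a SIMD, so dividing it by the per-wavefront GPR
// requirement gives an occupancy limit. For example, with a requirement of
// 4 GPRs, at most 248 / 4 = 62 wavefronts can be kept in flight.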
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert (GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

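// A rough worked example of the clause-switch heuristic used below: with
// ~500 cycles of TEX latency and ~8 cycles per ALU instruction, the number of
// wavefronts needed to hide a fetch is 500 / (AluFetchRatio * 8), i.e.
// 62.5 / AluFetchRatio. If the region has about 5 ALU instructions per fetch,
// NeededWF is ~12; when that exceeds the wavefront count allowed by the fetch
// clause's estimated GPR needs, AllowSwitchFromAlu is set and the scheduler
// switches to flushing the pending fetches.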
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // check if we might want to switch current clause type
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by AMD Accelerated Parallel Processing
    // OpenCL Programming Guide :
    // The approx. number of WF that allows TEX inst to hide ALU inst is :
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    float ALUFetchRationEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
      // We assume the local GPR requirements to be "dominated" by the
      // requirement of the TEX clause (which consumes 128 bits regs); ALU inst
      // before and after TEX are indeed likely to consume or generate values
      // from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
      // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
      // (TODO : use RegisterPressure)
      // If we are going to use too many GPR, we flush Fetch instruction to
      // lower register pressure on 128 bits regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // try to pick ALU
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE \n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

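// Note on the clause-size accounting in schedNode(): a full AluT_XYZW
// instruction counts as four emitted entries (it fills every channel of the
// group), discarded copies count as none, and any other ALU instruction counts
// as one plus one more for each ALU_LITERAL_X operand it carries, since
// literal operands also take up encoding space in the ALU clause.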
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount ++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }


  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

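// Copies whose source is a physical register are kept out of the normal
// queues: releaseBottomNode() stores them in PhysicalRegCopy, and pickNode()
// only falls back to them when pickAlu() has nothing else to offer.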
static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, we can schedule one as soon as it's ready
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);

}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

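// getAluKind() classifies an ALU SUnit by the slot(s) it may occupy:
// trans-only instructions, whole-group instructions (interpolation, DOT_4,
// vector/cube/reduction ops, GROUP_BARRIER), LDS operations pinned to X,
// results already tied to a channel through a sub-register index or an
// X/Y/Z/W register class, and finally AluAny for anything that can go into
// any slot.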
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case AMDGPU::PRED_X:
    return AluPredX;
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL, don't consider it in scheduling
      return AluDiscarded;
    }
  default:
    break;
  }

  // Does the instruction take a whole IG ?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction() ?
  if(TII->isVector(*MI) ||
     TII->isCubeOp(MI->getOpcode()) ||
     TII->isReductionOp(MI->getOpcode()) ||
     MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel ?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already member of a X/Y/Z/W class ?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::PRED_X:
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

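// PopInst() walks the queue from the back (the most recently released nodes,
// since scheduling proceeds bottom-up) and returns the first candidate that
// still fits the constant-read limitations of the instruction group being
// built; with AnyALU set (used when filling the trans slot) vector-only
// instructions are rejected as well.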
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
      InstructionsGroupCandidate.pop_back();
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert (OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == R600Subtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

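// AssignSlot() steers the register allocator toward the channel that was just
// chosen: the destination register's class is constrained to the matching
// R600_TReg32_{X,Y,Z,W} class so the value ends up in that channel.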
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrains the regclass of DestReg to assign it to Slot
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}

SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

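// pickAlu() fills one VLIW instruction group at a time, bottom-up: PRED_X and
// discardable copies claim the whole group, a full-width AluT_XYZW instruction
// claims the four channel slots, the trans slot (bit 16 of OccupedSlotsMask)
// is tried next on VLIW5 parts, the remaining channels are filled from W down
// to X, and PrepareNextSlot() opens a new group once nothing else fits.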
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom up scheduling : predX must come first
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccuped = OccupedSlotsMask & 16;
    if (!TransSlotOccuped && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.resize(AQ.size() - 1);
  }
  return SU;
}
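
// For reference, a strategy like this is plugged into the generic machine
// scheduler through a MachineSchedRegistry entry; a minimal sketch of the
// usual pattern (the actual factory lives elsewhere in the target, so the
// names below are illustrative only):
//
//   static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
//     return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
//   }
//
//   static MachineSchedRegistry R600SchedRegistry("r600",
//       "Run R600's custom scheduler", createR600MachineScheduler);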