//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
//
//===----------------------------------------------------------------------===//

#include "R600MachineScheduler.h"
#include "AMDGPUSubtarget.h"
#include "R600InstrInfo.h"
#include "llvm/Pass.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "misched"

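// Set up per-region state before scheduling: cache the DAG and target info,
// configure the per-clause instruction limits, and reset the counters.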
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  const AMDGPUSubtarget &ST = DAG->MF.getSubtarget<AMDGPUSubtarget>();
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  VLIW5 = !ST.hasCaymanISA();
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
  AluInstCount = 0;
  FetchInstCount = 0;
}

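// Append the whole contents of QSrc to QDst and empty QSrc.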
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst)
{
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

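// How many wavefronts can be in flight when each one needs GPRCount GPRs.
// The constant 248 is presumably the number of physical GPRs usable per SIMD
// on this hardware.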
static
unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

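// Pick the next node to schedule (bottom-up). Decides whether to keep
// emitting instructions of the current clause type or to switch, using an
// ALU/fetch ratio heuristic to judge when fetch latency can still be hidden.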
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = nullptr;
  NextInstKind = IDOther;

  IsTopNode = false;

  // Check if we might want to switch current clause type.
  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // We use the heuristic provided by the AMD Accelerated Parallel Processing
    // OpenCL Programming Guide:
    // The approximate number of WFs that allows TEX instructions to hide ALU
    // instructions is:
    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
    // Note: this is an integer division, so a ratio below 1 yields 0.
    float ALUFetchRatioEstimate =
        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
        (FetchInstCount + Available[IDFetch].size());
    if (ALUFetchRatioEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRatioEstimate;
      DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // We assume the local GPR requirements to be "dominated" by the
      // requirements of the TEX clause (which consumes 128-bit regs); ALU
      // instructions before and after TEX are indeed likely to consume or
      // generate values from/for the TEX clause.
      // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause.
      // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
      // one GPR) or TmXYZW = TnXYZW (need 2 GPRs).
      // (TODO: use RegisterPressure)
      // If we are going to use too many GPRs, we flush Fetch instructions to
      // lower register pressure on 128-bit regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
    // Try to pick an ALU instruction.
    SU = pickAlu();
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // Try to pick a FETCH instruction.
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Try to pick any other instruction.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE \n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

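// Update clause bookkeeping after SU has been scheduled: track clause type
// switches and count how many instruction slots the node occupies.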
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    AluInstCount++;
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  } else
    FetchInstCount++;
}

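// A COPY is a physical register copy if its source operand is a physical
// (non-virtual) register.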
static bool
isPhysicalRegCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;

  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
}

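// This strategy schedules bottom-up only, so top releases just emit a trace.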
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
}

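// Route a newly ready node to the right queue: physical register copies are
// kept aside so they can be emitted as soon as possible.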
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  int IK = getInstKind(SU);

  // There is no export clause, so we can schedule one as soon as it's ready.
  if (IK == IDOther)
    Available[IDOther].push_back(SU);
  else
    Pending[IK].push_back(SU);
}

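// Check whether Reg belongs to RC, handling physical and virtual registers
// differently.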
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

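// Classify an ALU node by the slot(s) it must occupy: a single named channel
// (X/Y/Z/W), the Trans slot, a whole instruction group, or any slot.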
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  if (TII->isTransOnly(MI))
    return AluTrans;

  switch (MI->getOpcode()) {
  case AMDGPU::PRED_X:
    return AluPredX;
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL, don't consider it in scheduling.
      return AluDiscarded;
    }
    // Fall through.
  default:
    break;
  }

  // Does the instruction take a whole IG?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction()?
  if (TII->isVector(*MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()) ||
      MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
    return AluT_XYZW;
  }

  if (TII->isLDSInstr(MI->getOpcode())) {
    return AluT_X;
  }

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(MI))
    return AluT_XYZW;

  return AluAny;
}

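// Bucket a node into one of the clause kinds: IDFetch for texture/vertex
// cache users, IDAlu for ALU-clause instructions, IDOther otherwise.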
int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::PRED_X:
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT_4:
    return IDAlu;
  default:
    return IDOther;
  }
}

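// Pop the most recently released unit from Q that still fits the constant
// read limitations of the current instruction group (and, when AnyALU is set
// for the Trans slot, is not vector-only). Returns nullptr if none qualifies.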
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  if (Q.empty())
    return nullptr;
  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
    ) {
      InstructionsGroupCandidate.pop_back();
      // Convert the reverse iterator to the forward iterator of the same
      // element before erasing.
      Q.erase((It + 1).base());
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return nullptr;
}

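// Classify everything pending in the ALU queue and move it into the
// per-kind AvailableAlus buckets.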
void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

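// Start a new instruction group: reset the slot mask, drop the candidate
// list, and reload newly available ALU instructions.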
void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
//    OccupedSlotsMask |= 16;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

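// Constrain the destination register class of MI so that the instruction is
// assigned to channel Slot (X/Y/Z/W).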
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
  if (DstIndex == -1) {
    return;
  }
  unsigned DestReg = MI->getOperand(DstIndex).getReg();
  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constrain its regclass.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() &&
        MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg to assign it to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}

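// Try to fill channel Slot: prefer an instruction already bound to that
// channel, otherwise take an unconstrained one and bind it via AssignSlot.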
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;
  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}

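// Total number of ALU instructions currently available across all buckets.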
unsigned R600SchedStrategy::AvailablesAluCount() const {
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
      AvailableAlus[AluPredX].size();
}

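// Pick the next ALU instruction, filling the current instruction group slot
// by slot: PRED_X and discarded copies first, then whole-group instructions,
// the Trans slot (VLIW5 only), and finally channels W down to X.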
SUnit* R600SchedStrategy::pickAlu() {
  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    if (!OccupedSlotsMask) {
      // Bottom-up scheduling: PRED_X must come first.
      if (!AvailableAlus[AluPredX].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluPredX], false);
      }
      // Flush physical reg copies (RA will discard them).
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask |= 31;
        return PopInst(AvailableAlus[AluDiscarded], false);
      }
      // If there is a T_XYZW alu available, use it.
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask |= 15;
        return PopInst(AvailableAlus[AluT_XYZW], false);
      }
    }
    bool TransSlotOccupied = OccupedSlotsMask & 16;
    if (!TransSlotOccupied && VLIW5) {
      if (!AvailableAlus[AluTrans].empty()) {
        OccupedSlotsMask |= 16;
        return PopInst(AvailableAlus[AluTrans], false);
      }
      SUnit *SU = AttemptFillSlot(3, true);
      if (SU) {
        OccupedSlotsMask |= 16;
        return SU;
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan, false);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return nullptr;
}

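// Pick the next instruction from the given non-ALU queue, refilling it from
// the pending queue when empty. Takes the most recently released unit.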
SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = nullptr;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = AQ.back();
    AQ.resize(AQ.size() - 1);
  }
  return SU;
}