LLVM 20.0.0git
R600MachineScheduler.cpp
Go to the documentation of this file.
1//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// R600 Machine Scheduler interface
11//
12//===----------------------------------------------------------------------===//
13
16#include "R600Subtarget.h"
17
18using namespace llvm;
19
20#define DEBUG_TYPE "machine-scheduler"
21
23 assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
24 DAG = static_cast<ScheduleDAGMILive*>(dag);
25 const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
26 TII = static_cast<const R600InstrInfo*>(DAG->TII);
27 TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
28 VLIW5 = !ST.hasCaymanISA();
29 MRI = &DAG->MRI;
30 CurInstKind = IDOther;
31 CurEmitted = 0;
32 OccupiedSlotsMask = 31;
33 InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
34 InstKindLimit[IDOther] = 32;
35 InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
36 AluInstCount = 0;
37 FetchInstCount = 0;
38}
39
40void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
41 std::vector<SUnit *> &QDst)
42{
43 llvm::append_range(QDst, QSrc);
44 QSrc.clear();
45}
46
47static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
48 assert (GPRCount && "GPRCount cannot be 0");
49 return 248 / GPRCount;
50}
51
53 SUnit *SU = nullptr;
54 NextInstKind = IDOther;
55
56 IsTopNode = false;
57
58 // check if we might want to switch current clause type
59 bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
60 (Available[CurInstKind].empty());
61 bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
62 (!Available[IDFetch].empty() || !Available[IDOther].empty());
63
64 if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
65 // We use the heuristic provided by AMD Accelerated Parallel Processing
66 // OpenCL Programming Guide :
67 // The approx. number of WF that allows TEX inst to hide ALU inst is :
68 // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
69 float ALUFetchRationEstimate =
70 (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
71 (FetchInstCount + Available[IDFetch].size());
72 if (ALUFetchRationEstimate == 0) {
73 AllowSwitchFromAlu = true;
74 } else {
75 unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
76 LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
77 // We assume the local GPR requirements to be "dominated" by the requirement
78 // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
79 // after TEX are indeed likely to consume or generate values from/for the
80 // TEX clause.
81 // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
82 // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
83 // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
84 // (TODO : use RegisterPressure)
85 // If we are going too use too many GPR, we flush Fetch instruction to lower
86 // register pressure on 128 bits regs.
87 unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
88 if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
89 AllowSwitchFromAlu = true;
90 }
91 }
92
93 if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
94 (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
95 // try to pick ALU
96 SU = pickAlu();
97 if (!SU && !PhysicalRegCopy.empty()) {
98 SU = PhysicalRegCopy.front();
99 PhysicalRegCopy.erase(PhysicalRegCopy.begin());
100 }
101 if (SU) {
102 if (CurEmitted >= InstKindLimit[IDAlu])
103 CurEmitted = 0;
104 NextInstKind = IDAlu;
105 }
106 }
107
108 if (!SU) {
109 // try to pick FETCH
110 SU = pickOther(IDFetch);
111 if (SU)
112 NextInstKind = IDFetch;
113 }
114
115 // try to pick other
116 if (!SU) {
117 SU = pickOther(IDOther);
118 if (SU)
119 NextInstKind = IDOther;
120 }
121
122 LLVM_DEBUG(if (SU) {
123 dbgs() << " ** Pick node **\n";
124 DAG->dumpNode(*SU);
125 } else {
126 dbgs() << "NO NODE \n";
127 for (const SUnit &S : DAG->SUnits)
128 if (!S.isScheduled)
129 DAG->dumpNode(S);
130 });
131
132 return SU;
133}
134
135void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
136 if (NextInstKind != CurInstKind) {
137 LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
138 if (NextInstKind != IDAlu)
139 OccupiedSlotsMask |= 31;
140 CurEmitted = 0;
141 CurInstKind = NextInstKind;
142 }
143
144 if (CurInstKind == IDAlu) {
145 AluInstCount ++;
146 switch (getAluKind(SU)) {
147 case AluT_XYZW:
148 CurEmitted += 4;
149 break;
150 case AluDiscarded:
151 break;
152 default: {
153 ++CurEmitted;
155 E = SU->getInstr()->operands_end(); It != E; ++It) {
156 MachineOperand &MO = *It;
157 if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
158 ++CurEmitted;
159 }
160 }
161 }
162 } else {
163 ++CurEmitted;
164 }
165
166 LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
167
168 if (CurInstKind != IDFetch) {
169 MoveUnits(Pending[IDFetch], Available[IDFetch]);
170 } else
171 FetchInstCount++;
172}
173
174static bool
176 if (MI->getOpcode() != R600::COPY)
177 return false;
178
179 return !MI->getOperand(1).getReg().isVirtual();
180}
181
183 LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
184}
185
187 LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
188 if (isPhysicalRegCopy(SU->getInstr())) {
189 PhysicalRegCopy.push_back(SU);
190 return;
191 }
192
193 int IK = getInstKind(SU);
194
195 // There is no export clause, we can schedule one as soon as its ready
196 if (IK == IDOther)
197 Available[IDOther].push_back(SU);
198 else
199 Pending[IK].push_back(SU);
200
201}
202
203bool R600SchedStrategy::regBelongsToClass(Register Reg,
204 const TargetRegisterClass *RC) const {
205 if (!Reg.isVirtual())
206 return RC->contains(Reg);
207 return MRI->getRegClass(Reg) == RC;
208}
209
210R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
211 MachineInstr *MI = SU->getInstr();
212
213 if (TII->isTransOnly(*MI))
214 return AluTrans;
215
216 switch (MI->getOpcode()) {
217 case R600::PRED_X:
218 return AluPredX;
219 case R600::INTERP_PAIR_XY:
220 case R600::INTERP_PAIR_ZW:
221 case R600::INTERP_VEC_LOAD:
222 case R600::DOT_4:
223 return AluT_XYZW;
224 case R600::COPY:
225 if (MI->getOperand(1).isUndef()) {
226 // MI will become a KILL, don't considers it in scheduling
227 return AluDiscarded;
228 }
229 break;
230 default:
231 break;
232 }
233
234 // Does the instruction take a whole IG ?
235 // XXX: Is it possible to add a helper function in R600InstrInfo that can
236 // be used here and in R600PacketizerList::isSoloInstruction() ?
237 if(TII->isVector(*MI) ||
238 TII->isCubeOp(MI->getOpcode()) ||
239 TII->isReductionOp(MI->getOpcode()) ||
240 MI->getOpcode() == R600::GROUP_BARRIER) {
241 return AluT_XYZW;
242 }
243
244 if (TII->isLDSInstr(MI->getOpcode())) {
245 return AluT_X;
246 }
247
248 // Is the result already assigned to a channel ?
249 unsigned DestSubReg = MI->getOperand(0).getSubReg();
250 switch (DestSubReg) {
251 case R600::sub0:
252 return AluT_X;
253 case R600::sub1:
254 return AluT_Y;
255 case R600::sub2:
256 return AluT_Z;
257 case R600::sub3:
258 return AluT_W;
259 default:
260 break;
261 }
262
263 // Is the result already member of a X/Y/Z/W class ?
264 Register DestReg = MI->getOperand(0).getReg();
265 if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
266 regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
267 return AluT_X;
268 if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
269 return AluT_Y;
270 if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
271 return AluT_Z;
272 if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
273 return AluT_W;
274 if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
275 return AluT_XYZW;
276
277 // LDS src registers cannot be used in the Trans slot.
278 if (TII->readsLDSSrcReg(*MI))
279 return AluT_XYZW;
280
281 return AluAny;
282}
283
284int R600SchedStrategy::getInstKind(SUnit* SU) {
285 int Opcode = SU->getInstr()->getOpcode();
286
287 if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
288 return IDFetch;
289
290 if (TII->isALUInstr(Opcode)) {
291 return IDAlu;
292 }
293
294 switch (Opcode) {
295 case R600::PRED_X:
296 case R600::COPY:
297 case R600::CONST_COPY:
298 case R600::INTERP_PAIR_XY:
299 case R600::INTERP_PAIR_ZW:
300 case R600::INTERP_VEC_LOAD:
301 case R600::DOT_4:
302 return IDAlu;
303 default:
304 return IDOther;
305 }
306}
307
308SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
309 if (Q.empty())
310 return nullptr;
311 for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
312 It != E; ++It) {
313 SUnit *SU = *It;
314 InstructionsGroupCandidate.push_back(SU->getInstr());
315 if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
316 (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
317 InstructionsGroupCandidate.pop_back();
318 Q.erase((It + 1).base());
319 return SU;
320 }
321 InstructionsGroupCandidate.pop_back();
322 }
323 return nullptr;
324}
325
326void R600SchedStrategy::LoadAlu() {
327 std::vector<SUnit *> &QSrc = Pending[IDAlu];
328 for (SUnit *SU : QSrc) {
329 AluKind AK = getAluKind(SU);
330 AvailableAlus[AK].push_back(SU);
331 }
332 QSrc.clear();
333}
334
335void R600SchedStrategy::PrepareNextSlot() {
336 LLVM_DEBUG(dbgs() << "New Slot\n");
337 assert(OccupiedSlotsMask && "Slot wasn't filled");
338 OccupiedSlotsMask = 0;
339 // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
340 // OccupiedSlotsMask |= 16;
341 InstructionsGroupCandidate.clear();
342 LoadAlu();
343}
344
345void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
346 int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
347 if (DstIndex == -1) {
348 return;
349 }
350 Register DestReg = MI->getOperand(DstIndex).getReg();
351 // PressureRegister crashes if an operand is def and used in the same inst
352 // and we try to constraint its regclass
353 for (const MachineOperand &MO : MI->all_uses())
354 if (MO.getReg() == DestReg)
355 return;
356 // Constrains the regclass of DestReg to assign it to Slot
357 switch (Slot) {
358 case 0:
359 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
360 break;
361 case 1:
362 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
363 break;
364 case 2:
365 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
366 break;
367 case 3:
368 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
369 break;
370 }
371}
372
373SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
374 static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
375 SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
376 if (SlotedSU)
377 return SlotedSU;
378 SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
379 if (UnslotedSU)
380 AssignSlot(UnslotedSU->getInstr(), Slot);
381 return UnslotedSU;
382}
383
384unsigned R600SchedStrategy::AvailablesAluCount() const {
385 return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
386 AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
387 AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
388 AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
389 AvailableAlus[AluPredX].size();
390}
391
392SUnit* R600SchedStrategy::pickAlu() {
393 while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
394 if (!OccupiedSlotsMask) {
395 // Bottom up scheduling : predX must comes first
396 if (!AvailableAlus[AluPredX].empty()) {
397 OccupiedSlotsMask |= 31;
398 return PopInst(AvailableAlus[AluPredX], false);
399 }
400 // Flush physical reg copies (RA will discard them)
401 if (!AvailableAlus[AluDiscarded].empty()) {
402 OccupiedSlotsMask |= 31;
403 return PopInst(AvailableAlus[AluDiscarded], false);
404 }
405 // If there is a T_XYZW alu available, use it
406 if (!AvailableAlus[AluT_XYZW].empty()) {
407 OccupiedSlotsMask |= 15;
408 return PopInst(AvailableAlus[AluT_XYZW], false);
409 }
410 }
411 bool TransSlotOccupied = OccupiedSlotsMask & 16;
412 if (!TransSlotOccupied && VLIW5) {
413 if (!AvailableAlus[AluTrans].empty()) {
414 OccupiedSlotsMask |= 16;
415 return PopInst(AvailableAlus[AluTrans], false);
416 }
417 SUnit *SU = AttemptFillSlot(3, true);
418 if (SU) {
419 OccupiedSlotsMask |= 16;
420 return SU;
421 }
422 }
423 for (int Chan = 3; Chan > -1; --Chan) {
424 bool isOccupied = OccupiedSlotsMask & (1 << Chan);
425 if (!isOccupied) {
426 SUnit *SU = AttemptFillSlot(Chan, false);
427 if (SU) {
428 OccupiedSlotsMask |= (1 << Chan);
429 InstructionsGroupCandidate.push_back(SU->getInstr());
430 return SU;
431 }
432 }
433 }
434 PrepareNextSlot();
435 }
436 return nullptr;
437}
438
439SUnit* R600SchedStrategy::pickOther(int QID) {
440 SUnit *SU = nullptr;
441 std::vector<SUnit *> &AQ = Available[QID];
442
443 if (AQ.empty()) {
444 MoveUnits(Pending[QID], AQ);
445 }
446 if (!AQ.empty()) {
447 SU = AQ.back();
448 AQ.pop_back();
449 }
450 return SU;
451}
#define LLVM_DEBUG(...)
Definition: Debug.h:106
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Provides R600 specific target descriptions.
static unsigned getWFCountLimitedByGPR(unsigned GPRCount)
static bool isPhysicalRegCopy(MachineInstr *MI)
R600 Machine Scheduler interface.
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
Definition: MachineInstr.h:69
mop_iterator operands_begin()
Definition: MachineInstr.h:685
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
mop_iterator operands_end()
Definition: MachineInstr.h:686
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
bool usesVertexCache(unsigned Opcode) const
bool usesTextureCache(unsigned Opcode) const
bool fitsConstReadLimitations(const std::vector< MachineInstr * > &) const
An instruction group can only access 2 channel pair (either [XY] or [ZW]) from KCache bank on R700+.
bool isVector(const MachineInstr &MI) const
Vector instructions are instructions that must fill all instruction slots within an instruction group...
bool isTransOnly(unsigned Opcode) const
bool isReductionOp(unsigned opcode) const
bool isCubeOp(unsigned opcode) const
bool isLDSInstr(unsigned Opcode) const
bool readsLDSSrcReg(const MachineInstr &MI) const
bool isALUInstr(unsigned Opcode) const
bool isVectorOnly(unsigned Opcode) const
int getOperandIdx(const MachineInstr &MI, unsigned Op) const
Get the index of Op in the MachineInstr.
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void releaseBottomNode(SUnit *SU) override
When all successor dependencies have been resolved, free this node for bottom-up scheduling.
void releaseTopNode(SUnit *SU) override
When all predecessor dependencies have been resolved, free this node for top-down scheduling.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isScheduled
True once scheduled.
Definition: ScheduleDAG.h:296
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
void dumpNode(const SUnit &SU) const override
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2115
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163