LLVM 20.0.0git
SystemZHazardRecognizer.cpp
Go to the documentation of this file.
1//=-- SystemZHazardRecognizer.cpp - SystemZ Hazard Recognizer ---*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines a hazard recognizer for the SystemZ scheduler.
10//
11// This class is used by the SystemZ scheduling strategy to maintain
12// the state during scheduling, and provide cost functions for
13// scheduling candidates. This includes:
14//
15// * Decoder grouping. A decoder group can maximally hold 3 uops, and
16// instructions that always begin a new group should be scheduled when
17// the current decoder group is empty.
18// * Processor resources usage. It is beneficial to balance the use of
19// resources.
20//
21// A goal is to consider all instructions, also those outside of any
22// scheduling region. Such instructions are "advanced" past and include
23// single instructions before a scheduling region, branches etc.
24//
25// A block that has only one predecessor continues scheduling with that
26// predecessor's state (which may be updated by emitting branches).
27//
28// ===---------------------------------------------------------------------===//
29
31#include "llvm/ADT/Statistic.h"
32
33using namespace llvm;
34
35#define DEBUG_TYPE "machine-scheduler"
36
37// This is the limit of processor resource usage at which the
38// scheduler should try to look for other instructions (not using the
39// critical resource).
// Hidden integer command-line option "procres-cost-lim", default 8.
// EmitInstruction() compares a resource's usage counter against this value
// when deciding whether that resource becomes the critical one.
40static cl::opt<int> ProcResCostLim("procres-cost-lim", cl::Hidden,
41 cl::desc("The OOO window for processor "
42 "resources during scheduling."),
43 cl::init(8));
44
45unsigned SystemZHazardRecognizer::
46getNumDecoderSlots(SUnit *SU) const {
47 const MCSchedClassDesc *SC = getSchedClass(SU);
48 if (!SC->isValid())
49 return 0; // IMPLICIT_DEF / KILL -- will not make impact in output.
50
51 assert((SC->NumMicroOps != 2 || (SC->BeginGroup && !SC->EndGroup)) &&
52 "Only cracked instruction can have 2 uops.");
53 assert((SC->NumMicroOps < 3 || (SC->BeginGroup && SC->EndGroup)) &&
54 "Expanded instructions always group alone.");
55 assert((SC->NumMicroOps < 3 || (SC->NumMicroOps % 3 == 0)) &&
56 "Expanded instructions fill the group(s).");
57
58 return SC->NumMicroOps;
59}
60
61unsigned SystemZHazardRecognizer::getCurrCycleIdx(SUnit *SU) const {
62 unsigned Idx = CurrGroupSize;
63 if (GrpCount % 2)
64 Idx += 3;
65
66 if (SU != nullptr && !fitsIntoCurrentGroup(SU)) {
67 if (Idx == 1 || Idx == 2)
68 Idx = 3;
69 else if (Idx == 4 || Idx == 5)
70 Idx = 0;
71 }
72
73 return Idx;
74}
75
// Report Hazard exactly when SU does not fit into the current decoder group,
// and NoHazard otherwise. The Stalls argument is not consulted.
// NOTE(review): the line carrying the return type and class qualifier of this
// definition is not visible in this listing -- confirm against the real file.
77getHazardType(SUnit *SU, int Stalls) {
78 return (fitsIntoCurrentGroup(SU) ? NoHazard : Hazard);
79}
80
// Body of the state reset: empties the current decoder group, zeroes the
// resource counters and forgets the last FPd op and last emitted instruction.
// NOTE(review): the signature line (presumably SystemZHazardRecognizer::Reset)
// and the line numbered 88 are not visible in this listing -- confirm against
// the real file.
82 CurrGroupSize = 0;
83 CurrGroupHas4RegOps = false;
  // Also resets CriticalResourceIdx to UINT_MAX.
84 clearProcResCounters();
85 GrpCount = 0;
  // UINT_MAX is the value isFPdOpPreferred_distance() treats as "no previous
  // FPd op".
86 LastFPdOpCycleIdx = UINT_MAX;
87 LastEmittedMI = nullptr;
89}
90
91bool
92SystemZHazardRecognizer::fitsIntoCurrentGroup(SUnit *SU) const {
93 const MCSchedClassDesc *SC = getSchedClass(SU);
94 if (!SC->isValid())
95 return true;
96
97 // A cracked instruction only fits into schedule if the current
98 // group is empty.
99 if (SC->BeginGroup)
100 return (CurrGroupSize == 0);
101
102 // An instruction with 4 register operands will not fit in last slot.
103 assert ((CurrGroupSize < 2 || !CurrGroupHas4RegOps) &&
104 "Current decoder group is already full!");
105 if (CurrGroupSize == 2 && has4RegOps(SU->getInstr()))
106 return false;
107
108 // Since a full group is handled immediately in EmitInstruction(),
109 // SU should fit into current group. NumSlots should be 1 or 0,
110 // since it is not a cracked or expanded instruction.
111 assert ((getNumDecoderSlots(SU) <= 1) && (CurrGroupSize < 3) &&
112 "Expected normal instruction to fit in non-full group!");
113
114 return true;
115}
116
117bool SystemZHazardRecognizer::has4RegOps(const MachineInstr *MI) const {
118 const MachineFunction &MF = *MI->getParent()->getParent();
119 const TargetRegisterInfo *TRI = &TII->getRegisterInfo();
120 const MCInstrDesc &MID = MI->getDesc();
121 unsigned Count = 0;
122 for (unsigned OpIdx = 0; OpIdx < MID.getNumOperands(); OpIdx++) {
123 const TargetRegisterClass *RC = TII->getRegClass(MID, OpIdx, TRI, MF);
124 if (RC == nullptr)
125 continue;
126 if (OpIdx >= MID.getNumDefs() &&
127 MID.getOperandConstraint(OpIdx, MCOI::TIED_TO) != -1)
128 continue;
129 Count++;
130 }
131 return Count >= 4;
132}
133
// Close the current decoder group and prepare for the next one: advance the
// group count, clear the per-group state and age the resource counters.
// Does nothing if the current group is empty.
// NOTE(review): the lines numbered 160 and 163 are not visible in this
// listing, so the right-hand side of the threshold comparison below is
// missing here -- confirm against the real file.
134void SystemZHazardRecognizer::nextGroup() {
135 if (CurrGroupSize == 0)
136 return;
137
138 LLVM_DEBUG(dumpCurrGroup("Completed decode group"));
139 LLVM_DEBUG(CurGroupDbg = "";);
140
  // A group size above 3 counts as CurrGroupSize / 3 groups (the assert below
  // requires it to be a multiple of 3); anything else counts as one group.
141 int NumGroups = ((CurrGroupSize > 3) ? (CurrGroupSize / 3) : 1);
142 assert((CurrGroupSize <= 3 || CurrGroupSize % 3 == 0) &&
143 "Current decoder group bad.");
144
145 // Reset counter for next group.
146 CurrGroupSize = 0;
147 CurrGroupHas4RegOps = false;
148
149 GrpCount += ((unsigned) NumGroups);
150
151 // Decrease counters for execution units.
  // Each counter drops by NumGroups but is clamped at zero.
152 for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
153 ProcResourceCounters[i] = ((ProcResourceCounters[i] > NumGroups)
154 ? (ProcResourceCounters[i] - NumGroups)
155 : 0);
156
157 // Clear CriticalResourceIdx if it is now below the threshold.
158 if (CriticalResourceIdx != UINT_MAX &&
159 (ProcResourceCounters[CriticalResourceIdx] <=
161 CriticalResourceIdx = UINT_MAX;
162
164}
165
166#ifndef NDEBUG // Debug output
// Debug helper: print a one-line description of SU to OS -- node number,
// opcode name, the processor resources its scheduling class writes, and
// flags for micro-op count, grouping, unbuffered use and 4 register operands.
// NOTE(review): the signature line and the loop header (lines numbered 167
// and 175) are not visible in this listing -- confirm against the real file.
168 OS << "SU(" << SU->NodeNum << "):";
169 OS << TII->getName(SU->getInstr()->getOpcode());
170
  // Nothing more to print for an invalid scheduling class.
171 const MCSchedClassDesc *SC = getSchedClass(SU);
172 if (!SC->isValid())
173 return;
174
176 PI = SchedModel->getWriteProcResBegin(SC),
177 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
178 const MCProcResourceDesc &PRD =
179 *SchedModel->getProcResource(PI->ProcResourceIdx);
180 std::string FU(PRD.Name);
181 // trim e.g. Z13_FXaUnit -> FXa
  // If the name has no '_', find() returns npos and npos + 1 wraps to 0, so
  // the whole name is kept.
182 FU = FU.substr(FU.find('_') + 1);
183 size_t Pos = FU.find("Unit");
184 if (Pos != std::string::npos)
185 FU.resize(Pos);
186 if (FU == "LS") // LSUnit -> LSU
187 FU = "LSU";
188 OS << "/" << FU;
189
  // Only mention the release cycle when it is more than one.
190 if (PI->ReleaseAtCycle> 1)
191 OS << "(" << PI->ReleaseAtCycle << "cyc)";
192 }
193
194 if (SC->NumMicroOps > 1)
195 OS << "/" << SC->NumMicroOps << "uops";
196 if (SC->BeginGroup && SC->EndGroup)
197 OS << "/GroupsAlone";
198 else if (SC->BeginGroup)
199 OS << "/BeginsGroup";
200 else if (SC->EndGroup)
201 OS << "/EndsGroup";
202 if (SU->isUnbuffered)
203 OS << "/Unbuffered";
204 if (has4RegOps(SU->getInstr()))
205 OS << "/4RegOps";
206}
207
208void SystemZHazardRecognizer::dumpCurrGroup(std::string Msg) const {
209 dbgs() << "++ " << Msg;
210 dbgs() << ": ";
211
212 if (CurGroupDbg.empty())
213 dbgs() << " <empty>\n";
214 else {
215 dbgs() << "{ " << CurGroupDbg << " }";
216 dbgs() << " (" << CurrGroupSize << " decoder slot"
217 << (CurrGroupSize > 1 ? "s":"")
218 << (CurrGroupHas4RegOps ? ", 4RegOps" : "")
219 << ")\n";
220 }
221}
222
// Debug helper: print every non-zero processor resource counter and, if one
// is set, the name of the current critical resource. Prints nothing when all
// counters are zero.
// NOTE(review): the signature line (numbered 223) is not visible in this
// listing -- confirm against the real file.
224 bool any = false;
225
  // First pass: is there anything to print at all?
226 for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
227 if (ProcResourceCounters[i] > 0) {
228 any = true;
229 break;
230 }
231
232 if (!any)
233 return;
234
  // Second pass: print "<name>:<count>" for each counter in use.
235 dbgs() << "++ | Resource counters: ";
236 for (unsigned i = 0; i < SchedModel->getNumProcResourceKinds(); ++i)
237 if (ProcResourceCounters[i] > 0)
238 dbgs() << SchedModel->getProcResource(i)->Name
239 << ":" << ProcResourceCounters[i] << " ";
240 dbgs() << "\n";
241
242 if (CriticalResourceIdx != UINT_MAX)
243 dbgs() << "++ | Critical resource: "
244 << SchedModel->getProcResource(CriticalResourceIdx)->Name
245 << "\n";
246}
247
// Debug helper: print the current decoder group, the current cycle index and,
// if one has been recorded, the cycle index of the last FPd op.
// NOTE(review): the signature line and the line numbered 252 are not visible
// in this listing -- confirm against the real file.
249 dumpCurrGroup("| Current decoder group");
250 dbgs() << "++ | Current cycle index: "
251 << getCurrCycleIdx() << "\n";
253 if (LastFPdOpCycleIdx != UINT_MAX)
254 dbgs() << "++ | Last FPd cycle index: " << LastFPdOpCycleIdx << "\n";
255}
256
257#endif //NDEBUG
258
259void SystemZHazardRecognizer::clearProcResCounters() {
260 ProcResourceCounters.assign(SchedModel->getNumProcResourceKinds(), 0);
261 CriticalResourceIdx = UINT_MAX;
262}
263
264static inline bool isBranchRetTrap(MachineInstr *MI) {
265 return (MI->isBranch() || MI->isReturn() ||
266 MI->getOpcode() == SystemZ::CondTrap);
267}
268
269// Update state with SU as the next scheduled unit.
// Steps: start a new decoder group if SU does not fit, record SU as the last
// emitted instruction, reset everything after a call, charge SU's processor
// resources, remember FPd usage, then add SU to the current group and close
// the group if it is full or ended by SU.
// NOTE(review): the signature, the loop header and parts of some LLVM_DEBUG
// statements (lines numbered 270-271, 282, 296 and 311) are not visible in
// this listing -- confirm against the real file.
272 const MCSchedClassDesc *SC = getSchedClass(SU);
273 LLVM_DEBUG(dbgs() << "++ HazardRecognizer emitting "; dumpSU(SU, dbgs());
274 dbgs() << "\n";);
275 LLVM_DEBUG(dumpCurrGroup("Decode group before emission"););
276
277 // If scheduling an SU that must begin a new decoder group, move on
278 // to next group.
279 if (!fitsIntoCurrentGroup(SU))
280 nextGroup();
281
283 if (CurGroupDbg.length()) cgd << ", "; dumpSU(SU, cgd););
284
285 LastEmittedMI = SU->getInstr();
286
287 // After returning from a call, we don't know much about the state.
288 if (SU->isCall) {
289 LLVM_DEBUG(dbgs() << "++ Clearing state after call.\n";);
290 Reset();
  // Reset() cleared LastEmittedMI, so record the call again.
291 LastEmittedMI = SU->getInstr();
292 return;
293 }
294
295 // Increase counter for execution unit(s).
297 PI = SchedModel->getWriteProcResBegin(SC),
298 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
299 // Don't handle FPd together with the other resources.
300 if (SchedModel->getProcResource(PI->ProcResourceIdx)->BufferSize == 1)
301 continue;
302 int &CurrCounter =
303 ProcResourceCounters[PI->ProcResourceIdx];
304 CurrCounter += PI->ReleaseAtCycle;
305 // Check if this is now the new critical resource.
  // It must exceed ProcResCostLim and either no resource is critical yet or
  // this one is now used more than the current critical resource.
306 if ((CurrCounter > ProcResCostLim) &&
307 (CriticalResourceIdx == UINT_MAX ||
308 (PI->ProcResourceIdx != CriticalResourceIdx &&
309 CurrCounter >
310 ProcResourceCounters[CriticalResourceIdx]))) {
312 dbgs() << "++ New critical resource: "
313 << SchedModel->getProcResource(PI->ProcResourceIdx)->Name
314 << "\n";);
315 CriticalResourceIdx = PI->ProcResourceIdx;
316 }
317 }
318
319 // Make note of an instruction that uses a blocking resource (FPd).
320 if (SU->isUnbuffered) {
321 LastFPdOpCycleIdx = getCurrCycleIdx(SU);
322 LLVM_DEBUG(dbgs() << "++ Last FPd cycle index: " << LastFPdOpCycleIdx
323 << "\n";);
324 }
325
326 // Insert SU into current group by increasing number of slots used
327 // in current group.
328 CurrGroupSize += getNumDecoderSlots(SU);
329 CurrGroupHas4RegOps |= has4RegOps(SU->getInstr());
  // A group containing a 4-register-operand instruction is limited to 2 slots.
330 unsigned GroupLim = (CurrGroupHas4RegOps ? 2 : 3);
331 assert((CurrGroupSize <= GroupLim || CurrGroupSize == getNumDecoderSlots(SU))
332 && "SU does not fit into decoder group!");
333
334 // Check if current group is now full/ended. If so, move on to next
335 // group to be ready to evaluate more candidates.
336 if (CurrGroupSize >= GroupLim || SC->EndGroup)
337 nextGroup();
338}
339
// Return a decoder-grouping cost for SU given the current group: -1 when SU
// fits naturally, a positive value for the slots that would be left unused,
// 1 for a 4-register-operand instruction facing the last slot, 0 otherwise
// (including SUs with an invalid scheduling class).
// NOTE(review): the signature line (numbered 340) is not visible in this
// listing -- confirm against the real file.
341 const MCSchedClassDesc *SC = getSchedClass(SU);
342 if (!SC->isValid())
343 return 0;
344
345 // If SU begins new group, it can either break a current group early
346 // or fit naturally if current group is empty (negative cost).
347 if (SC->BeginGroup) {
348 if (CurrGroupSize)
  // Cost is the number of slots left unused in the group being broken.
349 return 3 - CurrGroupSize;
350 return -1;
351 }
352
353 // Similarly, a group-ending SU may either fit well (last in group), or
354 // end the group prematurely.
355 if (SC->EndGroup) {
356 unsigned resultingGroupSize =
357 (CurrGroupSize + getNumDecoderSlots(SU));
358 if (resultingGroupSize < 3)
359 return (3 - resultingGroupSize);
360 return -1;
361 }
362
363 // An instruction with 4 register operands will not fit in last slot.
364 if (CurrGroupSize == 2 && has4RegOps(SU->getInstr()))
365 return 1;
366
367 // Most instructions can be placed in any decoder slot.
368 return 0;
369}
370
371bool SystemZHazardRecognizer::isFPdOpPreferred_distance(SUnit *SU) const {
372 assert (SU->isUnbuffered);
373 // If this is the first FPd op, it should be scheduled high.
374 if (LastFPdOpCycleIdx == UINT_MAX)
375 return true;
376 // If this is not the first PFd op, it should go into the other side
377 // of the processor to use the other FPd unit there. This should
378 // generally happen if two FPd ops are placed with 2 other
379 // instructions between them (modulo 6).
380 unsigned SUCycleIdx = getCurrCycleIdx(SU);
381 if (LastFPdOpCycleIdx > SUCycleIdx)
382 return ((LastFPdOpCycleIdx - SUCycleIdx) == 3);
383 return ((SUCycleIdx - LastFPdOpCycleIdx) == 3);
384}
385
// Return a processor-resource cost for SU: INT_MIN or INT_MAX for an
// unbuffered (FPd) op depending on its distance to the previous FPd op,
// otherwise the ReleaseAtCycle of SU's use of the critical resource, or 0
// when there is none or the scheduling class is invalid.
// NOTE(review): the first signature line and the loop header (lines numbered
// 386 and 400) are not visible in this listing -- confirm against the real
// file.
387resourcesCost(SUnit *SU) {
388 int Cost = 0;
389
390 const MCSchedClassDesc *SC = getSchedClass(SU);
391 if (!SC->isValid())
392 return 0;
393
394 // For a FPd op, either return min or max value as indicated by the
395 // distance to any prior FPd op.
396 if (SU->isUnbuffered)
397 Cost = (isFPdOpPreferred_distance(SU) ? INT_MIN : INT_MAX);
398 // For other instructions, give a cost to the use of the critical resource.
399 else if (CriticalResourceIdx != UINT_MAX) {
401 PI = SchedModel->getWriteProcResBegin(SC),
402 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI)
403 if (PI->ProcResourceIdx == CriticalResourceIdx)
404 Cost = PI->ReleaseAtCycle;
405 }
406
407 return Cost;
408}
409
// Advance the recognizer state past MI by wrapping it in a temporary SUnit
// and passing that to EmitInstruction(), then close the decoder group where a
// branch requires it. TakenBranch tells whether MI is a taken branch.
// NOTE(review): the first signature line (numbered 410) is not visible in
// this listing -- confirm against the real file.
411 bool TakenBranch) {
412 // Make a temporary SUnit.
413 SUnit SU(MI, 0);
414
415 // Set interesting flags.
416 SU.isCall = MI->isCall();
417
  // Derive the resource flags from the BufferSize of each processor resource
  // the instruction's scheduling class writes.
418 const MCSchedClassDesc *SC = SchedModel->resolveSchedClass(MI);
419 for (const MCWriteProcResEntry &PRE :
420 make_range(SchedModel->getWriteProcResBegin(SC),
421 SchedModel->getWriteProcResEnd(SC))) {
422 switch (SchedModel->getProcResource(PRE.ProcResourceIdx)->BufferSize) {
423 case 0:
424 SU.hasReservedResource = true;
425 break;
426 case 1:
427 SU.isUnbuffered = true;
428 break;
429 default:
430 break;
431 }
432 }
433
  // EmitInstruction() may change CurrGroupSize, so remember the slot MI
  // started in.
434 unsigned GroupSizeBeforeEmit = CurrGroupSize;
435 EmitInstruction(&SU);
436
437 if (!TakenBranch && isBranchRetTrap(MI)) {
438 // NT Branch on second slot ends group.
439 if (GroupSizeBeforeEmit == 1)
440 nextGroup();
441 }
442
  // A taken branch always ends a non-empty group.
443 if (TakenBranch && CurrGroupSize > 0)
444 nextGroup();
445
446 assert ((!MI->isTerminator() || isBranchRetTrap(MI)) &&
447 "Scheduler: unhandled terminator!");
448}
449
// Copy scheduling state from Incoming into this recognizer: current group
// size, resource counters, critical resource, last FPd cycle index and group
// count.
// NOTE(review): the signature line is not visible in this listing. Also,
// CurrGroupHas4RegOps and LastEmittedMI (both reset elsewhere in this file)
// are not copied here -- confirm that is intended.
452 // Current decoder group
453 CurrGroupSize = Incoming->CurrGroupSize;
454 LLVM_DEBUG(CurGroupDbg = Incoming->CurGroupDbg;);
455
456 // Processor resources
457 ProcResourceCounters = Incoming->ProcResourceCounters;
458 CriticalResourceIdx = Incoming->CriticalResourceIdx;
459
460 // FPd
461 LastFPdOpCycleIdx = Incoming->LastFPdOpCycleIdx;
462 GrpCount = Incoming->GrpCount;
463}
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
IRTranslator LLVM IR MI
unsigned const TargetRegisterInfo * TRI
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
static bool isBranchRetTrap(MachineInstr *MI)
static cl::opt< int > ProcResCostLim("procres-cost-lim", cl::Hidden, cl::desc("The OOO window for processor " "resources during scheduling."), cl::init(8))
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
int getOperandConstraint(unsigned OpNum, MCOI::OperandConstraint Constraint) const
Returns the value of the specified operand constraint if it is present.
Definition: MCInstrDesc.h:219
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isCall
Is a function call.
Definition: ScheduleDAG.h:287
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
bool isUnbuffered
Uses an unbuffered resource.
Definition: ScheduleDAG.h:300
bool hasReservedResource
Uses a reserved resource.
Definition: ScheduleDAG.h:301
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
SystemZHazardRecognizer maintains the state for one MBB during scheduling.
int groupingCost(SUnit *SU) const
Return the cost of decoder grouping for SU.
void emitInstruction(MachineInstr *MI, bool TakenBranch=false)
Wrap a non-scheduled instruction in an SU and emit it.
const MCSchedClassDesc * getSchedClass(SUnit *SU) const
Resolves and cache a resolved scheduling class for an SUnit.
void copyState(SystemZHazardRecognizer *Incoming)
Copy counters from end of single predecessor.
void Reset() override
Reset - This callback is invoked when a new block of instructions is about to be schedule.
void dumpSU(SUnit *SU, raw_ostream &OS) const
HazardType getHazardType(SUnit *SU, int Stalls=0) override
getHazardType - Return the hazard type of emitting this node.
void dumpCurrGroup(std::string Msg="") const
int resourcesCost(SUnit *SU)
Return the cost of SU in regards to processor resources usage.
void EmitInstruction(SUnit *SU) override
EmitInstruction - This callback is invoked when an instruction is emitted, to advance the hazard stat...
const SystemZRegisterInfo & getRegisterInfo() const
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const
const MCSchedClassDesc * resolveSchedClass(const MachineInstr *MI) const
Return the MCSchedClassDesc for this instruction.
const MCProcResourceDesc * getProcResource(unsigned PIdx) const
Get a processor resource by ID for convenience.
unsigned getNumProcResourceKinds() const
Get the number of kinds of resources for this target.
ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
InstructionCost Cost
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Define a kind of processor resource that will be modeled by the scheduler.
Definition: MCSchedule.h:31
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition: MCSchedule.h:118
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition: MCSchedule.h:63