LLVM 23.0.0git
AMDGPUCoExecSchedStrategy.cpp
Go to the documentation of this file.
1//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/Support/Debug.h"
16
17using namespace llvm;
18
19#define DEBUG_TYPE "machine-scheduler"
20
21namespace {
22
23// Used to disable post-RA scheduling with function level granularity.
24class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
25public:
26 explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
27 : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}
28
29 // Do nothing.
30 void schedule() override {}
31};
32
33} // namespace
34
36 // pickOnlyChoice() releases pending instructions and checks for new hazards.
37 SUnit *OnlyChoice = Zone.pickOnlyChoice();
38 if (!Zone.Pending.empty())
39 return nullptr;
40
41 return OnlyChoice;
42}
43
52
55 unsigned NumRegionInstrs) {
59 "coexec scheduler only supports top-down scheduling");
60 RegionPolicy.OnlyTopDown = true;
61 RegionPolicy.OnlyBottomUp = false;
62}
63
65 // Coexecution scheduling strategy is only done top-down to support new
66 // resource balancing heuristics.
67 RegionPolicy.OnlyTopDown = true;
68 RegionPolicy.OnlyBottomUp = false;
69
71}
72
74 assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
75 "coexec scheduler only supports top-down scheduling");
76
77 if (DAG->top() == DAG->bottom()) {
78 assert(Top.Available.empty() && Top.Pending.empty() &&
79 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
80 return nullptr;
81 }
82
83 bool PickedPending = false;
84 SUnit *SU = nullptr;
85 do {
86 PickedPending = false;
87 SU = pickOnlyChoice(Top);
88 if (!SU) {
89 CandPolicy NoPolicy;
90 TopCand.reset(NoPolicy);
91 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
92 PickedPending, /*IsBottomUp=*/false);
93 assert(TopCand.Reason != NoCand && "failed to find a candidate");
94 SU = TopCand.SU;
95 }
96 IsTopNode = true;
97 } while (SU->isScheduled);
98
99 if (PickedPending) {
100 unsigned ReadyCycle = SU->TopReadyCycle;
101 unsigned CurrentCycle = Top.getCurrCycle();
102 if (ReadyCycle > CurrentCycle)
103 Top.bumpCycle(ReadyCycle);
104
105 // checkHazard() does not expose the exact cycle where the hazard clears.
106 while (Top.checkHazard(SU))
107 Top.bumpCycle(Top.getCurrCycle() + 1);
108
109 Top.releasePending();
110 }
111
112 if (SU->isTopReady())
113 Top.removeReady(SU);
114 if (SU->isBottomReady())
115 Bot.removeReady(SU);
116
117 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
118 << *SU->getInstr());
119
120 assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
121 return SU;
122}
123
125 SchedBoundary &Zone, const CandPolicy &ZonePolicy,
126 const RegPressureTracker &RPTracker, SchedCandidate &Cand,
127 bool &PickedPending, bool IsBottomUp) {
128 assert(Zone.isTop() && "coexec scheduler only supports top boundary");
129 assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
130
131 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
133 unsigned SGPRPressure = 0;
134 unsigned VGPRPressure = 0;
135 PickedPending = false;
136 if (DAG->isTrackingPressure()) {
137 if (!useGCNTrackers()) {
138 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
139 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
140 } else {
141 SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
142 VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
143 }
144 }
145
146 auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
147 for (SUnit *SU : Q) {
148 SchedCandidate TryCand(ZonePolicy);
149 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
150 VGPRPressure, IsBottomUp);
151 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
152 tryCandidate(Cand, TryCand, ZoneArg);
153 if (TryCand.Reason != NoCand) {
154 if (TryCand.ResDelta == SchedResourceDelta())
155 TryCand.initResourceDelta(Zone.DAG, SchedModel);
156 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
157 PickedPending = FromPending;
158 Cand.setBest(TryCand);
159 } else {
160 printCandidateDecision(TryCand, Cand);
161 }
162 }
163 };
164
165 LLVM_DEBUG(dbgs() << "Available Q:\n");
166 EvaluateQueue(Zone.Available, /*FromPending=*/false);
167
168 LLVM_DEBUG(dbgs() << "Pending Q:\n");
169 EvaluateQueue(Zone.Pending, /*FromPending=*/true);
170}
171
173 SchedCandidate &TryCand,
174 SchedBoundary *Zone) const {
175 // Initialize the candidate if needed.
176 if (!Cand.isValid()) {
177 TryCand.Reason = FirstValid;
178 return true;
179 }
180
181 // Bias PhysReg Defs and copies to their uses and defined respectively.
182 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
183 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
184 return TryCand.Reason != NoCand;
185
186 // Avoid exceeding the target's limit.
187 if (DAG->isTrackingPressure() &&
188 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
189 RegExcess, TRI, DAG->MF))
190 return TryCand.Reason != NoCand;
191
192 // We only compare a subset of features when comparing nodes between
193 // Top and Bottom boundary. Some properties are simply incomparable, in many
194 // other instances we should only override the other boundary if something
195 // is a clear good pick on one boundary. Skip heuristics that are more
196 // "tie-breaking" in nature.
197 bool SameBoundary = Zone != nullptr;
198 if (SameBoundary) {
199 // For loops that are acyclic path limited, aggressively schedule for
200 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
201 // heuristics to take precedence.
202 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
203 tryLatency(TryCand, Cand, *Zone))
204 return TryCand.Reason != NoCand;
205
206 // Otherwise compare candidates by the stall they would introduce if
207 // scheduled in the current cycle.
208 if (tryEffectiveStall(Cand, TryCand, *Zone))
209 return TryCand.Reason != NoCand;
210 }
211
212 // Keep clustered nodes together to encourage downstream peephole
213 // optimizations which may reduce resource requirements.
214 //
215 // This is a best effort to set things up for a post-RA pass. Optimizations
216 // like generating loads of multiple registers should ideally be done within
217 // the scheduler pass by combining the loads during DAG postprocessing.
218 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
219 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
220 bool CandIsClusterSucc =
221 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
222 bool TryCandIsClusterSucc =
223 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
224
225 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
226 Cluster))
227 return TryCand.Reason != NoCand;
228
229 if (SameBoundary) {
230 // Weak edges are for clustering and other constraints.
231 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
232 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
233 return TryCand.Reason != NoCand;
234 }
235
236 // Avoid increasing the max pressure of the entire region.
237 if (DAG->isTrackingPressure() &&
238 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
239 Cand, RegMax, TRI, DAG->MF))
240 return TryCand.Reason != NoCand;
241
242 if (SameBoundary) {
243 // Avoid critical resource consumption and balance the schedule.
246 TryCand, Cand, ResourceReduce))
247 return TryCand.Reason != NoCand;
249 Cand.ResDelta.DemandedResources, TryCand, Cand,
251 return TryCand.Reason != NoCand;
252
253 // Avoid serializing long latency dependence chains.
254 // For acyclic path limited loops, latency was already checked above.
255 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
256 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
257 return TryCand.Reason != NoCand;
258
259 // Fall through to original instruction order.
260 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
261 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
262 TryCand.Reason = NodeOrder;
263 return true;
264 }
265 }
266
267 return false;
268}
269
271 SchedCandidate &TryCand,
272 SchedBoundary &Zone) const {
273 // Treat structural and latency stalls as a single scheduling cost for the
274 // current cycle.
275 struct StallCosts {
276 unsigned Ready = 0;
277 unsigned Structural = 0;
278 unsigned Latency = 0;
279 unsigned Effective = 0;
280 };
281
282 unsigned CurrCycle = Zone.getCurrCycle();
283 auto GetStallCosts = [&](SUnit *SU) {
284 unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
285 StallCosts Costs;
286 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
287 Costs.Structural = getStructuralStallCycles(Zone, SU);
288 Costs.Latency = Zone.getLatencyStallCycles(SU);
289 Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
290 return Costs;
291 };
292
293 StallCosts TryCosts = GetStallCosts(TryCand.SU);
294 StallCosts CandCosts = GetStallCosts(Cand.SU);
295
296 LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
297 dbgs() << "Effective stalls: try=" << TryCosts.Effective
298 << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
299 << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
300 << " (ready=" << CandCosts.Ready
301 << ", struct=" << CandCosts.Structural
302 << ", lat=" << CandCosts.Latency << ")\n";
303 });
304
305 return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
306}
307
310 LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
311 << C->MF->getName() << '\n');
312 return new GCNScheduleDAGMILive(
313 C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
314}
315
318 LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
319 << C->MF->getName() << '\n');
320 return new GCNNoopPostScheduleDAG(C);
321}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
Coexecution-focused scheduling strategy for AMDGPU.
#define LLVM_DEBUG(...)
Definition Debug.h:114
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const TargetRegisterInfo * TRI
SchedCandidate TopCand
Candidate last picked from Top boundary.
ScheduleDAGMILive * DAG
MachineInstrBundleIterator< MachineInstr > iterator
virtual void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs)
Optionally override the per-region scheduling policy.
Helpers for implementing custom MachineSchedStrategy classes.
Track the current register pressure at some position in the instruction stream, and remember the high...
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
Scheduling unit. This is a node in the scheduling DAG.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
bool isBottomReady() const
bool isTopReady() const
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI cl::opt< MISched::Direction > PreRADirection
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...