GCNSchedStrategy.cpp
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
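// For orientation, the scheduling stages implemented below (and instantiated
// in GCNScheduleDAGMILive::createSchedStage) are: OccInitialScheduleStage,
// UnclusteredHighRPStage, ClusteredLowOccStage, PreRARematStage,
// ILPInitialScheduleStage and MemoryClauseInitialScheduleStage. Which of them
// actually run depends on the selected strategy and on each stage's
// initGCNSchedStage checks.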
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
34#include "llvm/MC/LaneBitmask.h"
36
37#define DEBUG_TYPE "machine-scheduler"
38
39using namespace llvm;
40
42 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
43 cl::desc("Disable unclustered high register pressure "
44 "reduction scheduling stage."),
45 cl::init(false));
46
48 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
49 cl::desc("Disable clustered low occupancy "
50 "rescheduling for ILP scheduling stage."),
51 cl::init(false));
52
54 "amdgpu-schedule-metric-bias", cl::Hidden,
56 "Sets the bias which adds weight to occupancy vs latency. Set it to "
57 "100 to chase the occupancy only."),
58 cl::init(10));
59
60static cl::opt<bool>
61 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
62 cl::desc("Relax occupancy targets for kernels which are memory "
63 "bound (amdgpu-membound-threshold), or "
64 "Wave Limited (amdgpu-limit-wave-threshold)."),
65 cl::init(false));
66
68 "amdgpu-use-amdgpu-trackers", cl::Hidden,
69 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
70 cl::init(false));
71
72#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
73#define DUMP_MAX_REG_PRESSURE
75 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
76 cl::desc("Print a list of live registers along with their def/uses at the "
77 "point of maximum register pressure before scheduling."),
78 cl::init(false));
79
81 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
82 cl::desc("Print a list of live registers along with their def/uses at the "
83 "point of maximum register pressure after scheduling."),
84 cl::init(false));
85#endif
86
87const unsigned ScheduleMetrics::ScaleFactor = 100;
88
93
96
97 MF = &DAG->MF;
98
99 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
100
102 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
104 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
105
107 // Set the initial TargetOccupancy to the maximum occupancy that we can
108 // achieve for this function. This effectively sets a lower bound on the
109 // 'Critical' register limits in the scheduler.
110 // Allow for lower occupancy targets if the kernel is wave limited or memory
111 // bound and the relaxed occupancy feature is enabled.
115 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
116
117 if (!KnownExcessRP) {
118 VGPRCriticalLimit = std::min(
119 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
121 } else {
122 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
123 // it returns a reasonably small number for targets with lots of VGPRs, such
124 // as GFX10 and GFX11.
125 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
126 "VGPRCriticalLimit calculation method.\n");
127 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
128 unsigned Granule =
129 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
130 unsigned Addressable =
131 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
132 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
133 VGPRBudget = std::max(VGPRBudget, Granule);
134 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
135 }
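 // For illustration only (numbers are assumed, not taken from a real target):
 // with Addressable = 256, TargetOccupancy = 6 and Granule = 4 the alternative
 // computation above gives VGPRBudget = alignDown(256 / 6, 4) =
 // alignDown(42, 4) = 40, which is then clamped to at least the granule and
 // capped by VGPRExcessLimit.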
136
137 // Subtract error margin and bias from register limits and avoid overflow.
142
143 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
144 << ", VGPRExcessLimit = " << VGPRExcessLimit
145 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
146 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
147}
148
149/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
150/// current register pressure.
151///
152/// This works for the common case, but it has a few exceptions that have been
153/// observed through trial and error:
154/// - Explicit physical register operands
155/// - Subregister definitions
156///
157/// In both of those cases, PressureDiff doesn't represent the actual pressure,
158/// and querying LiveIntervals through the RegPressureTracker is needed to get
159/// an accurate value.
160///
161/// We should eventually only use PressureDiff for maximum performance, but this
162/// already allows 80% of SUs to take the fast path without changing scheduling
163/// at all. Further changes would either change scheduling, or require a lot
164/// more logic to recover an accurate pressure estimate from the PressureDiffs.
165static bool canUsePressureDiffs(const SUnit &SU) {
166 if (!SU.isInstr())
167 return false;
168
169 // Cannot use pressure diffs for subregister defs or with physregs; they are
170 // imprecise in both cases.
171 for (const auto &Op : SU.getInstr()->operands()) {
172 if (!Op.isReg() || Op.isImplicit())
173 continue;
174 if (Op.getReg().isPhysical() ||
175 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
176 return false;
177 }
178 return true;
179}
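// For illustration, two operand shapes that force the slow path above (written
// in MIR-like pseudo syntax; the instructions are assumed, not taken from a
// real test):
//   undef %1.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec  ; subregister def
//   $m0 = COPY %2                                            ; explicit physreg
// In both cases PressureDiff is imprecise, so the RegPressureTracker (and thus
// LiveIntervals) is queried instead.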
180
182 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
183 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
184 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
185 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
186 // getDownwardPressure() and getUpwardPressure() make temporary changes to
187 // the tracker, so we need to pass those functions a non-const copy.
188 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
189 if (!GCNTrackers) {
190 AtTop
191 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
192 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
193
194 return;
195 }
196
197 // GCNTrackers
198 Pressure.resize(4, 0);
199 MachineInstr *MI = SU->getInstr();
200 GCNRegPressure NewPressure;
201 if (AtTop) {
202 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
203 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
204 } else {
205 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
206 TempUpwardTracker.recede(*MI);
207 NewPressure = TempUpwardTracker.getPressure();
208 }
209 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
210 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
211 NewPressure.getArchVGPRNum();
212 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
213}
214
216 bool AtTop,
217 const RegPressureTracker &RPTracker,
218 const SIRegisterInfo *SRI,
219 unsigned SGPRPressure,
220 unsigned VGPRPressure, bool IsBottomUp) {
221 Cand.SU = SU;
222 Cand.AtTop = AtTop;
223
224 if (!DAG->isTrackingPressure())
225 return;
226
227 Pressure.clear();
228 MaxPressure.clear();
229
230 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
231 // possible over querying the RegPressureTracker.
232 //
233 // RegPressureTracker will make a lot of LIS queries, which are very
234 // expensive; it is considered a slow function in this context.
235 //
236 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
237 // trivial lookup into an array. It is pretty much free.
238 //
239 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
240 // PressureDiffs.
241 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
242 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
244 } else {
245 // Reserve 4 slots.
246 Pressure.resize(4, 0);
247 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
248 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
249
250 for (const auto &Diff : DAG->getPressureDiff(SU)) {
251 if (!Diff.isValid())
252 continue;
253 // PressureDiffs is always bottom-up so if we're working top-down we need
254 // to invert its sign.
255 Pressure[Diff.getPSet()] +=
256 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
257 }
258
259#ifdef EXPENSIVE_CHECKS
260 std::vector<unsigned> CheckPressure, CheckMaxPressure;
261 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
263 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
264 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
265 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
266 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
267 errs() << "Register Pressure is inaccurate when calculated through "
268 "PressureDiff\n"
269 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
270 << ", expected "
271 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
272 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
273 << ", expected "
274 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
275 report_fatal_error("inaccurate register pressure calculation");
276 }
277#endif
278 }
279
280 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
281 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
282
283 // If two instructions increase the pressure of different register sets
284 // by the same amount, the generic scheduler will prefer to schedule the
285 // instruction that increases the set with the least amount of registers,
286 // which in our case would be SGPRs. This is rarely what we want, so
287 // when we report excess/critical register pressure, we do it either
288 // only for VGPRs or only for SGPRs.
289
290 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
291 const unsigned MaxVGPRPressureInc = 16;
292 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
293 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
294
295 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
296 // to increase the likelihood we don't go over the limits. We should improve
297 // the analysis to look through dependencies to find the path with the least
298 // register pressure.
299
300 // We only need to update the RPDelta for instructions that increase register
301 // pressure. Instructions that decrease or keep reg pressure the same will be
302 // marked as RegExcess in tryCandidate() when they are compared with
303 // instructions that increase the register pressure.
304 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
305 HasHighPressure = true;
306 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
307 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
308 }
309
310 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
311 HasHighPressure = true;
312 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
313 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
314 }
315
316 // Register pressure is considered 'CRITICAL' if it is approaching a value
317 // that would reduce the wave occupancy for the execution unit. When
318 // register pressure is 'CRITICAL', increases in SGPR and VGPR pressure
319 // have the same cost, so we don't need to prefer one over the other.
320
321 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
322 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
323
324 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
325 HasHighPressure = true;
326 if (SGPRDelta > VGPRDelta) {
327 Cand.RPDelta.CriticalMax =
328 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
329 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
330 } else {
331 Cand.RPDelta.CriticalMax =
332 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
333 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
334 }
335 }
336}
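// For illustration, with assumed limits (the real ones come from the subtarget
// and the error-margin/bias adjustment in initialize()): if VGPRExcessLimit is
// 256 and a candidate raises VGPR pressure to 258, RPDelta.Excess is set on
// the VGPR_32 set with a unit increment of 258 - 256 = 2; if SGPRCriticalLimit
// is 102 and SGPR pressure reaches 104, RPDelta.CriticalMax is set on SReg_32
// with an increment of 104 - 102 = 2.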
337
338// This function is mostly cut and pasted from
339// GenericScheduler::pickNodeFromQueue()
341 const CandPolicy &ZonePolicy,
342 const RegPressureTracker &RPTracker,
343 SchedCandidate &Cand,
344 bool IsBottomUp) {
345 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
347 unsigned SGPRPressure = 0;
348 unsigned VGPRPressure = 0;
349 if (DAG->isTrackingPressure()) {
350 if (!GCNTrackers) {
351 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
352 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
353 } else {
354 GCNRPTracker *T = IsBottomUp
355 ? static_cast<GCNRPTracker *>(&UpwardTracker)
356 : static_cast<GCNRPTracker *>(&DownwardTracker);
357 SGPRPressure = T->getPressure().getSGPRNum();
358 VGPRPressure = T->getPressure().getArchVGPRNum();
359 }
360 }
361 ReadyQueue &Q = Zone.Available;
362 for (SUnit *SU : Q) {
363
364 SchedCandidate TryCand(ZonePolicy);
365 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
366 VGPRPressure, IsBottomUp);
367 // Pass SchedBoundary only when comparing nodes from the same boundary.
368 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
369 tryCandidate(Cand, TryCand, ZoneArg);
370 if (TryCand.Reason != NoCand) {
371 // Initialize resource delta if needed in case future heuristics query it.
372 if (TryCand.ResDelta == SchedResourceDelta())
373 TryCand.initResourceDelta(Zone.DAG, SchedModel);
374 Cand.setBest(TryCand);
376 }
377 }
378}
379
380// This function is mostly cut and pasted from
381// GenericScheduler::pickNodeBidirectional()
383 // Schedule as far as possible in the direction of no choice. This is most
384 // efficient, but also provides the best heuristics for CriticalPSets.
385 if (SUnit *SU = Bot.pickOnlyChoice()) {
386 IsTopNode = false;
387 return SU;
388 }
389 if (SUnit *SU = Top.pickOnlyChoice()) {
390 IsTopNode = true;
391 return SU;
392 }
393 // Set the bottom-up policy based on the state of the current bottom zone and
394 // the instructions outside the zone, including the top zone.
395 CandPolicy BotPolicy;
396 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
397 // Set the top-down policy based on the state of the current top zone and
398 // the instructions outside the zone, including the bottom zone.
399 CandPolicy TopPolicy;
400 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
401
402 // See if BotCand is still valid (because we previously scheduled from Top).
403 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
404 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
405 BotCand.Policy != BotPolicy) {
406 BotCand.reset(CandPolicy());
407 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
408 /*IsBottomUp=*/true);
409 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
410 } else {
412#ifndef NDEBUG
413 if (VerifyScheduling) {
414 SchedCandidate TCand;
415 TCand.reset(CandPolicy());
416 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
417 /*IsBottomUp=*/true);
418 assert(TCand.SU == BotCand.SU &&
419 "Last pick result should correspond to re-picking right now");
420 }
421#endif
422 }
423
424 // Check if the top Q has a better candidate.
425 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
426 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
427 TopCand.Policy != TopPolicy) {
428 TopCand.reset(CandPolicy());
429 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
430 /*IsBottomUp=*/false);
431 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
432 } else {
434#ifndef NDEBUG
435 if (VerifyScheduling) {
436 SchedCandidate TCand;
437 TCand.reset(CandPolicy());
438 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
439 /*IsBottomUp=*/false);
440 assert(TCand.SU == TopCand.SU &&
441 "Last pick result should correspond to re-picking right now");
442 }
443#endif
444 }
445
446 // Pick best from BotCand and TopCand.
447 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
448 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
449 SchedCandidate Cand = BotCand;
450 TopCand.Reason = NoCand;
451 tryCandidate(Cand, TopCand, nullptr);
452 if (TopCand.Reason != NoCand) {
453 Cand.setBest(TopCand);
454 }
455 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
456
457 IsTopNode = Cand.AtTop;
458 return Cand.SU;
459}
460
461// This function is mostly cut and pasted from
462// GenericScheduler::pickNode()
464 if (DAG->top() == DAG->bottom()) {
465 assert(Top.Available.empty() && Top.Pending.empty() &&
466 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
467 return nullptr;
468 }
469 SUnit *SU;
470 do {
471 if (RegionPolicy.OnlyTopDown) {
472 SU = Top.pickOnlyChoice();
473 if (!SU) {
474 CandPolicy NoPolicy;
475 TopCand.reset(NoPolicy);
476 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
477 /*IsBottomUp=*/false);
478 assert(TopCand.Reason != NoCand && "failed to find a candidate");
479 SU = TopCand.SU;
480 }
481 IsTopNode = true;
482 } else if (RegionPolicy.OnlyBottomUp) {
483 SU = Bot.pickOnlyChoice();
484 if (!SU) {
485 CandPolicy NoPolicy;
486 BotCand.reset(NoPolicy);
487 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
488 /*IsBottomUp=*/true);
489 assert(BotCand.Reason != NoCand && "failed to find a candidate");
490 SU = BotCand.SU;
491 }
492 IsTopNode = false;
493 } else {
494 SU = pickNodeBidirectional(IsTopNode);
495 }
496 } while (SU->isScheduled);
497
498 if (SU->isTopReady())
499 Top.removeReady(SU);
500 if (SU->isBottomReady())
501 Bot.removeReady(SU);
502
503 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
504 << *SU->getInstr());
505 return SU;
506}
507
508void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
509 if (GCNTrackers) {
510 MachineInstr *MI = SU->getInstr();
511 IsTopNode ? (void)DownwardTracker.advance(MI, false)
512 : UpwardTracker.recede(*MI);
513 }
514
515 return GenericScheduler::schedNode(SU, IsTopNode);
516}
517
522
525 if (!CurrentStage)
526 CurrentStage = SchedStages.begin();
527 else
528 CurrentStage++;
529
530 return CurrentStage != SchedStages.end();
531}
532
535 return std::next(CurrentStage) != SchedStages.end();
536}
537
539 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
540 return *std::next(CurrentStage);
541}
542
552
557
559 SchedCandidate &TryCand,
560 SchedBoundary *Zone) const {
561 // Initialize the candidate if needed.
562 if (!Cand.isValid()) {
563 TryCand.Reason = NodeOrder;
564 return true;
565 }
566
567 // Avoid spilling by exceeding the register limit.
568 if (DAG->isTrackingPressure() &&
569 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
570 RegExcess, TRI, DAG->MF))
571 return TryCand.Reason != NoCand;
572
573 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
574 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
575 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
576 return TryCand.Reason != NoCand;
577
578 bool SameBoundary = Zone != nullptr;
579 if (SameBoundary) {
580 // Prioritize instructions that read unbuffered resources by stall cycles.
581 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
582 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
583 return TryCand.Reason != NoCand;
584
585 // Avoid critical resource consumption and balance the schedule.
588 TryCand, Cand, ResourceReduce))
589 return TryCand.Reason != NoCand;
591 Cand.ResDelta.DemandedResources, TryCand, Cand,
593 return TryCand.Reason != NoCand;
594
595 // Unconditionally try to reduce latency.
596 if (tryLatency(TryCand, Cand, *Zone))
597 return TryCand.Reason != NoCand;
598
599 // Weak edges are for clustering and other constraints.
600 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
601 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
602 return TryCand.Reason != NoCand;
603 }
604
605 // Keep clustered nodes together to encourage downstream peephole
606 // optimizations which may reduce resource requirements.
607 //
608 // This is a best effort to set things up for a post-RA pass. Optimizations
609 // like generating loads of multiple registers should ideally be done within
610 // the scheduler pass by combining the loads during DAG postprocessing.
611 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
612 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
613 bool CandIsClusterSucc =
614 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
615 bool TryCandIsClusterSucc =
616 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
617 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
618 Cluster))
619 return TryCand.Reason != NoCand;
620
621 // Avoid increasing the max critical pressure in the scheduled region.
622 if (DAG->isTrackingPressure() &&
624 TryCand, Cand, RegCritical, TRI, DAG->MF))
625 return TryCand.Reason != NoCand;
626
627 // Avoid increasing the max pressure of the entire region.
628 if (DAG->isTrackingPressure() &&
629 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
630 Cand, RegMax, TRI, DAG->MF))
631 return TryCand.Reason != NoCand;
632
633 if (SameBoundary) {
634 // Fall through to original instruction order.
635 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
636 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
637 TryCand.Reason = NodeOrder;
638 return true;
639 }
640 }
641 return false;
642}
643
649
650/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions
651/// as much as possible. This is achieved by:
652/// 1. Prioritize clustered operations before stall latency heuristic.
653/// 2. Prioritize long-latency-load before stall latency heuristic.
654///
655/// \param Cand provides the policy and current best candidate.
656/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
657/// \param Zone describes the scheduled zone that we are extending, or nullptr
658/// if Cand is from a different zone than TryCand.
659/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
661 SchedCandidate &TryCand,
662 SchedBoundary *Zone) const {
663 // Initialize the candidate if needed.
664 if (!Cand.isValid()) {
665 TryCand.Reason = NodeOrder;
666 return true;
667 }
668
669 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
670 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
671 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
672 return TryCand.Reason != NoCand;
673
674 if (DAG->isTrackingPressure()) {
675 // Avoid exceeding the target's limit.
676 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
677 RegExcess, TRI, DAG->MF))
678 return TryCand.Reason != NoCand;
679
680 // Avoid increasing the max critical pressure in the scheduled region.
682 TryCand, Cand, RegCritical, TRI, DAG->MF))
683 return TryCand.Reason != NoCand;
684 }
685
686 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
687 // get more benefit from clausing these memory instructions.
688 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
689 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
690 bool CandIsClusterSucc =
691 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
692 bool TryCandIsClusterSucc =
693 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
694 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
695 Cluster))
696 return TryCand.Reason != NoCand;
697
698 // We only compare a subset of features when comparing nodes between
699 // Top and Bottom boundary. Some properties are simply incomparable; in many
700 // other instances we should only override the other boundary if something
701 // is a clear good pick on one boundary. Skip heuristics that are more
702 // "tie-breaking" in nature.
703 bool SameBoundary = Zone != nullptr;
704 if (SameBoundary) {
705 // For loops that are acyclic path limited, aggressively schedule for
706 // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
707 // heuristics to take precedence.
708 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
709 tryLatency(TryCand, Cand, *Zone))
710 return TryCand.Reason != NoCand;
711
712 // MaxMemoryClause-specific: Prioritize long latency memory load
713 // instructions in top-bottom order to hide more latency. The mayLoad check
714 // is used to exclude store-like instructions, which we do not want to
715 // schedule too early.
716 bool TryMayLoad =
717 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
718 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
719
720 if (TryMayLoad || CandMayLoad) {
721 bool TryLongLatency =
722 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
723 bool CandLongLatency =
724 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
725
726 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
727 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
728 Cand, Stall))
729 return TryCand.Reason != NoCand;
730 }
731 // Prioritize instructions that read unbuffered resources by stall cycles.
732 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
733 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
734 return TryCand.Reason != NoCand;
735 }
736
737 if (SameBoundary) {
738 // Weak edges are for clustering and other constraints.
739 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
740 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
741 return TryCand.Reason != NoCand;
742 }
743
744 // Avoid increasing the max pressure of the entire region.
745 if (DAG->isTrackingPressure() &&
746 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
747 Cand, RegMax, TRI, DAG->MF))
748 return TryCand.Reason != NoCand;
749
750 if (SameBoundary) {
751 // Avoid critical resource consumption and balance the schedule.
754 TryCand, Cand, ResourceReduce))
755 return TryCand.Reason != NoCand;
757 Cand.ResDelta.DemandedResources, TryCand, Cand,
759 return TryCand.Reason != NoCand;
760
761 // Avoid serializing long latency dependence chains.
762 // For acyclic path limited loops, latency was already checked above.
763 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
764 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
765 return TryCand.Reason != NoCand;
766
767 // Fall through to original instruction order.
768 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
769 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
770 TryCand.Reason = NodeOrder;
771 return true;
772 }
773 }
774
775 return false;
776}
777
779 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
780 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
781 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
782 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
783 RegionLiveOuts(this, /*IsLiveOut=*/true) {
784
785 // We want regions with a single MI to be scheduled so that we can reason
786 // about them correctly during scheduling stages that move MIs between regions
787 // (e.g., rematerialization).
789 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
790 if (RelaxedOcc) {
791 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
792 if (MinOccupancy != StartingOccupancy)
793 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
794 << ".\n");
795 }
796}
797
798std::unique_ptr<GCNSchedStage>
799GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
800 switch (SchedStageID) {
802 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
804 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
806 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
808 return std::make_unique<PreRARematStage>(SchedStageID, *this);
810 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
812 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
813 *this);
814 }
815
816 llvm_unreachable("Unknown SchedStageID.");
817}
818
820 // Collect all scheduling regions. The actual scheduling is performed in
821 // GCNScheduleDAGMILive::finalizeSchedule.
822 Regions.push_back(std::pair(RegionBegin, RegionEnd));
823}
824
826GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
828 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
829 &LiveIns[RegionIdx]);
830 return RPTracker.moveMaxPressure();
831}
832
834 MachineBasicBlock::iterator RegionEnd) {
835 auto REnd = RegionEnd == RegionBegin->getParent()->end()
836 ? std::prev(RegionEnd)
837 : RegionEnd;
838 return &*skipDebugInstructionsBackward(REnd, RegionBegin);
839}
840
841void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
842 const MachineBasicBlock *MBB) {
843 GCNDownwardRPTracker RPTracker(*LIS);
844
845 // If the block has only one successor then the live-ins of that successor
846 // are the live-outs of the current block. We can reuse the calculated live
847 // set if the successor will be sent to scheduling after the current block.
848
849 // However, due to a bug in LiveInterval analysis it may happen that two
850 // predecessors of the same successor block have different lane bitmasks for
851 // a live-out register. Work around that by sticking to a one-to-one
852 // relationship, i.e. one predecessor with one successor block.
853 const MachineBasicBlock *OnlySucc = nullptr;
854 if (MBB->succ_size() == 1) {
855 auto *Candidate = *MBB->succ_begin();
856 if (!Candidate->empty() && Candidate->pred_size() == 1) {
857 SlotIndexes *Ind = LIS->getSlotIndexes();
858 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
859 OnlySucc = Candidate;
860 }
861 }
862
863 // Scheduler sends regions from the end of the block upwards.
864 size_t CurRegion = RegionIdx;
865 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
866 if (Regions[CurRegion].first->getParent() != MBB)
867 break;
868 --CurRegion;
869
870 auto I = MBB->begin();
871 auto LiveInIt = MBBLiveIns.find(MBB);
872 auto &Rgn = Regions[CurRegion];
873 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
874 if (LiveInIt != MBBLiveIns.end()) {
875 auto LiveIn = std::move(LiveInIt->second);
876 RPTracker.reset(*MBB->begin(), &LiveIn);
877 MBBLiveIns.erase(LiveInIt);
878 } else {
879 I = Rgn.first;
880 auto LRS = BBLiveInMap.lookup(NonDbgMI);
881#ifdef EXPENSIVE_CHECKS
882 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
883#endif
884 RPTracker.reset(*I, &LRS);
885 }
886
887 for (;;) {
888 I = RPTracker.getNext();
889
890 if (Regions[CurRegion].first == I || NonDbgMI == I) {
891 LiveIns[CurRegion] = RPTracker.getLiveRegs();
892 RPTracker.clearMaxPressure();
893 }
894
895 if (Regions[CurRegion].second == I) {
896 Pressure[CurRegion] = RPTracker.moveMaxPressure();
897 if (CurRegion-- == RegionIdx)
898 break;
899 auto &Rgn = Regions[CurRegion];
900 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
901 }
902 RPTracker.advanceToNext();
903 RPTracker.advanceBeforeNext();
904 }
905
906 if (OnlySucc) {
907 if (I != MBB->end()) {
908 RPTracker.advanceToNext();
909 RPTracker.advance(MBB->end());
910 }
911 RPTracker.advanceBeforeNext();
912 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
913 }
914}
915
917GCNScheduleDAGMILive::getRegionLiveInMap() const {
918 assert(!Regions.empty());
919 std::vector<MachineInstr *> RegionFirstMIs;
920 RegionFirstMIs.reserve(Regions.size());
921 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
922 RegionFirstMIs.push_back(
923 &*skipDebugInstructionsForward(RegionBegin, RegionEnd));
924
925 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
926}
927
929GCNScheduleDAGMILive::getRegionLiveOutMap() const {
930 assert(!Regions.empty());
931 std::vector<MachineInstr *> RegionLastMIs;
932 RegionLastMIs.reserve(Regions.size());
933 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
934 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
935
936 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
937}
938
940 IdxToInstruction.clear();
941
942 RegionLiveRegMap =
943 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
944 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
945 MachineInstr *RegionKey =
946 IsLiveOut
947 ? getLastMIForRegion(DAG->Regions[I].first, DAG->Regions[I].second)
948 : &*DAG->Regions[I].first;
949 IdxToInstruction[I] = RegionKey;
950 }
951}
952
954 // Start actual scheduling here. This function is called by the base
955 // MachineScheduler after all regions have been recorded by
956 // GCNScheduleDAGMILive::schedule().
957 LiveIns.resize(Regions.size());
958 Pressure.resize(Regions.size());
959 RegionsWithHighRP.resize(Regions.size());
960 RegionsWithExcessRP.resize(Regions.size());
961 RegionsWithIGLPInstrs.resize(Regions.size());
962 RegionsWithHighRP.reset();
963 RegionsWithExcessRP.reset();
964 RegionsWithIGLPInstrs.reset();
965
966 runSchedStages();
967}
968
969void GCNScheduleDAGMILive::runSchedStages() {
970 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
971
972 if (!Regions.empty()) {
973 BBLiveInMap = getRegionLiveInMap();
974 if (GCNTrackers)
975 RegionLiveOuts.buildLiveRegMap();
976 }
977
978#ifdef DUMP_MAX_REG_PRESSURE
982 LIS->dump();
983 }
984#endif
985
986 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
987 while (S.advanceStage()) {
988 auto Stage = createSchedStage(S.getCurrentStage());
989 if (!Stage->initGCNSchedStage())
990 continue;
991
992 for (auto Region : Regions) {
993 RegionBegin = Region.first;
994 RegionEnd = Region.second;
995 // Setup for scheduling the region and check whether it should be skipped.
996 if (!Stage->initGCNRegion()) {
997 Stage->advanceRegion();
998 exitRegion();
999 continue;
1000 }
1001
1002 if (GCNTrackers) {
1003 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1004 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1005 GCNRPTracker::LiveRegSet *RegionLiveIns =
1006 &LiveIns[Stage->getRegionIdx()];
1007
1008 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1009 ->reset(MRI, *RegionLiveIns);
1010 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1011 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1012 Stage->getRegionIdx()));
1013 }
1014
1016 Stage->finalizeGCNRegion();
1017 }
1018
1019 Stage->finalizeGCNSchedStage();
1020 }
1021
1022#ifdef DUMP_MAX_REG_PRESSURE
1026 LIS->dump();
1027 }
1028#endif
1029}
1030
1031#ifndef NDEBUG
1033 switch (StageID) {
1035 OS << "Max Occupancy Initial Schedule";
1036 break;
1038 OS << "Unclustered High Register Pressure Reschedule";
1039 break;
1041 OS << "Clustered Low Occupancy Reschedule";
1042 break;
1044 OS << "Pre-RA Rematerialize";
1045 break;
1047 OS << "Max ILP Initial Schedule";
1048 break;
1050 OS << "Max memory clause Initial Schedule";
1051 break;
1052 }
1053
1054 return OS;
1055}
1056#endif
1057
1061
1063 if (!DAG.LIS)
1064 return false;
1065
1066 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1067 return true;
1068}
1069
1072 return false;
1073
1075 return false;
1076
1077 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1078 return false;
1079
1080 SavedMutations.swap(DAG.Mutations);
1081 DAG.addMutation(
1083
1084 InitialOccupancy = DAG.MinOccupancy;
1085 // Aggressively try to reduce register pressure in the unclustered high RP
1086 // stage. Temporarily increase occupancy target in the region.
1087 S.SGPRLimitBias = S.HighRPSGPRBias;
1088 S.VGPRLimitBias = S.HighRPVGPRBias;
1089 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1090 MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);
1091
1092 LLVM_DEBUG(
1093 dbgs()
1094 << "Retrying function scheduling without clustering. "
1095 "Aggressivly try to reduce register pressure to achieve occupancy "
1096 << DAG.MinOccupancy << ".\n");
1097
1098 return true;
1099}
1100
1103 return false;
1104
1106 return false;
1107
1108 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1109 // been dropped. All regions will have already been scheduled with the ideal
1110 // occupancy targets.
1111 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1112 return false;
1113
1114 LLVM_DEBUG(
1115 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1116 << DAG.MinOccupancy << ".\n");
1117 return true;
1118}
1119
1120/// Allows easy filtering of this stage's debug output.
1121#define REMAT_PREFIX "[PreRARemat] "
1122#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1123
1125 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1126 // regions in between the defs and the region we sank the def to. Will need to be
1127 // fixed if there is another pass after this pass.
1128 assert(!S.hasNextStage());
1129
1130 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1131 return false;
1132
1133 // Before performing any IR modification record the parent region of each MI
1134 // and the parent MBB of each region.
1135 const unsigned NumRegions = DAG.Regions.size();
1136 RegionBB.reserve(NumRegions);
1137 for (unsigned I = 0; I < NumRegions; ++I) {
1138 RegionBoundaries Region = DAG.Regions[I];
1139 for (auto MI = Region.first; MI != Region.second; ++MI)
1140 MIRegion.insert({&*MI, I});
1141 RegionBB.push_back(Region.first->getParent());
1142 }
1143
1144 if (!canIncreaseOccupancyOrReduceSpill())
1145 return false;
1146
1147 // Rematerialize identified instructions and update scheduler's state.
1148 rematerialize();
1149 if (GCNTrackers)
1150 DAG.RegionLiveOuts.buildLiveRegMap();
1151 REMAT_DEBUG({
1152 dbgs() << "Retrying function scheduling with new min. occupancy of "
1153 << AchievedOcc << " from rematerializing (original was "
1154 << DAG.MinOccupancy;
1155 if (TargetOcc)
1156 dbgs() << ", target was " << *TargetOcc;
1157 dbgs() << ")\n";
1158 });
1159
1160 if (AchievedOcc > DAG.MinOccupancy) {
1161 DAG.MinOccupancy = AchievedOcc;
1163 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1164 }
1165 return true;
1166}
1167
1169 DAG.finishBlock();
1170 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1171}
1172
1174 SavedMutations.swap(DAG.Mutations);
1175 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1176 if (DAG.MinOccupancy > InitialOccupancy) {
1178 << " stage successfully increased occupancy to "
1179 << DAG.MinOccupancy << '\n');
1180 }
1181
1183}
1184
1186 // Check whether this new region is also a new block.
1187 if (DAG.RegionBegin->getParent() != CurrentMBB)
1188 setupNewBlock();
1189
1190 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1191 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1192
1193 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1194 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1195 return false;
1196
1197 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1198 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1199 << " " << CurrentMBB->getName()
1200 << "\n From: " << *DAG.begin() << " To: ";
1201 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1202 else dbgs() << "End";
1203 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1204
1205 // Save original instruction order before scheduling for possible revert.
1206 Unsched.clear();
1207 Unsched.reserve(DAG.NumRegionInstrs);
1210 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1211 for (auto &I : DAG) {
1212 Unsched.push_back(&I);
1213 if (SII->isIGLPMutationOnly(I.getOpcode()))
1214 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1215 }
1216 } else {
1217 for (auto &I : DAG)
1218 Unsched.push_back(&I);
1219 }
1220
1221 PressureBefore = DAG.Pressure[RegionIdx];
1222
1223 LLVM_DEBUG(
1224 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1225 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1226 << "Region live-in pressure: "
1227 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1228 << "Region register pressure: " << print(PressureBefore));
1229
1230 S.HasHighPressure = false;
1231 S.KnownExcessRP = isRegionWithExcessRP();
1232
1233 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1235 SavedMutations.clear();
1236 SavedMutations.swap(DAG.Mutations);
1237 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1239 DAG.addMutation(createIGroupLPDAGMutation(
1240 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1242 }
1243
1244 return true;
1245}
1246
1248 // Only reschedule regions that have excess register pressure (i.e. spilling)
1249 // or had minimum occupancy at the beginning of the stage (as long as
1250 // rescheduling of previous regions did not make occupancy drop back down to
1251 // the initial minimum).
1252 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1253 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1254 (DAG.MinOccupancy <= InitialOccupancy ||
1255 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1256 InitialOccupancy))
1257 return false;
1258
1260}
1261
1263 // We may need to reschedule this region if it wasn't rescheduled in the last
1264 // stage, or if we found it was testing critical register pressure limits in
1265 // the unclustered reschedule stage. The latter is because we may not have been
1266 // able to raise the min occupancy in the previous stage so the region may be
1267 // overly constrained even if it was already rescheduled.
1268 if (!DAG.RegionsWithHighRP[RegionIdx])
1269 return false;
1270
1272}
1273
1275 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1276}
1277
1279 if (CurrentMBB)
1280 DAG.finishBlock();
1281
1282 CurrentMBB = DAG.RegionBegin->getParent();
1283 DAG.startBlock(CurrentMBB);
1284 // Get real RP for the region if it hasn't been calculated before. After the
1285 // initial schedule stage real RP will be collected after scheduling.
1289 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1290}
1291
1293 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1294 if (S.HasHighPressure)
1295 DAG.RegionsWithHighRP[RegionIdx] = true;
1296
1297 // Revert scheduling if we have dropped occupancy or there is some other
1298 // reason that the original schedule is better.
1300
1301 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1303 SavedMutations.swap(DAG.Mutations);
1304
1305 DAG.exitRegion();
1306 advanceRegion();
1307}
1308
1310 // Check the results of scheduling.
1311 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1312
1313 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1314 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1315
1316 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1317
1318 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1319 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1320 DAG.Pressure[RegionIdx] = PressureAfter;
1321
1322 // Early out if we have achieved the occupancy target.
1323 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1324 return;
1325 }
1326
1327 unsigned TargetOccupancy = std::min(
1328 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1329 unsigned WavesAfter = std::min(
1330 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1331 unsigned WavesBefore = std::min(
1332 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1333 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1334 << ", after " << WavesAfter << ".\n");
1335
1336 // We may not be able to keep the current target occupancy because of the just
1337 // scheduled region. We might still be able to revert scheduling if the
1338 // occupancy before was higher, or if the current schedule has register
1339 // pressure higher than the excess limits which could lead to more spilling.
1340 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1341
1342 // Allow memory bound functions to drop to 4 waves if not limited by an
1343 // attribute.
1344 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1345 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1346 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1347 << MFI.getMinAllowedOccupancy() << " waves\n");
1348 NewOccupancy = WavesAfter;
1349 }
1350
1351 if (NewOccupancy < DAG.MinOccupancy) {
1352 DAG.MinOccupancy = NewOccupancy;
1353 MFI.limitOccupancy(DAG.MinOccupancy);
1354 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1355 << DAG.MinOccupancy << ".\n");
1356 }
1357 // The maximum number of arch VGPRs on a non-unified register file, or the
1358 // maximum VGPRs + AGPRs in the unified register file case.
1359 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1360 // The maximum number of arch VGPRs for both unified and non-unified register
1361 // files.
1362 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1363 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1364
1365 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1366 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1367 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1368 PressureAfter.getSGPRNum() > MaxSGPRs) {
1369 DAG.RegionsWithHighRP[RegionIdx] = true;
1370 DAG.RegionsWithExcessRP[RegionIdx] = true;
1371 }
1372
1373 // Revert if this region's schedule would cause a drop in occupancy or
1374 // spilling.
1375 if (shouldRevertScheduling(WavesAfter))
1377 else
1378 DAG.Pressure[RegionIdx] = PressureAfter;
1379}
1380
1381unsigned
1382GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1383 DenseMap<unsigned, unsigned> &ReadyCycles,
1384 const TargetSchedModel &SM) {
1385 unsigned ReadyCycle = CurrCycle;
1386 for (auto &D : SU.Preds) {
1387 if (D.isAssignedRegDep()) {
1388 MachineInstr *DefMI = D.getSUnit()->getInstr();
1389 unsigned Latency = SM.computeInstrLatency(DefMI);
1390 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1391 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1392 }
1393 }
1394 ReadyCycles[SU.NodeNum] = ReadyCycle;
1395 return ReadyCycle;
1396}
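// For illustration, with assumed latencies: if an SUnit has two assigned
// register dependences whose defining instructions became ready at cycles 2
// and 5, with computed latencies 4 and 1, its ready cycle is
// max(CurrCycle, 2 + 4, 5 + 1) = max(CurrCycle, 6).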
1397
1398#ifndef NDEBUG
1400 bool operator()(std::pair<MachineInstr *, unsigned> A,
1401 std::pair<MachineInstr *, unsigned> B) const {
1402 return A.second < B.second;
1403 }
1404};
1405
1406static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1407 EarlierIssuingCycle> &ReadyCycles) {
1408 if (ReadyCycles.empty())
1409 return;
1410 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1411 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1412 << " ##################\n# Cycle #\t\t\tInstruction "
1413 " "
1414 " \n";
1415 unsigned IPrev = 1;
1416 for (auto &I : ReadyCycles) {
1417 if (I.second > IPrev + 1)
1418 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1419 << " CYCLES DETECTED ******************************\n\n";
1420 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1421 IPrev = I.second;
1422 }
1423}
1424#endif
1425
1427GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1428#ifndef NDEBUG
1429 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1430 ReadyCyclesSorted;
1431#endif
1432 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1433 unsigned SumBubbles = 0;
1434 DenseMap<unsigned, unsigned> ReadyCycles;
1435 unsigned CurrCycle = 0;
1436 for (auto &SU : InputSchedule) {
1437 unsigned ReadyCycle =
1438 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1439 SumBubbles += ReadyCycle - CurrCycle;
1440#ifndef NDEBUG
1441 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1442#endif
1443 CurrCycle = ++ReadyCycle;
1444 }
1445#ifndef NDEBUG
1446 LLVM_DEBUG(
1447 printScheduleModel(ReadyCyclesSorted);
1448 dbgs() << "\n\t"
1449 << "Metric: "
1450 << (SumBubbles
1451 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1452 : 1)
1453 << "\n\n");
1454#endif
1455
1456 return ScheduleMetrics(CurrCycle, SumBubbles);
1457}
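// For illustration, with assumed numbers: if the walk above ends with
// CurrCycle = 100 and SumBubbles = 20, the metric printed by the LLVM_DEBUG
// block is (20 * ScaleFactor) / 100 = (20 * 100) / 100 = 20; with no bubbles
// it is reported as 1.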
1458
1461#ifndef NDEBUG
1462 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1463 ReadyCyclesSorted;
1464#endif
1465 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1466 unsigned SumBubbles = 0;
1467 DenseMap<unsigned, unsigned> ReadyCycles;
1468 unsigned CurrCycle = 0;
1469 for (auto &MI : DAG) {
1470 SUnit *SU = DAG.getSUnit(&MI);
1471 if (!SU)
1472 continue;
1473 unsigned ReadyCycle =
1474 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1475 SumBubbles += ReadyCycle - CurrCycle;
1476#ifndef NDEBUG
1477 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1478#endif
1479 CurrCycle = ++ReadyCycle;
1480 }
1481#ifndef NDEBUG
1482 LLVM_DEBUG(
1483 printScheduleModel(ReadyCyclesSorted);
1484 dbgs() << "\n\t"
1485 << "Metric: "
1486 << (SumBubbles
1487 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1488 : 1)
1489 << "\n\n");
1490#endif
1491
1492 return ScheduleMetrics(CurrCycle, SumBubbles);
1493}
1494
1495bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1496 if (WavesAfter < DAG.MinOccupancy)
1497 return true;
1498
1499 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1500 if (DAG.MFI.isDynamicVGPREnabled()) {
1501 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1502 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1503 PressureBefore.getVGPRNum(false));
1504 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1505 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1506 PressureAfter.getVGPRNum(false));
1507 if (BlocksAfter > BlocksBefore)
1508 return true;
1509 }
1510
1511 return false;
1512}
1513
1516 return false;
1517
1519 return true;
1520
1521 if (mayCauseSpilling(WavesAfter))
1522 return true;
1523
1524 return false;
1525}
1526
1528 // If RP is not reduced in the unclustered reschedule stage, revert to the
1529 // old schedule.
1530 if ((WavesAfter <=
1531 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1532 mayCauseSpilling(WavesAfter)) ||
1534 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1535 return true;
1536 }
1537
1538 // Do not attempt to relax schedule even more if we are already spilling.
1540 return false;
1541
1542 LLVM_DEBUG(
1543 dbgs()
1544 << "\n\t *** In shouldRevertScheduling ***\n"
1545 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1546 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1547 LLVM_DEBUG(
1548 dbgs()
1549 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1550 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1551 unsigned OldMetric = MBefore.getMetric();
1552 unsigned NewMetric = MAfter.getMetric();
1553 unsigned WavesBefore = std::min(
1554 S.getTargetOccupancy(),
1555 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
1556 unsigned Profit =
1557 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1558 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1559 NewMetric) /
1560 ScheduleMetrics::ScaleFactor;
1561 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1562 << MAfter << "Profit: " << Profit << "\n");
1563 return Profit < ScheduleMetrics::ScaleFactor;
1564}
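// For illustration, with assumed numbers plugged into the Profit computation
// above: WavesBefore = 4, WavesAfter = 5, OldMetric = 20, NewMetric = 30 and
// the default ScheduleMetricBias of 10 give
//   Profit = ((5 * 100) / 4) * ((20 + 10) * 100) / 30 / 100
//          = 125 * 3000 / 30 / 100 = 125,
// so Profit >= ScaleFactor and, other revert conditions aside, the unclustered
// reschedule is kept.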
1565
1568 return false;
1569
1571 return true;
1572
1573 if (mayCauseSpilling(WavesAfter))
1574 return true;
1575
1576 return false;
1577}
1578
1580 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1581 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1582}
1583
1585 if (mayCauseSpilling(WavesAfter))
1586 return true;
1587
1588 return false;
1589}
1590
1592 unsigned WavesAfter) {
1593 return mayCauseSpilling(WavesAfter);
1594}
1595
1596bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1597 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1599 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1600 return true;
1601 }
1602
1603 return false;
1604}
1605
1607 LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
1608 DAG.RegionEnd = DAG.RegionBegin;
1609 int SkippedDebugInstr = 0;
1610 for (MachineInstr *MI : Unsched) {
1611 if (MI->isDebugInstr()) {
1612 ++SkippedDebugInstr;
1613 continue;
1614 }
1615
1616 if (MI->getIterator() != DAG.RegionEnd) {
1617 DAG.BB->splice(DAG.RegionEnd, DAG.BB, MI);
1618 if (!MI->isDebugInstr())
1619 DAG.LIS->handleMove(*MI, true);
1620 }
1621
1622 // Reset read-undef flags and update them later.
1623 for (auto &Op : MI->all_defs())
1624 Op.setIsUndef(false);
1625 RegisterOperands RegOpers;
1626 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1627 if (!MI->isDebugInstr()) {
1628 if (DAG.ShouldTrackLaneMasks) {
1629 // Adjust liveness and add missing dead+read-undef flags.
1630 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1631 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1632 } else {
1633 // Adjust for missing dead-def flags.
1634 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1635 }
1636 }
1637 DAG.RegionEnd = MI->getIterator();
1638 ++DAG.RegionEnd;
1639 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1640 }
1641
1642 // After reverting the schedule, debug instrs will now be at the end of the
1643 // block and RegionEnd will point to the first debug instr. Increment
1644 // RegionEnd past debug instrs to the actual end of the scheduling region.
1645 while (SkippedDebugInstr-- > 0)
1646 ++DAG.RegionEnd;
1647
1648 // If Unsched.front() instruction is a debug instruction, this will actually
1649 // shrink the region since we moved all debug instructions to the end of the
1650 // block. Find the first instruction that is not a debug instruction.
1651 DAG.RegionBegin = Unsched.front()->getIterator();
1652 if (DAG.RegionBegin->isDebugInstr()) {
1653 for (MachineInstr *MI : Unsched) {
1654 if (MI->isDebugInstr())
1655 continue;
1656 DAG.RegionBegin = MI->getIterator();
1657 break;
1658 }
1659 }
1660
1661 // Then move the debug instructions back into their correct place and set
1662 // RegionBegin and RegionEnd if needed.
1663 DAG.placeDebugValues();
1664
1665 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1666}
1667
1668bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
1669 const Function &F = MF.getFunction();
1670
1671 // Maps optimizable regions (i.e., regions at minimum and register-limited
1672 // occupancy, or regions with spilling) to the target RP we would like to
1673 // reach.
1675 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
1676 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
1677 auto ResetTargetRegions = [&]() {
1678 OptRegions.clear();
1679 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1680 const GCNRegPressure &RP = DAG.Pressure[I];
1681 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
1682 if (!Target.satisfied())
1683 OptRegions.insert({I, Target});
1684 }
1685 };
1686
1687 ResetTargetRegions();
1688 if (!OptRegions.empty() || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
1689 // In addition to register usage being above addressable limits, occupancy
1690 // below the minimum is considered like "spilling" as well.
1691 TargetOcc = std::nullopt;
1692 } else {
1693 // There is no spilling and room to improve occupancy; set up "increased
1694 // occupancy targets" for all regions.
1695 TargetOcc = DAG.MinOccupancy + 1;
1696 unsigned VGPRBlockSize =
1697 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1698 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
1699 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
1700 ResetTargetRegions();
1701 }
1702 REMAT_DEBUG({
1703 dbgs() << "Analyzing ";
1704 MF.getFunction().printAsOperand(dbgs(), false);
1705 dbgs() << ": ";
1706 if (OptRegions.empty()) {
1707 dbgs() << "no objective to achieve, occupancy is maximal at "
1708 << MFI.getMaxWavesPerEU();
1709 } else if (!TargetOcc) {
1710 dbgs() << "reduce spilling (minimum target occupancy is "
1711 << MFI.getMinWavesPerEU() << ')';
1712 } else {
1713 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
1714 << TargetOcc;
1715 }
1716 dbgs() << '\n';
1717 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1718 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
1719 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
1720 << '\n';
1721 }
1722 }
1723 });
1724 if (OptRegions.empty())
1725 return false;
1726
1727 // Accounts for a reduction in RP in an optimizable region. Returns whether we
1728 // estimate that we have identified enough rematerialization opportunities to
1729 // achieve our goal, and sets Progress to true when this particular reduction
1730 // in pressure was helpful toward that goal.
1731 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
1732 bool &Progress) -> bool {
1733 GCNRPTarget &Target = OptIt->getSecond();
1734 if (!Target.isSaveBeneficial(Reg))
1735 return false;
1736 Progress = true;
1737 Target.saveReg(Reg, Mask, DAG.MRI);
1738 if (Target.satisfied())
1739 OptRegions.erase(OptIt->getFirst());
1740 return OptRegions.empty();
1741 };
1742
1743 // We need up-to-date live-out info to query live-out register masks in
1744 // regions containing rematerializable instructions.
1745 DAG.RegionLiveOuts.buildLiveRegMap();
1746
1747 // Cache set of registers that are going to be rematerialized.
1748 DenseSet<unsigned> RematRegs;
1749
1750 // Identify rematerializable instructions in the function.
1751 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
1752 auto Region = DAG.Regions[I];
1753 for (auto MI = Region.first; MI != Region.second; ++MI) {
1754 // The instruction must be rematerializable.
1755 MachineInstr &DefMI = *MI;
1756 if (!isReMaterializable(DefMI))
1757 continue;
1758
1759 // We only support rematerializing virtual registers with one definition.
1760 Register Reg = DefMI.getOperand(0).getReg();
1761 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
1762 continue;
1763
1764 // We only care to rematerialize the instruction if it has a single
1765 // non-debug user in a different region. The using MI may not belong to a
1766 // region if it is a lone region terminator.
1767 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
1768 if (!UseMI)
1769 continue;
1770 auto UseRegion = MIRegion.find(UseMI);
1771 if (UseRegion != MIRegion.end() && UseRegion->second == I)
1772 continue;
1773
1774 // Do not rematerialize an instruction if it uses or is used by an
1775 // instruction that we have designated for rematerialization.
1776 // FIXME: Allow for rematerialization chains: this requires 1. updating
1777 // remat points to account for uses that are rematerialized, and 2. either
1778 // rematerializing the candidates in careful ordering, or deferring the
1779 // MBB RP walk until the entire chain has been rematerialized.
1780 if (Rematerializations.contains(UseMI) ||
1781 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
1782 return MO.isReg() && RematRegs.contains(MO.getReg());
1783 }))
1784 continue;
1785
1786 // Do not rematerialize an instruction if it uses registers that aren't
1787 // available at its use. This ensures that we are not extending any live
1788 // range while rematerializing.
1789 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
1790 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
1791 *DAG.TII))
1792 continue;
1793
1794 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
1795 RematInstruction &Remat =
1796 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
1797
1798 bool RematUseful = false;
1799 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
1800 // Optimistically consider that moving the instruction out of its
1801 // defining region will reduce RP in the latter; this assumes that
1802 // maximum RP in the region is reached somewhere between the defining
1803 // instruction and the end of the region.
1804 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
1805 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
1806 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
1807 return true;
1808 }
1809
1810 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
1811 // We are only collecting regions in which the register is a live-in
1812 // (and may be live-through).
1813 auto It = DAG.LiveIns[LIRegion].find(Reg);
1814 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
1815 continue;
1816 Remat.LiveInRegions.insert(LIRegion);
1817
1818 // Account for the reduction in RP due to the rematerialization in an
1819 // optimizable region in which the defined register is a live-in. This
1820 // is exact for live-through regions but optimistic in the using region,
1821 // where RP is actually reduced only if maximum RP is reached somewhere
1822 // between the beginning of the region and the rematerializable
1823 // instruction's use.
1824 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
1825 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
1826 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
1827 RematUseful))
1828 return true;
1829 }
1830 }
1831
1832 // If the instruction is not a live-in or live-out in any optimizable
1833 // region then there is no point in rematerializing it.
1834 if (!RematUseful) {
1835 Rematerializations.pop_back();
1836 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
1837 } else {
1838 RematRegs.insert(Reg);
1839 }
1840 }
1841 }
1842
1843 if (TargetOcc) {
1844 // We were trying to increase occupancy but failed, abort the stage.
1845 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
1846 Rematerializations.clear();
1847 return false;
1848 }
1849 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
1850 return !Rematerializations.empty();
1851}
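The decision above rests on the GCNRPTarget save/satisfied protocol exercised by the ReduceRPInRegion lambda. Below is a standalone sketch of that bookkeeping, assuming the same GCNRPTarget and DenseMap APIs used in this file; the free-function form and its name are hypothetical, and the lambda's Progress flag is omitted:

// Sketch only: credit one register saving against the pressure target of
// region RegionIdx and drop the region from the optimizable set once its
// target is met. Returns true when no optimizable region remains.
static bool creditSaving(DenseMap<unsigned, GCNRPTarget> &OptRegions,
                         unsigned RegionIdx, Register Reg, LaneBitmask Mask,
                         MachineRegisterInfo &MRI) {
  auto It = OptRegions.find(RegionIdx);
  if (It == OptRegions.end())
    return OptRegions.empty();
  GCNRPTarget &Target = It->second;
  if (Target.isSaveBeneficial(Reg)) {
    Target.saveReg(Reg, Mask, MRI);
    if (Target.satisfied())
      OptRegions.erase(RegionIdx);
  }
  return OptRegions.empty();
}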
1852
1853void PreRARematStage::rematerialize() {
1854 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
1855
1856 // Collect regions whose RP changes in an unpredictable way; we will have to
1857 // fully recompute their RP after all rematerializations.
1858 DenseSet<unsigned> RecomputeRP;
1859
1860 // Rematerialize all instructions.
1861 for (auto &[DefMI, Remat] : Rematerializations) {
1862 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
1863 Register Reg = DefMI->getOperand(0).getReg();
1864 unsigned DefRegion = MIRegion.at(DefMI);
1865
1866 // Rematerialize DefMI to its use block.
1867 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
1868 AMDGPU::NoSubRegister, *DefMI, *DAG.TRI);
1869 Remat.RematMI = &*std::prev(InsertPos);
1870 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
1871
1872 // Update region boundaries in regions we sank from (remove defining MI)
1873 // and to (insert MI rematerialized in use block). Only then can we erase
1874 // the original MI.
1875 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
1876 auto UseRegion = MIRegion.find(Remat.UseMI);
1877 if (UseRegion != MIRegion.end()) {
1878 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
1879 Remat.RematMI);
1880 }
1881 DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
1882 DefMI->eraseFromParent();
1883
1884 // Collect all regions impacted by the rematerialization and update their
1885 // live-in/RP information.
1886 for (unsigned I : Remat.LiveInRegions) {
1887 ImpactedRegions.insert({I, DAG.Pressure[I]});
1888 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
1889
1890#ifdef EXPENSIVE_CHECKS
1891 // All uses are known to be available / live at the remat point. Thus, the
1892 // uses should already be live in to the region.
1893 for (MachineOperand &MO : DefMI->operands()) {
1894 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
1895 continue;
1896
1897 Register UseReg = MO.getReg();
1898 if (!UseReg.isVirtual())
1899 continue;
1900
1901 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
1902 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
1903 if (LI.hasSubRanges() && MO.getSubReg())
1904 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
1905
1906 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
1907 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
1908 // If this register has lanes not covered by the LiveIns, be sure they
1909 // do not map to any subrange. ref:
1910 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
1911 if (UncoveredLanes.any()) {
1912 assert(LI.hasSubRanges());
1913 for (LiveInterval::SubRange &SR : LI.subranges())
1914 assert((SR.LaneMask & UncoveredLanes).none());
1915 }
1916 }
1917#endif
1918
1919 // The register is no longer a live-in in all regions but the one that
1920 // contains the single use. In live-through regions, maximum register
1921 // pressure decreases predictably so we can directly update it. In the
1922 // using region, maximum RP may or may not decrease, so we will mark it
1923 // for re-computation after all materializations have taken place.
1924 LaneBitmask PrevMask = RegionLiveIns[Reg];
1925 RegionLiveIns.erase(Reg);
1926 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
1927 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
1928 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
1929 else
1930 RecomputeRP.insert(I);
1931 }
1932 // RP in the region from which the instruction was rematerialized may or may
1933 // not decrease.
1934 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
1935 RecomputeRP.insert(DefRegion);
1936
1937 // Recompute live interval to reflect the register's rematerialization.
1938 Register RematReg = Remat.RematMI->getOperand(0).getReg();
1939 DAG.LIS->removeInterval(RematReg);
1940 DAG.LIS->createAndComputeVirtRegInterval(RematReg);
1941 }
1942
1943 // All regions impacted by at least one rematerialization must be rescheduled.
1944 // Maximum pressure must also be recomputed for all regions where it changed
1945 // non-predictably and checked against the target occupancy.
1946 unsigned DynamicVGPRBlockSize =
1947 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
1948 AchievedOcc = MFI.getMaxWavesPerEU();
1949 for (auto &[I, OriginalRP] : ImpactedRegions) {
1950 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
1951 RescheduleRegions[I] = !IsEmptyRegion;
1952 if (!RecomputeRP.contains(I))
1953 continue;
1954
1955 GCNRegPressure RP;
1956 if (IsEmptyRegion) {
1957 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1958 } else {
1959 GCNDownwardRPTracker RPT(*DAG.LIS);
1960 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
1961 DAG.Regions[I].second);
1962 if (NonDbgMI == DAG.Regions[I].second) {
1963 // Region is non-empty but contains only debug instructions.
1964 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
1965 } else {
1966 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
1967 RPT.advance(DAG.Regions[I].second);
1968 RP = RPT.moveMaxPressure();
1969 }
1970 }
1971 DAG.Pressure[I] = RP;
1972 AchievedOcc =
1973 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1974 }
1975 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
1976}
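For regions recorded in RecomputeRP, maximum pressure is rebuilt from scratch with the downward tracker as shown above. A compact sketch of that recomputation for a single region, restricted to the tracker calls that already appear in this file; the helper name is hypothetical:

// Sketch only: recompute the maximum register pressure of the half-open
// region [Begin, End) given its live-in set.
static GCNRegPressure recomputeMaxRP(const LiveIntervals &LIS,
                                     const MachineRegisterInfo &MRI,
                                     GCNRPTracker::LiveRegSet &LiveIns,
                                     MachineBasicBlock::iterator Begin,
                                     MachineBasicBlock::iterator End) {
  if (Begin == End)
    return getRegPressure(MRI, LiveIns); // Empty region: live-ins only.
  auto NonDbgMI = skipDebugInstructionsForward(Begin, End);
  if (NonDbgMI == End)
    return getRegPressure(MRI, LiveIns); // Region contains only debug instrs.
  GCNDownwardRPTracker RPT(LIS);
  RPT.reset(*NonDbgMI, &LiveIns);
  RPT.advance(End);
  return RPT.moveMaxPressure();
}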
1977
1978// Copied from MachineLICM
1979bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
1980 if (!DAG.TII->isReMaterializable(MI))
1981 return false;
1982
1983 for (const MachineOperand &MO : MI.all_uses()) {
1984 // We can't remat physreg uses, unless the register is constant or the use
1985 // is ignorable (e.g. the implicit exec use on VALU instructions).
1986 if (MO.getReg().isPhysical()) {
1987 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
1988 continue;
1989 return false;
1990 }
1991 }
1992
1993 return true;
1994}
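isReMaterializable is only the first gate; canIncreaseOccupancyOrReduceSpill additionally requires a single virtual definition, a single non-debug user, and operands that are still available at the use point. A condensed sketch of those extra checks, reusing the queries shown above; the helper name is hypothetical and the different-region test is left out:

// Sketch only: the extra legality checks applied to a rematerialization
// candidate DefMI whose single non-debug user is UseMI.
static bool isUsefulRematCandidate(const MachineInstr &DefMI,
                                   const MachineInstr &UseMI,
                                   const MachineRegisterInfo &MRI,
                                   const LiveIntervals &LIS,
                                   const TargetInstrInfo &TII) {
  Register Reg = DefMI.getOperand(0).getReg();
  if (!Reg.isVirtual() || !MRI.hasOneDef(Reg))
    return false;
  // Rematerializing must not extend any live range: every register DefMI
  // reads has to still be available right before the use.
  SlotIndex UseIdx = LIS.getInstructionIndex(UseMI).getRegSlot(true);
  return VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, LIS, MRI, TII);
}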
1995
1996void PreRARematStage::finalizeGCNSchedStage() {
1997 // We consider that reducing spilling is always beneficial, so we never
1998 // roll back rematerializations in such cases. It's also possible that
1999 // rescheduling lowers occupancy below the one achieved just through remats, in
2000 // which case we do not want to roll back either (the rescheduling was already
2001 // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2002 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2003 if (!TargetOcc || MaxOcc >= *TargetOcc)
2004 return;
2005
2006 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2007 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2008
2009 // Rollback the rematerializations.
2010 for (const auto &[DefMI, Remat] : Rematerializations) {
2011 MachineInstr &RematMI = *Remat.RematMI;
2012 unsigned DefRegion = MIRegion.at(DefMI);
2013 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2014 MachineBasicBlock *MBB = RegionBB[DefRegion];
2015 Register Reg = RematMI.getOperand(0).getReg();
2016
2017 // Re-rematerialize MI at the end of its original region. Note that it may
2018 // not be rematerialized exactly in the same position as originally within
2019 // the region, but it should not matter much.
2020 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI,
2021 *DAG.TRI);
2022 MachineInstr *NewMI = &*std::prev(InsertPos);
2023 DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2024
2025 auto UseRegion = MIRegion.find(Remat.UseMI);
2026 if (UseRegion != MIRegion.end()) {
2027 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2028 nullptr);
2029 }
2030 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2031
2032 // Erase rematerialized MI.
2033 DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2034 RematMI.eraseFromParent();
2035
2036 // Recompute live interval for the re-rematerialized register
2037 DAG.LIS->removeInterval(Reg);
2038 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2039
2040 // Re-add the register as a live-in in all regions it used to be one in.
2041 for (unsigned LIRegion : Remat.LiveInRegions)
2042 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2043 }
2044
2045 // Reset RP in all impacted regions.
2046 for (auto &[I, OriginalRP] : ImpactedRegions)
2047 DAG.Pressure[I] = OriginalRP;
2048
2049 GCNSchedStage::finalizeGCNSchedStage();
2050}
2051
2052void GCNScheduleDAGMILive::updateRegionBoundaries(
2053 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2054 MachineInstr *NewMI) {
2055 assert((!NewMI || NewMI != RegionBounds.second) &&
2056 "cannot remove at region end");
2057
2058 if (RegionBounds.first == RegionBounds.second) {
2059 assert(NewMI && "cannot remove from an empty region");
2060 RegionBounds.first = NewMI;
2061 return;
2062 }
2063
2064 // We only care about modifications at the beginning of a non-empty region since
2065 // the upper region boundary is exclusive.
2066 if (MI != RegionBounds.first)
2067 return;
2068 if (!NewMI)
2069 RegionBounds.first = std::next(MI); // Removal
2070 else
2071 RegionBounds.first = NewMI; // Insertion
2072}
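Only a change at the first instruction of a non-empty region matters here because the upper bound is exclusive. The two call patterns that drive this function in rematerialize() and in the rollback code above are, echoed with the same local names used there:

// Usage illustration only, mirroring the calls in rematerialize() above:
//   Removal: erase the defining MI from its region; the region's begin
//   iterator advances past it because NewMI is null.
//     DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
//   Insertion: RematMI was placed right before InsertPos and may become the
//   region's new first instruction.
//     DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
//                                Remat.RematMI);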
2073
2074static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2075 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2076 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2077 return SII->isIGLPMutationOnly(MI->getOpcode());
2078 });
2079}
2080
2081GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2082 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2083 bool RemoveKillFlags)
2084 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2085
2086void GCNPostScheduleDAGMILive::schedule() {
2087 HasIGLPInstrs = hasIGLPInstrs(this);
2088 if (HasIGLPInstrs) {
2089 SavedMutations.clear();
2090 SavedMutations.swap(Mutations);
2091 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2092 }
2093
2094 ScheduleDAGMI::schedule();
2095}
2096
2097void GCNPostScheduleDAGMILive::finalizeSchedule() {
2098 if (HasIGLPInstrs)
2099 SavedMutations.swap(Mutations);
2100
2101 ScheduleDAGMI::finalizeSchedule();
2102}
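GCNPostScheduleDAGMILive temporarily swaps the default DAG mutations for the IGroupLP mutation when the region contains IGLP instructions, then swaps them back in finalizeSchedule(). A minimal sketch of that pattern, assuming AMDGPU::SchedulingPhase::PostRA as in this post-RA path; the helper name is hypothetical:

// Sketch only: stash the default mutations and install the IGroupLP
// mutation; swapping the vectors back later restores the original set.
static void
swapInIGLPMutation(std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations,
                   std::vector<std::unique_ptr<ScheduleDAGMutation>> &Saved) {
  Saved.clear();
  Saved.swap(Mutations); // Mutations is now empty, Saved holds the defaults.
  Mutations.push_back(
      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
}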