// GCNSchedStrategy.cpp — source listing captured from the LLVM 23.0.0git
// doxygen page. NOTE(review): the two lines below this header in the capture
// ("GCNSchedStrategy.cpp", "Go to the documentation of this file.") were
// page-navigation chrome, not part of the original source file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
41#include "llvm/MC/LaneBitmask.h"
43#include "llvm/MC/MCSchedule.h"
46
47#define DEBUG_TYPE "machine-scheduler"
48
49using namespace llvm;
50
// Command-line options controlling the GCN scheduling stages.
// NOTE(review): for most options below the opening declaration line
// (`static cl::opt<T> <Name>(`) was lost in extraction; restore the variable
// names from upstream before this compiles.

// Opt-out for the unclustered high-register-pressure rescheduling stage.
    "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
    cl::desc("Disable unclustered high register pressure "
             "reduction scheduling stage."),
    cl::init(false));

// Opt-out for the clustered low-occupancy rescheduling (ILP) stage.
    "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
    cl::desc("Disable clustered low occupancy "
             "rescheduling for ILP scheduling stage."),
    cl::init(false));

// Occupancy-vs-latency weight of the schedule metric (100 = occupancy only).
// NOTE(review): this option's `cl::desc(` line was also lost in extraction.
    "amdgpu-schedule-metric-bias", cl::Hidden,
        "Sets the bias which adds weight to occupancy vs latency. Set it to "
        "100 to chase the occupancy only."),
    cl::init(10));

// Allow lower occupancy targets for memory-bound or wave-limited kernels.
static cl::opt<bool>
    RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
               cl::desc("Relax occupancy targets for kernels which are memory "
                        "bound (amdgpu-membound-threshold), or "
                        "Wave Limited (amdgpu-limit-wave-threshold)."),
               cl::init(false));

// Switch from the generic RegPressureTrackers to the AMDGPU-specific ones.
    "amdgpu-use-amdgpu-trackers", cl::Hidden,
    cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
    cl::init(false));

// Upper bound on Available+Pending queue size before the pending queue is
// also searched for candidates; 0 effectively disables pending inspection.
// NOTE(review): this option's `cl::desc(` line was also lost in extraction.
    "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
        "Max (Available+Pending) size to inspect pending queue (0 disables)"),
    cl::init(256));

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
#define DUMP_MAX_REG_PRESSURE
// Debug-only dump of live registers (with def/uses) at the point of maximum
// register pressure, before scheduling.
    "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
    cl::desc("Print a list of live registers along with their def/uses at the "
             "point of maximum register pressure before scheduling."),
    cl::init(false));

// Debug-only dump of live registers (with def/uses) at the point of maximum
// register pressure, after scheduling.
    "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
    cl::desc("Print a list of live registers along with their def/uses at the "
             "point of maximum register pressure after scheduling."),
    cl::init(false));
#endif

// Opt-out for the MFMA form rewrite scheduling stage (cl::init(true) — the
// stage is disabled by default).
    "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
    cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
// Fixed-point scale used when expressing the schedule metric as a percentage.
const unsigned ScheduleMetrics::ScaleFactor = 100;
108
// NOTE(review): extraction dropped the GCNSchedStrategy constructor and the
// opening of what appears to be GCNSchedStrategy::initialize() here; only the
// body below survives. Restore the missing lines from upstream before
// compiling.

  MF = &DAG->MF;

  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();

  // Number of allocatable SGPR32s/VGPR32s for this function.
  // NOTE(review): the assignment targets of these two calls (presumably
  // SGPRExcessLimit / VGPRExcessLimit, both read below) were lost.
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);

  // Set the initial TargetOccupancy to the maximum occupancy that we can
  // achieve for this function. This effectively sets a lower bound on the
  // 'Critical' register limits in the scheduler.
  // Allow for lower occupancy targets if kernel is wave limited or memory
  // bound, and using the relaxed occupancy feature.
  // NOTE(review): the TargetOccupancy / KnownExcessRP setup and the
  // `SGPRCriticalLimit =` assignment target were lost in extraction.
      std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);

  if (!KnownExcessRP) {
    VGPRCriticalLimit = std::min(
        ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
    // NOTE(review): the second std::min operand line was lost in extraction.
  } else {
    // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
    // returns a reasonably small number for targets with lots of VGPRs, such
    // as GFX10 and GFX11.
    LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
                         "VGPRCriticalLimit calculation method.\n");
    unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
    unsigned Granule =
        AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
    unsigned Addressable =
        AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
    // Budget VGPRs for the occupancy target, rounded down to the allocation
    // granule but never below a single granule.
    unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
    VGPRBudget = std::max(VGPRBudget, Granule);
    VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
  }

  // Subtract error margin and bias from register limits and avoid overflow.
  // NOTE(review): the subtraction statements themselves were lost in
  // extraction; only the debug dump of the final limits remains.
  LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
                    << ", VGPRExcessLimit = " << VGPRExcessLimit
                    << ", SGPRCriticalLimit = " << SGPRCriticalLimit
                    << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
}
169
170/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
171/// current register pressure.
172///
173/// This works for the common case, but it has a few exceptions that have been
174/// observed through trial and error:
175/// - Explicit physical register operands
176/// - Subregister definitions
177///
178/// In both of those cases, PressureDiff doesn't represent the actual pressure,
179/// and querying LiveIntervals through the RegPressureTracker is needed to get
180/// an accurate value.
181///
182/// We should eventually only use PressureDiff for maximum performance, but this
183/// already allows 80% of SUs to take the fast path without changing scheduling
184/// at all. Further changes would either change scheduling, or require a lot
185/// more logic to recover an accurate pressure estimate from the PressureDiffs.
186static bool canUsePressureDiffs(const SUnit &SU) {
187 if (!SU.isInstr())
188 return false;
189
190 // Cannot use pressure diffs for subregister defs or with physregs, it's
191 // imprecise in both cases.
192 for (const auto &Op : SU.getInstr()->operands()) {
193 if (!Op.isReg() || Op.isImplicit())
194 continue;
195 if (Op.getReg().isPhysical() ||
196 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
197 return false;
198 }
199 return true;
200}
201
203 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
204 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
206 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
207 // getDownwardPressure() and getUpwardPressure() make temporary changes to
208 // the tracker, so we need to pass those function a non-const copy.
209 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
210 if (!useGCNTrackers()) {
211 AtTop
212 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
213 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
214
215 return;
216 }
217
218 // GCNTrackers
219 Pressure.resize(4, 0);
220 MachineInstr *MI = SU->getInstr();
221 GCNRegPressure NewPressure;
222 if (AtTop) {
223 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
224 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
225 } else {
226 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
227 TempUpwardTracker.recede(*MI);
228 NewPressure = TempUpwardTracker.getPressure();
229 }
230 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
231 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
232 NewPressure.getArchVGPRNum();
233 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
234}
235
// Estimates how many cycles issuing \p SU would stall, combining modeled
// reserved-resource availability with target hazard-recognizer wait states.
// NOTE(review): the opening signature line (the member name and its
// SchedBoundary parameter, referenced below as `Zone`) was lost in
// extraction.
                                                  SUnit *SU) const {
  // Only implemented for top-down scheduling currently.
  if (!Zone.isTop() || !SU)
    return 0;

  MachineInstr *MI = SU->getInstr();
  unsigned CurrCycle = Zone.getCurrCycle();
  unsigned Stall = 0;

  // Query SchedModel for resource stalls (unbuffered resources).
  if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
    // The stall is determined by the reserved resource that becomes
    // available the latest.
    for (const MCWriteProcResEntry &PE :
         make_range(SchedModel->getWriteProcResBegin(SC),
                    SchedModel->getWriteProcResEnd(SC))) {
      unsigned NextAvail =
          Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
                                    PE.AcquireAtCycle)
              .first;
      if (NextAvail > CurrCycle)
        Stall = std::max(Stall, NextAvail - CurrCycle);
    }
  }

  // Query HazardRecognizer for sequence-dependent hazard penalties.
  // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
  // the post-RA configuration without vreg liveness.
  if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
      Zone.HazardRec->isEnabled()) {
    auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
    Stall = std::max(Stall, HR->getHazardWaitStates(MI));
  }

  return Stall;
}
272
// Initializes \p Cand for \p SU and, when pressure tracking is enabled,
// computes the candidate's register-pressure deltas (Excess / CriticalMax)
// consumed by the tryCandidate() heuristics.
// NOTE(review): the first signature line (the member name, presumably
// `initCandidate(SchedCandidate &Cand, SUnit *SU,`) was lost in extraction.
                                     bool AtTop,
                                     const RegPressureTracker &RPTracker,
                                     const SIRegisterInfo *SRI,
                                     unsigned SGPRPressure,
                                     unsigned VGPRPressure, bool IsBottomUp) {
  Cand.SU = SU;
  Cand.AtTop = AtTop;

  if (!DAG->isTrackingPressure())
    return;

  Pressure.clear();
  MaxPressure.clear();

  // We try to use the cached PressureDiffs in the ScheduleDAG whenever
  // possible over querying the RegPressureTracker.
  //
  // RegPressureTracker will make a lot of LIS queries which are very
  // expensive, it is considered a slow function in this context.
  //
  // PressureDiffs are precomputed and cached, and getPressureDiff is just a
  // trivial lookup into an array. It is pretty much free.
  //
  // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
  // PressureDiffs.
  if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
    getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
    // NOTE(review): the trailing argument line of this call was lost in
    // extraction.
  } else {
    // Reserve 4 slots.
    Pressure.resize(4, 0);
    Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
    Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;

    for (const auto &Diff : DAG->getPressureDiff(SU)) {
      if (!Diff.isValid())
        continue;
      // PressureDiffs is always bottom-up so if we're working top-down we need
      // to invert its sign.
      Pressure[Diff.getPSet()] +=
          (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
    }

#ifdef EXPENSIVE_CHECKS
    std::vector<unsigned> CheckPressure, CheckMaxPressure;
    getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
    // NOTE(review): the trailing argument line of this call was lost in
    // extraction.
    if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
        Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
            CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
      errs() << "Register Pressure is inaccurate when calculated through "
                "PressureDiff\n"
             << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
             << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
             << ", expected "
             << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
      report_fatal_error("inaccurate register pressure calculation");
    }
#endif
  }

  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];

  // If two instructions increase the pressure of different register sets
  // by the same amount, the generic scheduler will prefer to schedule the
  // instruction that increases the set with the least amount of registers,
  // which in our case would be SGPRs. This is rarely what we want, so
  // when we report excess/critical register pressure, we do it either
  // only for VGPRs or only for SGPRs.

  // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
  const unsigned MaxVGPRPressureInc = 16;
  bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
  bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;

  // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
  // to increase the likelihood we don't go over the limits. We should improve
  // the analysis to look through dependencies to find the path with the least
  // register pressure.

  // We only need to update the RPDelta for instructions that increase register
  // pressure. Instructions that decrease or keep reg pressure the same will be
  // marked as RegExcess in tryCandidate() when they are compared with
  // instructions that increase the register pressure.
  if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
    Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
  }

  if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
    HasHighPressure = true;
    Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
    Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
  }

  // Register pressure is considered 'CRITICAL' if it is approaching a value
  // that would reduce the wave occupancy for the execution unit. When
  // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
  // has the same cost, so we don't need to prefer one over the other.

  int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
  int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;

  if (SGPRDelta >= 0 || VGPRDelta >= 0) {
    HasHighPressure = true;
    // Report only the set that is deeper into critical territory.
    if (SGPRDelta > VGPRDelta) {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
      Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
    } else {
      Cand.RPDelta.CriticalMax =
          PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
      Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
    }
  }
}
395
// Returns true when the Pending queue should also be searched for scheduling
// candidates: the combined ready+pending size is within the command-line
// pending-queue limit and the target has a buffered instruction scheduling
// model. A limit of 0 therefore disables pending-queue inspection.
// NOTE(review): the first signature line (the function name and its
// SchedBoundary parameter, referenced below as `Zone`) was lost in
// extraction.
                                const TargetSchedModel *SchedModel) {
  bool HasBufferedModel =
      SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
  unsigned Combined = Zone.Available.size() + Zone.Pending.size();
  return Combined <= PendingQueueLimit && HasBufferedModel;
}
403
// Wraps SchedBoundary::pickOnlyChoice(): forwards its result unless the
// pending queue must also be considered (in which case there is no "only"
// choice and nullptr is returned so the full candidate scan runs).
// NOTE(review): the first signature line (the function name and its
// SchedBoundary parameter, referenced below as `Zone`) was lost in
// extraction.
                             const TargetSchedModel *SchedModel) {
  // pickOnlyChoice() releases pending instructions and checks for new hazards.
  SUnit *OnlyChoice = Zone.pickOnlyChoice();
  if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
    return OnlyChoice;

  return nullptr;
}
413
// Debug helper: prints the preferred candidate, the rejected one (if any),
// and the deciding reason. The entire body is inside LLVM_DEBUG, so this is
// a no-op outside debug builds.
// NOTE(review): the first signature line (the function name and its first
// parameter, referenced below as `Current`) was lost in extraction.
                            const SchedCandidate &Preferred) {
  LLVM_DEBUG({
    dbgs() << "Prefer:\t\t";
    DAG->dumpNode(*Preferred.SU);

    if (Current.SU) {
      dbgs() << "Not:\t";
      DAG->dumpNode(*Current.SU);
    }

    dbgs() << "Reason:\t\t";
    traceCandidate(Preferred);
  });
}
429
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue()
//
// Scans the Available queue (and, when shouldCheckPending() allows, the
// Pending queue) of the zone and leaves the best candidate in \p Cand.
// \p IsPending is set when the winner came from the Pending queue.
// NOTE(review): the signature line (presumably
// `pickNodeFromQueue(SchedBoundary &Zone,`) was lost in extraction.
                                         const CandPolicy &ZonePolicy,
                                         const RegPressureTracker &RPTracker,
                                         SchedCandidate &Cand, bool &IsPending,
                                         bool IsBottomUp) {
  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
  // NOTE(review): a line was lost here — presumably the declaration of the
  // `Pressure` view read in the !useGCNTrackers() branch below (likely taken
  // from RPTracker); confirm against upstream.
  unsigned SGPRPressure = 0;
  unsigned VGPRPressure = 0;
  IsPending = false;
  if (DAG->isTrackingPressure()) {
    if (!useGCNTrackers()) {
      SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
      VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
    } else {
      // Read the current pressure from the AMDGPU tracker matching the
      // scheduling direction.
      GCNRPTracker *T = IsBottomUp
                            ? static_cast<GCNRPTracker *>(&UpwardTracker)
                            : static_cast<GCNRPTracker *>(&DownwardTracker);
      SGPRPressure = T->getPressure().getSGPRNum();
      VGPRPressure = T->getPressure().getArchVGPRNum();
    }
  }
  LLVM_DEBUG(dbgs() << "Available Q:\n");
  ReadyQueue &AQ = Zone.Available;
  for (SUnit *SU : AQ) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
      Cand.setBest(TryCand);
    } else {
      printCandidateDecision(TryCand, Cand);
    }
  }

  if (!shouldCheckPending(Zone, SchedModel))
    return;

  // Also consider instructions that are not ready yet; picking one can still
  // win under the reduced tryPendingCandidate() heuristics.
  LLVM_DEBUG(dbgs() << "Pending Q:\n");
  ReadyQueue &PQ = Zone.Pending;
  for (SUnit *SU : PQ) {

    SchedCandidate TryCand(ZonePolicy);
    initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
                  VGPRPressure, IsBottomUp);
    // Pass SchedBoundary only when comparing nodes from the same boundary.
    SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
    tryPendingCandidate(Cand, TryCand, ZoneArg);
    if (TryCand.Reason != NoCand) {
      // Initialize resource delta if needed in case future heuristics query it.
      if (TryCand.ResDelta == SchedResourceDelta())
        TryCand.initResourceDelta(Zone.DAG, SchedModel);
      LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
      IsPending = true;
      Cand.setBest(TryCand);
    } else {
      printCandidateDecision(TryCand, Cand);
    }
  }
}
500
501// This function is mostly cut and pasted from
502// GenericScheduler::pickNodeBidirectional()
504 bool &PickedPending) {
505 // Schedule as far as possible in the direction of no choice. This is most
506 // efficient, but also provides the best heuristics for CriticalPSets.
507 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
508 IsTopNode = false;
509 return SU;
510 }
511 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
512 IsTopNode = true;
513 return SU;
514 }
515 // Set the bottom-up policy based on the state of the current bottom zone
516 // and the instructions outside the zone, including the top zone.
517 CandPolicy BotPolicy;
518 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
519 // Set the top-down policy based on the state of the current top zone and
520 // the instructions outside the zone, including the bottom zone.
521 CandPolicy TopPolicy;
522 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
523
524 bool BotPending = false;
525 // See if BotCand is still valid (because we previously scheduled from Top).
526 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
527 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
528 BotCand.Policy != BotPolicy) {
529 BotCand.reset(CandPolicy());
530 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
531 BotPending,
532 /*IsBottomUp=*/true);
533 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
534 } else {
536#ifndef NDEBUG
537 if (VerifyScheduling) {
538 SchedCandidate TCand;
539 TCand.reset(CandPolicy());
540 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
541 BotPending,
542 /*IsBottomUp=*/true);
543 assert(TCand.SU == BotCand.SU &&
544 "Last pick result should correspond to re-picking right now");
545 }
546#endif
547 }
548
549 bool TopPending = false;
550 // Check if the top Q has a better candidate.
551 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
552 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
553 TopCand.Policy != TopPolicy) {
554 TopCand.reset(CandPolicy());
555 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
556 TopPending,
557 /*IsBottomUp=*/false);
558 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
559 } else {
561#ifndef NDEBUG
562 if (VerifyScheduling) {
563 SchedCandidate TCand;
564 TCand.reset(CandPolicy());
565 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
566 TopPending,
567 /*IsBottomUp=*/false);
568 assert(TCand.SU == TopCand.SU &&
569 "Last pick result should correspond to re-picking right now");
570 }
571#endif
572 }
573
574 // Pick best from BotCand and TopCand.
575 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
576 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
577 SchedCandidate Cand = BotPending ? TopCand : BotCand;
578 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
579 PickedPending = BotPending && TopPending;
580
581 TryCand.Reason = NoCand;
582 if (BotPending || TopPending) {
583 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
584 } else {
585 tryCandidate(Cand, TryCand, nullptr);
586 }
587
588 if (TryCand.Reason != NoCand) {
589 Cand.setBest(TryCand);
590 }
591
592 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
593
594 IsTopNode = Cand.AtTop;
595 return Cand.SU;
596}
597
// This function is mostly cut and pasted from
// GenericScheduler::pickNode()
// NOTE(review): the signature line (presumably
// `SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {`) was lost in
// extraction, as were the two single-direction pickOnlyChoice() assignments
// marked below.
  if (DAG->top() == DAG->bottom()) {
    // Region fully scheduled — both zones must be drained.
    assert(Top.Available.empty() && Top.Pending.empty() &&
           Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
    return nullptr;
  }
  bool PickedPending;
  SUnit *SU;
  do {
    PickedPending = false;
    if (RegionPolicy.OnlyTopDown) {
      // NOTE(review): missing line here — presumably
      // `SU = pickOnlyChoice(Top, SchedModel);` given the `if (!SU)` below.
      if (!SU) {
        CandPolicy NoPolicy;
        TopCand.reset(NoPolicy);
        pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
                          PickedPending,
                          /*IsBottomUp=*/false);
        assert(TopCand.Reason != NoCand && "failed to find a candidate");
        SU = TopCand.SU;
      }
      IsTopNode = true;
    } else if (RegionPolicy.OnlyBottomUp) {
      // NOTE(review): missing line here — presumably
      // `SU = pickOnlyChoice(Bot, SchedModel);` given the `if (!SU)` below.
      if (!SU) {
        CandPolicy NoPolicy;
        BotCand.reset(NoPolicy);
        pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
                          PickedPending,
                          /*IsBottomUp=*/true);
        assert(BotCand.Reason != NoCand && "failed to find a candidate");
        SU = BotCand.SU;
      }
      IsTopNode = false;
    } else {
      SU = pickNodeBidirectional(IsTopNode, PickedPending);
    }
  } while (SU->isScheduled);

  if (PickedPending) {
    // A pending node is not ready yet: advance the zone to its ready cycle,
    // then keep bumping until the hazard recognizer is satisfied.
    unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
    SchedBoundary &Zone = IsTopNode ? Top : Bot;
    unsigned CurrentCycle = Zone.getCurrCycle();
    if (ReadyCycle > CurrentCycle)
      Zone.bumpCycle(ReadyCycle);

    // FIXME: checkHazard() doesn't give information about which cycle the
    // hazard will resolve so just keep bumping the cycle by 1. This could be
    // made more efficient if checkHazard() returned more details.
    while (Zone.checkHazard(SU))
      Zone.bumpCycle(Zone.getCurrCycle() + 1);

    Zone.releasePending();
  }

  if (SU->isTopReady())
    Top.removeReady(SU);
  if (SU->isBottomReady())
    Bot.removeReady(SU);

  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                    << *SU->getInstr());
  return SU;
}
664
665void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
666 if (useGCNTrackers()) {
667 MachineInstr *MI = SU->getInstr();
668 IsTopNode ? (void)DownwardTracker.advance(MI, false)
669 : UpwardTracker.recede(*MI);
670 }
671
672 return GenericScheduler::schedNode(SU, IsTopNode);
673}
674
// Steps the stage iterator to the first (on the initial call) or next
// scheduling stage; returns true while a stage remains to run.
// NOTE(review): the signature line (a member iterating SchedStages,
// presumably `advanceStage`) was lost in extraction.

  if (!CurrentStage)
    CurrentStage = SchedStages.begin();
  else
    CurrentStage++;

  return CurrentStage != SchedStages.end();
}
689
// Returns true if another scheduling stage follows the current one.
// NOTE(review): the signature line was lost in extraction.
  return std::next(CurrentStage) != SchedStages.end();
}
694
// Returns the stage after the current one; a current and a next stage must
// both exist.
// NOTE(review): the signature line was lost in extraction.
  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
  return *std::next(CurrentStage);
}
699
// Reduced heuristic comparison used for candidates taken from the Pending
// queue (see pickNodeFromQueue()); returns true when \p TryCand wins and
// records the deciding heuristic in TryCand.Reason.
// NOTE(review): the first signature line (the function name — from its use
// in pickNodeFromQueue() this appears to be tryPendingCandidate — plus the
// first parameter) was lost in extraction, as were several call-opening
// lines marked below.
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  // Avoid exceeding the target's limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  // NOTE(review): the opening line of this tryPressure() call (its first two
  // arguments) was lost in extraction.
  if (DAG->isTrackingPressure() &&
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // NOTE(review): the opening lines of the two resource comparisons below
    // (a tryLess() on critical resources and a tryGreater() on demanded
    // resources, by the surviving argument fragments) were lost in
    // extraction.
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
      return TryCand.Reason != NoCand;
  }

  return false;
}
740
753
758
// ILP-oriented candidate comparison: ranks stall cycles and latency ahead of
// critical register pressure; returns true when \p TryCand should replace
// \p Cand.
// NOTE(review): the first signature line (the class-qualified name and first
// parameter) was lost in extraction, as were several call-opening lines
// marked below.
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Avoid spilling by exceeding the register limit.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                  RegExcess, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;

    // Avoid critical resource consumption and balance the schedule.
    // NOTE(review): the opening lines of the two resource comparisons below
    // were lost in extraction.
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
      return TryCand.Reason != NoCand;

    // Unconditionally try to reduce latency.
    if (tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Keep clustered nodes together to encourage downstream peephole
  // optimizations which may reduce resource requirements.
  //
  // This is a best effort to set things up for a post-RA pass. Optimizations
  // like generating loads of multiple registers should ideally be done within
  // the scheduler pass by combining the loads during DAG postprocessing.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
  if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
                 Cluster))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max critical pressure in the scheduled region.
  // NOTE(review): the opening line of this tryPressure() call (its first two
  // arguments) was lost in extraction.
  if (DAG->isTrackingPressure() &&
                  TryCand, Cand, RegCritical, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Fall through to original instruction order.
    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
      TryCand.Reason = NodeOrder;
      return true;
    }
  }
  return false;
}
844
850
/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory
/// instructions together. This is achieved by:
/// 1. Prioritizing clustered operations before the stall-latency heuristic.
/// 2. Prioritizing long-latency loads before the stall-latency heuristic.
///
/// \param Cand provides the policy and current best candidate.
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
/// \param Zone describes the scheduled zone that we are extending, or nullptr
/// if Cand is from a different zone than TryCand.
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
// Memory-clause candidate comparison (contract described in the \file-level
// doc comment directly above): clustering and long-latency loads are ranked
// ahead of the generic stall heuristic.
// NOTE(review): the first signature line (the class-qualified name and first
// parameter) was lost in extraction, as were several call-opening lines
// marked below.
                                          SchedCandidate &TryCand,
                                          SchedBoundary *Zone) const {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Bias PhysReg Defs and copies to their uses and defined respectively.
  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
    return TryCand.Reason != NoCand;

  if (DAG->isTrackingPressure()) {
    // Avoid exceeding the target's limit.
    if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                    RegExcess, TRI, DAG->MF))
      return TryCand.Reason != NoCand;

    // Avoid increasing the max critical pressure in the scheduled region.
    // NOTE(review): the opening line of this tryPressure() call (its first
    // two arguments) was lost in extraction.
                    TryCand, Cand, RegCritical, TRI, DAG->MF))
      return TryCand.Reason != NoCand;
  }

  // MaxMemoryClause-specific: We prioritize clustered instructions as we would
  // get more benefit from clausing these memory instructions.
  unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
  unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
  bool CandIsClusterSucc =
      isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
  bool TryCandIsClusterSucc =
      isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
  if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
                 Cluster))
    return TryCand.Reason != NoCand;

  // We only compare a subset of features when comparing nodes between
  // Top and Bottom boundary. Some properties are simply incomparable, in many
  // other instances we should only override the other boundary if something
  // is a clear good pick on one boundary. Skip heuristics that are more
  // "tie-breaking" in nature.
  bool SameBoundary = Zone != nullptr;
  if (SameBoundary) {
    // For loops that are acyclic path limited, aggressively schedule for
    // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
    // heuristics to take precedence.
    if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
        tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // MaxMemoryClause-specific: Prioritize long latency memory load
    // instructions in top-bottom order to hide more latency. The mayLoad check
    // is used to exclude store-like instructions, which we do not want to
    // schedule too early.
    bool TryMayLoad =
        TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
    bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();

    if (TryMayLoad || CandMayLoad) {
      // "Long latency" means >10x the other candidate's latency.
      bool TryLongLatency =
          TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
      bool CandLongLatency =
          10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;

      if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
                     Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
                     Cand, Stall))
        return TryCand.Reason != NoCand;
    }
    // Prioritize instructions that read unbuffered resources by stall cycles.
    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
      return TryCand.Reason != NoCand;
  }

  if (SameBoundary) {
    // Weak edges are for clustering and other constraints.
    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
      return TryCand.Reason != NoCand;
  }

  // Avoid increasing the max pressure of the entire region.
  if (DAG->isTrackingPressure() &&
      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
                  Cand, RegMax, TRI, DAG->MF))
    return TryCand.Reason != NoCand;

  if (SameBoundary) {
    // Avoid critical resource consumption and balance the schedule.
    // NOTE(review): the opening lines of the two resource comparisons below
    // were lost in extraction.
                TryCand, Cand, ResourceReduce))
      return TryCand.Reason != NoCand;
                   Cand.ResDelta.DemandedResources, TryCand, Cand,
      return TryCand.Reason != NoCand;

    // Avoid serializing long latency dependence chains.
    // For acyclic path limited loops, latency was already checked above.
    if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
        !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;

    // Fall through to original instruction order.
    if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
      assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
      TryCand.Reason = NodeOrder;
      return true;
    }
  }

  return false;
}
978
// GCNScheduleDAGMILive constructor (member-initializer list onward; the
// `GCNScheduleDAGMILive::GCNScheduleDAGMILive(` signature line is absent from
// this listing). Caches the starting occupancy from SIMachineFunctionInfo and
// optionally relaxes the minimum occupancy target.
980 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
981 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
982 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
983 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
984 RegionLiveOuts(this, /*IsLiveOut=*/true) {
985
986 // We want regions with a single MI to be scheduled so that we can reason
987 // about them correctly during scheduling stages that move MIs between regions
988 // (e.g., rematerialization).
// NOTE(review): listing line 989 is missing here; presumably the statement
// that enables single-MI-region scheduling described above — confirm against
// the upstream file.
990 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
991 if (RelaxedOcc) {
// RelaxedOcc permits dropping below the starting occupancy, down to the
// function's minimum allowed occupancy.
992 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
993 if (MinOccupancy != StartingOccupancy)
994 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
995 << ".\n");
996 }
997}
998
999std::unique_ptr<GCNSchedStage>
1000GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
// Factory for the scheduling stages run by runSchedStages(). NOTE(review):
// the `case GCNSchedStageID::...:` labels (listing lines 1002, 1004, 1006,
// 1008, 1010, 1012 and 1014) are absent from this listing; each return below
// presumably corresponds to the stage type it constructs — confirm upstream.
1001 switch (SchedStageID) {
1003 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
1005 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
1007 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
1009 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
1011 return std::make_unique<PreRARematStage>(SchedStageID, *this);
1013 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
1015 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
1016 *this);
1017 }
1018
// Reaching here means an unhandled enumerator was passed in.
1019 llvm_unreachable("Unknown SchedStageID.");
1020}
1021
// GCNScheduleDAGMILive::schedule() body (the signature line is absent from
// this listing). Regions are only recorded here; the actual scheduling
// happens later from finalizeSchedule()/runSchedStages().
1023 // Collect all scheduling regions. The actual scheduling is performed in
1024 // GCNScheduleDAGMILive::finalizeSchedule.
1025 Regions.push_back(std::pair(RegionBegin, RegionEnd));
1026}
1027
// Recomputes the actual register pressure of region RegionIdx from live
// intervals (the return-type line at listing 1028 is absent from this
// listing).
1029GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
// An empty region's pressure is just the pressure of its live-in set.
1030 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
1031 return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
// NOTE(review): listing line 1032 is missing here; presumably it declares the
// RP tracker advanced below (e.g. `GCNDownwardRPTracker RPTracker(*LIS);`) —
// confirm against the upstream file.
1033 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
1034 &LiveIns[RegionIdx]);
1035 return RPTracker.moveMaxPressure();
1036}
1037
// Returns the last non-debug MI in [RegionBegin, RegionEnd) (the first
// signature line, with the return type and the RegionBegin parameter, is
// absent from this listing).
1039 MachineBasicBlock::iterator RegionEnd) {
1040 assert(RegionBegin != RegionEnd && "Region must not be empty");
// Step back from the exclusive end, skipping any trailing debug instructions.
1041 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
1042}
1043
// Computes live-in sets and max register pressure for every region of MBB in
// a single downward sweep, writing LiveIns[] and Pressure[]. May reuse (and
// produce) a cached live-in set for a unique-successor chain of blocks.
1044void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1045 const MachineBasicBlock *MBB) {
1046 GCNDownwardRPTracker RPTracker(*LIS);
1047
1048 // If the block has the only successor then live-ins of that successor are
1049 // live-outs of the current block. We can reuse calculated live set if the
1050 // successor will be sent to scheduling past current block.
1051
1052 // However, due to the bug in LiveInterval analysis it may happen that two
1053 // predecessors of the same successor block have different lane bitmasks for
1054 // a live-out register. Workaround that by sticking to one-to-one relationship
1055 // i.e. one predecessor with one successor block.
1056 const MachineBasicBlock *OnlySucc = nullptr;
1057 if (MBB->succ_size() == 1) {
1058 auto *Candidate = *MBB->succ_begin();
1059 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1060 SlotIndexes *Ind = LIS->getSlotIndexes();
// Only forward the live set if the successor is laid out after this block.
1061 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1062 OnlySucc = Candidate;
1063 }
1064 }
1065
1066 // Scheduler sends regions from the end of the block upwards.
// Advance CurRegion to the last region that still belongs to MBB; the sweep
// below then walks regions from that one back down to RegionIdx.
1067 size_t CurRegion = RegionIdx;
1068 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1069 if (Regions[CurRegion].first->getParent() != MBB)
1070 break;
1071 --CurRegion;
1072
1073 auto I = MBB->begin();
1074 auto LiveInIt = MBBLiveIns.find(MBB);
1075 auto &Rgn = Regions[CurRegion];
1076 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1077 if (LiveInIt != MBBLiveIns.end()) {
// A predecessor already computed this block's live-ins: start tracking from
// the block top with the cached set, consuming the cache entry.
1078 auto LiveIn = std::move(LiveInIt->second);
1079 RPTracker.reset(*MBB->begin(), &LiveIn);
1080 MBBLiveIns.erase(LiveInIt);
1081 } else {
// Otherwise start at the first region's begin, seeding with the precomputed
// per-region live-in map.
1082 I = Rgn.first;
1083 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1084#ifdef EXPENSIVE_CHECKS
1085 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1086#endif
1087 RPTracker.reset(*I, &LRS);
1088 }
1089
// Sweep downwards: record a region's live-ins when reaching its first MI and
// its max pressure when reaching its end; stop once RegionIdx is done.
1090 for (;;) {
1091 I = RPTracker.getNext();
1092
1093 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1094 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1095 RPTracker.clearMaxPressure();
1096 }
1097
1098 if (Regions[CurRegion].second == I) {
1099 Pressure[CurRegion] = RPTracker.moveMaxPressure();
// Post-decrement: break after the requested region itself was processed.
1100 if (CurRegion-- == RegionIdx)
1101 break;
1102 auto &Rgn = Regions[CurRegion];
1103 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1104 }
1105 RPTracker.advanceBeforeNext();
1106 RPTracker.advanceToNext();
1107 }
1108
1109 if (OnlySucc) {
// Finish tracking to the block end so the live set is the block's live-outs,
// then cache it as the unique successor's live-ins.
1110 if (I != MBB->end()) {
1111 RPTracker.advanceBeforeNext();
1112 RPTracker.advanceToNext();
1113 RPTracker.advance(MBB->end());
1114 }
1115 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1116 }
1117}
1118
// Builds a map from each region's first MI to the register set live before it
// (the return-type line at listing 1119 is absent from this listing).
1120GCNScheduleDAGMILive::getRegionLiveInMap() const {
1121 assert(!Regions.empty());
1122 std::vector<MachineInstr *> RegionFirstMIs;
1123 RegionFirstMIs.reserve(Regions.size());
1124 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1125 RegionFirstMIs.push_back(
// NOTE(review): listing line 1126 — the argument of this push_back,
// presumably the region's first non-debug MI — is missing; confirm against
// the upstream file.
1127
// /*After=*/false: live sets are taken before each key MI, i.e. live-ins.
1128 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1129}
1130
// Builds a map from each non-empty region's last MI to the register set live
// after it (the return-type line at listing 1131 is absent from this
// listing).
1132GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1133 assert(!Regions.empty());
1134 std::vector<MachineInstr *> RegionLastMIs;
1135 RegionLastMIs.reserve(Regions.size());
1136 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1137 // Skip empty regions.
1138 if (RegionBegin == RegionEnd)
1139 continue;
1140 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1141 }
// /*After=*/true: live sets are taken after each key MI, i.e. live-outs.
1142 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1143}
1144
// Rebuilds the region-index -> key-instruction map and the per-region live
// register map (the enclosing function's signature line at listing 1145 is
// absent; presumably `void RegionPressureMap::buildLiveRegMap()` — confirm
// upstream).
1146 IdxToInstruction.clear();
1147
// IsLiveOut selects whether this map tracks region live-outs or live-ins.
1148 RegionLiveRegMap =
1149 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1150 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1151 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1152 // Skip empty regions.
1153 if (RegionBegin == RegionEnd)
1154 continue;
// The key MI is the region's last MI for live-outs, its first for live-ins.
1155 MachineInstr *RegionKey =
1156 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1157 IdxToInstruction[I] = RegionKey;
1158 }
1159}
1160
// GCNScheduleDAGMILive::finalizeSchedule() body (the signature line is absent
// from this listing). Sizes and clears all per-region bookkeeping, then runs
// every scheduling stage over the recorded regions.
1162 // Start actual scheduling here. This function is called by the base
1163 // MachineScheduler after all regions have been recorded by
1164 // GCNScheduleDAGMILive::schedule().
1165 LiveIns.resize(Regions.size());
1166 Pressure.resize(Regions.size());
1167 RegionsWithHighRP.resize(Regions.size());
1168 RegionsWithExcessRP.resize(Regions.size());
1169 RegionsWithIGLPInstrs.resize(Regions.size());
1170 RegionsWithHighRP.reset();
1171 RegionsWithExcessRP.reset();
1172 RegionsWithIGLPInstrs.reset();
1173
1174 runSchedStages();
1175}
1176
1177void GCNScheduleDAGMILive::runSchedStages() {
1178 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1179
1180 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1181 if (!Regions.empty()) {
// The live-in map is always needed; the live-out map only when the GCN RP
// trackers are in use.
1182 BBLiveInMap = getRegionLiveInMap();
1183 if (S.useGCNTrackers())
1184 RegionLiveOuts.buildLiveRegMap();
1185 }
1186
1187#ifdef DUMP_MAX_REG_PRESSURE
// NOTE(review): listing lines 1188-1190 are missing here (presumably the
// statements opening this dump scope) — confirm upstream.
1191 LIS->dump();
1192 }
1193#endif
1194
// Each stage visits every region; a stage may decline a region via
// initGCNRegion(), in which case the region is still advanced and exited.
1195 while (S.advanceStage()) {
1196 auto Stage = createSchedStage(S.getCurrentStage());
1197 if (!Stage->initGCNSchedStage())
1198 continue;
1199
1200 for (auto Region : Regions) {
1201 RegionBegin = Region.first;
1202 RegionEnd = Region.second;
1203 // Setup for scheduling the region and check whether it should be skipped.
1204 if (!Stage->initGCNRegion()) {
1205 Stage->advanceRegion();
1206 exitRegion();
1207 continue;
1208 }
1209
1210 if (S.useGCNTrackers()) {
// Re-seed the strategy's downward/upward trackers with this region's
// live-in and live-out sets before scheduling it.
1211 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1212 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1213 GCNRPTracker::LiveRegSet *RegionLiveIns =
1214 &LiveIns[Stage->getRegionIdx()];
1215
1216 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1217 ->reset(MRI, *RegionLiveIns);
1218 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1219 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1220 Stage->getRegionIdx()));
1221 }
1222
// NOTE(review): listing line 1223 is missing here; presumably the call that
// actually schedules the region — confirm upstream.
1224 Stage->finalizeGCNRegion();
1225 Stage->advanceRegion();
1226 exitRegion();
1227 }
1228
1229 Stage->finalizeGCNSchedStage();
1230 }
1231
1232#ifdef DUMP_MAX_REG_PRESSURE
// NOTE(review): listing lines 1233-1235 are missing here — confirm upstream.
1236 LIS->dump();
1237 }
1238#endif
1239}
1240
1241#ifndef NDEBUG
// Debug printer mapping a GCNSchedStageID to a human-readable name (the
// function signature line at listing 1242 and all `case` labels are absent
// from this listing; each string below names the stage of the corresponding
// enumerator).
1243 switch (StageID) {
1245 OS << "Max Occupancy Initial Schedule";
1246 break;
1248 OS << "Instruction Rewriting Reschedule";
1249 break;
1251 OS << "Unclustered High Register Pressure Reschedule";
1252 break;
1254 OS << "Clustered Low Occupancy Reschedule";
1255 break;
1257 OS << "Pre-RA Rematerialize";
1258 break;
1260 OS << "Max ILP Initial Schedule";
1261 break;
1263 OS << "Max memory clause Initial Schedule";
1264 break;
1265 }
1266
1267 return OS;
1268}
1269#endif
1270
1274
// GCNSchedStage::initGCNSchedStage() body (the signature line is absent from
// this listing). A stage can only run when live-interval analysis is
// available.
1276 if (!DAG.LIS)
1277 return false;
1278
1279 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1280 return true;
1281}
1282
1283void RewriteMFMAFormStage::findReachingDefs(
1284 MachineOperand &UseMO, LiveIntervals *LIS,
1285 SmallVectorImpl<SlotIndex> &DefIdxs) {
1286 MachineInstr *UseMI = UseMO.getParent();
1287 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1288 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1289
1290 // If the def is not a PHI, then it must be the only reaching def.
1291 if (!VNI->isPHIDef()) {
1292 DefIdxs.push_back(VNI->def);
1293 return;
1294 }
1295
1296 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1297 SmallVector<MachineBasicBlock *, 8> Worklist;
1298
1299 // Mark the predecessor blocks for traversal
1300 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1301 Worklist.push_back(PredMBB);
1302 Visited.insert(PredMBB);
1303 }
1304
1305 while (!Worklist.empty()) {
1306 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1307
1308 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1309 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1310
1311 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1312
1313 // If there is a def in this block, then add it to the list. This is the
1314 // reaching def of this path.
1315 if (!VNI->isPHIDef()) {
1316 DefIdxs.push_back(VNI->def);
1317 continue;
1318 }
1319
1320 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1321 if (Visited.insert(PredMBB).second)
1322 Worklist.push_back(PredMBB);
1323 }
1324 }
1325}
1326
// Collects all non-debug operands that use the value defined by DefMI's
// operand 0 (the second signature line at listing 1328 — presumably declaring
// the DefMI and LIS parameters — is absent from this listing; confirm
// upstream).
1327void RewriteMFMAFormStage::findReachingUses(
1329 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1330 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1331 for (MachineOperand &UseMO :
1332 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1333 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1334 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1335
1336 // If we find a use that contains this DefMI in its reachingDefs, then it is
1337 // a reaching use.
1338 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1339 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1340 }))
1341 ReachingUses.push_back(&UseMO);
1342 }
1343}
1344
// RewriteMFMAFormStage::initGCNSchedStage() body (the signature line at
// listing 1345 is absent from this listing). Decides whether rewriting MFMA
// instructions is worthwhile for this function, and performs the rewrite when
// the cost model says it pays off.
1346 // We only need to run this pass if the architecture supports AGPRs.
1347 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1348 // is no need for this pass in that case, either.
1349 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1350 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1351 return false;
1352
// Flag every region whose ArchVGPR pressure exceeds the addressable budget.
1353 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1354 RegionsWithExcessArchVGPR.reset();
1355 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
// NOTE(review): listing line 1356 is missing here; presumably it initializes
// PressureBefore for this region — confirm upstream.
1357 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1358 RegionsWithExcessArchVGPR[Region] = true;
1359 }
1360
// Nothing to do unless some region exceeds the ArchVGPR budget.
1361 if (RegionsWithExcessArchVGPR.none())
1362 return false;
1363
1364 TII = ST.getInstrInfo();
1365 SRI = ST.getRegisterInfo();
1366
1367 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
// NOTE(review): listing lines 1368-1369 are missing here; presumably the
// declarations of the CopyForUse/CopyForDef containers used below — confirm
// upstream.
1370
1371 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1372 return false;
1373
1374 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1375
1376 // If we haven't found the beneficial conditions, prefer the VGPR form which
1377 // may result in less cross RC copies.
1378 if (Cost > 0)
1379 return false;
1380
1381 return rewrite(RewriteCands);
1382}
1383
// UnclusteredHighRPStage::initGCNSchedStage() body (the signature line and
// the two guard conditions at listing lines 1384-1385 and 1388 are absent
// from this listing; presumably the base-class initGCNSchedStage() call and
// the stage-disabling option — confirm upstream).
1386 return false;
1387
1389 return false;
1390
// Only worth re-scheduling when some region has high or excess RP.
1391 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1392 return false;
1393
// Swap out the DAG mutations for this stage; restored when the stage
// finalizes.
1394 SavedMutations.swap(DAG.Mutations);
1395 DAG.addMutation(
// NOTE(review): listing line 1396 — the mutation being added — is missing;
// confirm upstream.
1397
1398 InitialOccupancy = DAG.MinOccupancy;
1399 // Aggressively try to reduce register pressure in the unclustered high RP
1400 // stage. Temporarily increase occupancy target in the region.
1401 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1402 ? InitialOccupancy + 1
1403 : InitialOccupancy;
1404 IsAnyRegionScheduled = false;
// Bias the strategy's SGPR/VGPR limits to push pressure down harder.
1405 S.SGPRLimitBias = S.HighRPSGPRBias;
1406 S.VGPRLimitBias = S.HighRPVGPRBias;
1407
1408 LLVM_DEBUG(
1409 dbgs()
1410 << "Retrying function scheduling without clustering. "
1411 "Aggressively try to reduce register pressure to achieve occupancy "
1412 << TempTargetOccupancy << ".\n");
1413
1414 return true;
1415}
1416
// ClusteredLowOccStage::initGCNSchedStage() body (the signature line and the
// two guard conditions at listing lines 1417-1418 and 1421 are absent from
// this listing; presumably the base-class init and the stage-disabling
// option — confirm upstream).
1419 return false;
1420
1422 return false;
1423
1424 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1425 // been dropped. All regions will have already been scheduled with the ideal
1426 // occupancy targets.
1427 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1428 return false;
1429
1430 LLVM_DEBUG(
1431 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1432 << DAG.MinOccupancy << ".\n");
1433 return true;
1434}
1435
1436/// Allows to easily filter for this stage's debug output.
1437#define REMAT_PREFIX "[PreRARemat] "
1438#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1439
1440#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1441Printable PreRARematStage::ScoredRemat::print() const {
1442 return Printable([&](raw_ostream &OS) {
1443 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1444 });
1445}
1446#endif
1447
// PreRARematStage::initGCNSchedStage() body (the signature line at listing
// 1448 is absent from this listing). Sets per-region RP targets, then
// rematerializes registers in scored rounds until the targets are satisfied
// or candidates run out, finally committing pressure changes to the DAG.
1449 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1450 // regions inbetween the defs and region we sinked the def to. Will need to be
1451 // fixed if there is another pass after this pass.
1452 assert(!S.hasNextStage());
1453
1454 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1455 return false;
1456
1457 // Maps all MIs (except lone terminators, which are not part of any region) to
1458 // their parent region. Non-lone terminators are considered part of the region
1459 // they delimitate.
1460 DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
1461
1462 // Before performing any IR modification record the parent region of each MI
1463 // and the parent MBB of each region.
1464 const unsigned NumRegions = DAG.Regions.size();
1465 for (unsigned I = 0; I < NumRegions; ++I) {
1466 RegionBoundaries Region = DAG.Regions[I];
1467 for (auto MI = Region.first; MI != Region.second; ++MI)
1468 MIRegion.insert({&*MI, I});
1469 MachineBasicBlock *ParentMBB = Region.first->getParent();
1470 if (Region.second != ParentMBB->end())
1471 MIRegion.insert({&*Region.second, I});
1472 RegionBB.push_back(ParentMBB);
1473 }
1474
// Debug-only printers used by the REMAT_DEBUG output below.
1475#ifndef NDEBUG
1476 auto PrintTargetRegions = [&]() -> void {
1477 if (TargetRegions.none()) {
1478 dbgs() << REMAT_PREFIX << "No target regions\n";
1479 return;
1480 }
1481 dbgs() << REMAT_PREFIX << "Target regions:\n";
1482 for (unsigned I : TargetRegions.set_bits())
1483 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1484 };
1485 auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1486 return Printable([&, Remat](raw_ostream &OS) {
1487 // Concatenate all region numbers in which the register is unused and
1488 // live-through.
1489 bool HasLiveThroughRegion = false;
1490 OS << '[' << Remat.DefRegion << " -";
1491 for (unsigned I = 0; I < NumRegions; ++I) {
1492 if (Remat.isUnusedLiveThrough(I)) {
1493 if (HasLiveThroughRegion) {
1494 OS << ',';
1495 } else {
1496 OS << "- ";
1497 HasLiveThroughRegion = true;
1498 }
1499 OS << I;
1500 }
1501 }
1502 if (HasLiveThroughRegion)
1503 OS << " -";
1504 OS << "-> " << Remat.UseRegion << "] ";
1505 Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1506 /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1507 });
1508 };
1509#endif
1510
1511 // Set an objective for the stage based on current RP in each region.
1512 REMAT_DEBUG({
1513 dbgs() << "Analyzing ";
1514 MF.getFunction().printAsOperand(dbgs(), false);
1515 dbgs() << ": ";
1516 });
1517 if (!setObjective()) {
1518 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1519 << MFI.getMaxWavesPerEU() << '\n');
1520 return false;
1521 }
1522 LLVM_DEBUG({
1523 if (TargetOcc) {
1524 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1525 } else {
1526 dbgs() << "reduce spilling (minimum target occupancy is "
1527 << MFI.getMinWavesPerEU() << ")\n";
1528 }
1529 PrintTargetRegions();
1530 });
1531
1532 if (!collectRematRegs(MIRegion)) {
1533 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1534 return false;
1535 }
1536 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1537 REMAT_DEBUG({
1538 dbgs() << "Rematerializable registers:\n";
1539 for (const RematReg &Remat : RematRegs)
1540 dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1541 dbgs() << REMAT_PREFIX << "Region frequencies\n";
1542 for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1543 dbgs() << REMAT_PREFIX << " [" << I << "] ";
1544 if (Freq)
1545 dbgs() << Freq;
1546 else
1547 dbgs() << "unknown ";
1548 dbgs() << " | " << *DAG.Regions[I].first;
1549 }
1550 });
1551
1552 SmallVector<ScoredRemat> ScoredRemats;
1553 for (RematReg &Remat : RematRegs)
1554 ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
1555
1556// Rematerialize registers in successive rounds until all RP targets are
1557// satisifed or until we run out of rematerialization candidates.
1558#ifndef NDEBUG
1559 unsigned RoundNum = 0;
1560#endif
1561 BitVector RecomputeRP(NumRegions);
1562 do {
1563 assert(!ScoredRemats.empty() && "no more remat candidates");
1564
1565 // (Re-)Score and (re-)sort all remats in increasing score order.
1566 for (ScoredRemat &Remat : ScoredRemats)
1567 Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1568 sort(ScoredRemats);
1569
1570 REMAT_DEBUG({
1571 dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
1572 << REMAT_PREFIX
1573 << "Candidates with non-null score, in rematerialization order:\n";
1574 for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1575 if (RematDecision.hasNullScore())
1576 break;
1577 dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1578 << *RematDecision.Remat->DefMI;
1579 }
1580 PrintTargetRegions();
1581 });
1582
1583 RecomputeRP.reset();
1584 unsigned RematIdx = ScoredRemats.size();
1585
1586 // Rematerialize registers in decreasing score order until we estimate
1587 // that all RP targets are satisfied or until rematerialization candidates
1588 // are no longer useful to decrease RP.
1589 for (; RematIdx && TargetRegions.any(); --RematIdx) {
1590 const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1591 // Stop rematerializing on encountering a null score. Since scores
1592 // monotonically decrease as we rematerialize, we know there is nothing
1593 // useful left to do in such cases, even if we were to re-score.
1594 if (Candidate.hasNullScore()) {
1595 RematIdx = 0;
1596 break;
1597 }
1598
1599 RematReg &Remat = *Candidate.Remat;
1600 // When previous rematerializations in this round have already satisfied
1601 // RP targets in all regions this rematerialization can impact, we have a
1602 // good indication that our scores have diverged significantly from
1603 // reality, in which case we interrupt this round and re-score. This also
1604 // ensures that every rematerialization we perform is possibly impactful
1605 // in at least one target region.
1606 if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
1607 break;
1608
1609 REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
1610 MachineInstr *RematMI =
1611 Candidate.rematerialize(RecomputeRP, RPTargets, DAG);
1612 RescheduleRegions |= Remat.Live;
1613
1614 // Every rematerialization we do here is likely to move the instruction
1615 // into a higher frequency region, increasing the total sum latency of the
1616 // instruction itself. This is acceptable if we are eliminating a spill in
1617 // the process, but when the goal is increasing occupancy we get nothing
1618 // out of rematerialization if occupancy is not increased in the end; in
1619 // such cases we want to roll back the rematerialization.
1620 if (TargetOcc) {
1621 RollbackInfo &Rollback = Rollbacks.emplace_back(&Remat);
1622 Rollback.RematMI = RematMI;
1623 // Make the original MI a debug value so that it does not influence
1624 // scheduling and replace all read registers with a sentinel register to
1625 // prevent operands to appear in use-lists of other MIs during LIS
1626 // updates. Store mappings between operand indices and original
1627 // registers for potential rollback.
1628 Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE));
1629 for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
1630 if (MO.isReg() && MO.readsReg()) {
1631 Rollback.RegMap.insert({Idx, MO.getReg()});
1632 MO.setReg(Register());
1633 }
1634 }
1635 } else {
1636 // Just delete the original instruction if it cannot be rolled back.
1637 DAG.deleteMI(Remat.DefRegion, Remat.DefMI);
1638 }
1639
1640 unsetSatisfiedRPTargets(Remat.Live);
1641 }
1642
1643 REMAT_DEBUG({
1644 if (!TargetRegions.any()) {
1645 dbgs() << "** Interrupt round on all targets achieved\n";
1646 } else if (RematIdx) {
1647 dbgs() << "** Interrupt round on stale score for "
1648 << *ScoredRemats[RematIdx - 1].Remat->DefMI;
1649 } else {
1650 dbgs() << "** Stop on exhausted rematerialization candidates\n";
1651 }
1652 });
1653
1654 // Peel off registers we already rematerialized from the vector's tail.
1655 ScoredRemats.truncate(RematIdx);
// Loop again while some target region still needs RP work (after re-checking
// the regions whose RP must be recomputed) and candidates remain.
1656 } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
1657 !ScoredRemats.empty());
1658 if (RescheduleRegions.none())
1659 return false;
1660
1661 // Commit all pressure changes to the DAG and compute minimum achieved
1662 // occupancy in impacted regions.
1663 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1664 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1665 for (unsigned I : RescheduleRegions.set_bits()) {
1666 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1667 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1668 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1669 << " (" << RPTargets[I] << ")\n");
1670 }
// AchievedOcc is the minimum occupancy over all regions after remat.
1671 AchievedOcc = MFI.getMaxWavesPerEU();
1672 for (const GCNRegPressure &RP : DAG.Pressure) {
1673 AchievedOcc =
1674 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1675 }
1676
1677 REMAT_DEBUG({
1678 dbgs() << "Retrying function scheduling with new min. occupancy of "
1679 << AchievedOcc << " from rematerializing (original was "
1680 << DAG.MinOccupancy;
1681 if (TargetOcc)
1682 dbgs() << ", target was " << *TargetOcc;
1683 dbgs() << ")\n";
1684 });
1685
1686 DAG.setTargetOccupancy(getStageTargetOccupancy());
1687 return true;
1688}
1689
// GCNSchedStage::finalizeGCNSchedStage() body (the signature line is absent
// from this listing). Closes the current block and logs stage completion.
1691 DAG.finishBlock();
1692 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1693}
1694
// UnclusteredHighRPStage::finalizeGCNSchedStage() body (the signature line at
// listing 1695 is absent, as are the LLVM_DEBUG(...) openings at listing
// 1700/1705 and the statement at 1711, presumably the base-class finalize
// call — confirm upstream). Restores the saved DAG mutations and clears the
// temporary RP limit biases set in initGCNSchedStage().
1696 SavedMutations.swap(DAG.Mutations);
1697 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1698 if (DAG.MinOccupancy > InitialOccupancy) {
// An occupancy increase implies at least one region was rescheduled.
1699 assert(IsAnyRegionScheduled);
1701 << " stage successfully increased occupancy to "
1702 << DAG.MinOccupancy << '\n');
1703 } else if (!IsAnyRegionScheduled) {
1704 assert(DAG.MinOccupancy == InitialOccupancy);
1706 << ": No regions scheduled, min occupancy stays at "
1707 << DAG.MinOccupancy << ", MFI occupancy stays at "
1708 << MFI.getOccupancy() << ".\n");
1709 }
1710
1712}
1713
// GCNSchedStage::initGCNRegion() body (the signature line at listing 1714 is
// absent from this listing). Sets up the DAG for the current region, records
// the unscheduled instruction order for possible reverts, and decides whether
// the region should be scheduled at all.
1715 // Skip empty scheduling region.
1716 if (DAG.begin() == DAG.end())
1717 return false;
1718
1719 // Check whether this new region is also a new block.
1720 if (DAG.RegionBegin->getParent() != CurrentMBB)
1721 setupNewBlock();
1722
1723 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1724 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1725
1726 // Skip regions with 1 schedulable instruction.
1727 if (DAG.begin() == std::prev(DAG.end()))
1728 return false;
1729
1730 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1731 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1732 << " " << CurrentMBB->getName()
1733 << "\n From: " << *DAG.begin() << " To: ";
1734 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1735 else dbgs() << "End";
1736 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1737
1738 // Save original instruction order before scheduling for possible revert.
1739 Unsched.clear();
1740 Unsched.reserve(DAG.NumRegionInstrs);
// NOTE(review): listing lines 1741-1742 are missing here; presumably the
// condition selecting this branch, which additionally detects IGLP
// instructions while saving the order — confirm upstream.
1743 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1744 for (auto &I : DAG) {
1745 Unsched.push_back(&I);
1746 if (SII->isIGLPMutationOnly(I.getOpcode()))
1747 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1748 }
1749 } else {
1750 for (auto &I : DAG)
1751 Unsched.push_back(&I);
1752 }
1753
1754 PressureBefore = DAG.Pressure[RegionIdx];
1755
1756 LLVM_DEBUG(
1757 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1758 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1759 << "Region live-in pressure: "
1760 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1761 << "Region register pressure: " << print(PressureBefore));
1762
1763 S.HasHighPressure = false;
1764 S.KnownExcessRP = isRegionWithExcessRP();
1765
1766 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
// NOTE(review): listing line 1767 — the second half of this condition — is
// missing; confirm upstream.
1768 SavedMutations.clear();
1769 SavedMutations.swap(DAG.Mutations);
1770 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
// NOTE(review): listing line 1771 — the rest of this disjunction — is
// missing; confirm upstream.
1772 DAG.addMutation(createIGroupLPDAGMutation(
1773 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
// NOTE(review): listing line 1774 — the alternative scheduling phase and
// closing parentheses — is missing; confirm upstream.
1775 }
1776
1777 return true;
1778}
1779
// UnclusteredHighRPStage::initGCNRegion() body (the signature line is absent
// from this listing).
1781 // Only reschedule regions that have excess register pressure (i.e. spilling)
1782 // or had minimum occupancy at the beginning of the stage (as long as
1783 // rescheduling of previous regions did not make occupancy drop back down to
1784 // the initial minimum).
1785 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1786 // If no region has been scheduled yet, the DAG has not yet been updated with
1787 // the occupancy target. So retrieve it from the temporary.
1788 unsigned CurrentTargetOccupancy =
1789 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1790 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1791 (CurrentTargetOccupancy <= InitialOccupancy ||
1792 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1793 InitialOccupancy))
1794 return false;
1795
1796 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1797 // If this is the first region scheduled during this stage, make the target
1798 // occupancy changes in the DAG and MFI.
1799 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1800 IsAnyRegionScheduled = true;
1801 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1802 DAG.setTargetOccupancy(TempTargetOccupancy);
1803 }
1804 return IsSchedulingThisRegion;
1805}
1806
// ClusteredLowOccStage::initGCNRegion() body (the signature line and the
// final return statement at listing 1816 — presumably deferring to
// GCNSchedStage::initGCNRegion() — are absent from this listing; confirm
// upstream).
1808 // We may need to reschedule this region if it wasn't rescheduled in the last
1809 // stage, or if we found it was testing critical register pressure limits in
1810 // the unclustered reschedule stage. The later is because we may not have been
1811 // able to raise the min occupancy in the previous stage so the region may be
1812 // overly constrained even if it was already rescheduled.
1813 if (!DAG.RegionsWithHighRP[RegionIdx])
1814 return false;
1815
1817}
1818
// PreRARematStage::initGCNRegion() body (the signature line is absent from
// this listing). Only regions flagged for rescheduling after
// rematerialization are scheduled.
1820 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1821}
1822
// GCNSchedStage::setupNewBlock() body (the signature line is absent from this
// listing). Finishes the previous block, starts the new one, and computes its
// per-region pressure when needed.
1824 if (CurrentMBB)
1825 DAG.finishBlock();
1826
1827 CurrentMBB = DAG.RegionBegin->getParent();
1828 DAG.startBlock(CurrentMBB);
1829 // Get real RP for the region if it hasn't be calculated before. After the
1830 // initial schedule stage real RP will be collected after scheduling.
// NOTE(review): listing lines 1831-1833 are missing here; presumably the
// stage condition guarding this computeBlockPressure call — confirm upstream.
1834 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1835}
1836
// GCNSchedStage::finalizeGCNRegion() body (the signature line is absent from
// this listing). Records the possibly-updated region boundaries, flags high
// RP, and restores the mutations swapped in initGCNRegion() for IGLP regions.
1838 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1839 if (S.HasHighPressure)
1840 DAG.RegionsWithHighRP[RegionIdx] = true;
1841
1842 // Revert scheduling if we have dropped occupancy or there is some other
1843 // reason that the original schedule is better.
// NOTE(review): listing line 1844 is missing here; presumably the call the
// comment above refers to (e.g. checkScheduling) — confirm upstream.
1845
1846 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
// NOTE(review): listing line 1847 — the second half of this condition — is
// missing; confirm upstream.
1848 SavedMutations.swap(DAG.Mutations);
1849}
1850
// PreRARematStage region-finalization body (the signature lines at listing
// 1851-1852 are absent from this listing; presumably an override that records
// per-region revert state after scheduling).
1853 // When the goal is to increase occupancy, all regions must reach the target
1854 // occupancy for rematerializations to be possibly useful, otherwise we will
1855 // just hurt latency for no benefit. If minimum occupancy drops below the
1856 // target there is no point in trying to re-schedule further regions.
1857 if (!TargetOcc)
1858 return;
// Record enough state to revert this region's schedule if the stage's
// occupancy target ends up unmet.
1859 RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
1860 if (DAG.MinOccupancy < *TargetOcc) {
1861 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1862 << " cannot meet occupancy target, interrupting "
1863 "re-scheduling in all regions\n");
// Clearing the bitvector makes every subsequent initGCNRegion() skip.
1864 RescheduleRegions.reset();
1865 }
1866}
1867
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();

  // If the region is within both critical register limits, keep the new
  // schedule unconditionally.
  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  // Occupancy is capped by both the scheduler's target and what the workgroup
  // sizes allow.
  unsigned TargetOccupancy = std::min(
      S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
  unsigned WavesAfter = std::min(
      TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
  unsigned WavesBefore = std::min(
      TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the just
  // scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  // Lower the kernel-wide minimum occupancy if this region forced a drop.
  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPR on non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPR for both unified and non-unified register
  // file.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  // Mark regions that exceed any addressable register limit so later stages
  // treat them as high/excess register pressure regions.
  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
  }
}
1941
1942unsigned
1943GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1944 DenseMap<unsigned, unsigned> &ReadyCycles,
1945 const TargetSchedModel &SM) {
1946 unsigned ReadyCycle = CurrCycle;
1947 for (auto &D : SU.Preds) {
1948 if (D.isAssignedRegDep()) {
1949 MachineInstr *DefMI = D.getSUnit()->getInstr();
1950 unsigned Latency = SM.computeInstrLatency(DefMI);
1951 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1952 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1953 }
1954 }
1955 ReadyCycles[SU.NodeNum] = ReadyCycle;
1956 return ReadyCycle;
1957}
1958
#ifndef NDEBUG
  // Orders (instruction, ready-cycle) pairs by ascending issue cycle so the
  // modeled schedule can be printed in cycle order.
  bool operator()(std::pair<MachineInstr *, unsigned> A,
                  std::pair<MachineInstr *, unsigned> B) const {
    return A.second < B.second;
  }
};
1966
1967static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1968 EarlierIssuingCycle> &ReadyCycles) {
1969 if (ReadyCycles.empty())
1970 return;
1971 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1972 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1973 << " ##################\n# Cycle #\t\t\tInstruction "
1974 " "
1975 " \n";
1976 unsigned IPrev = 1;
1977 for (auto &I : ReadyCycles) {
1978 if (I.second > IPrev + 1)
1979 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1980 << " CYCLES DETECTED ******************************\n\n";
1981 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1982 IPrev = I.second;
1983 }
1984}
1985#endif
1986
1987ScheduleMetrics
1988GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1989#ifndef NDEBUG
1990 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1991 ReadyCyclesSorted;
1992#endif
1993 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1994 unsigned SumBubbles = 0;
1995 DenseMap<unsigned, unsigned> ReadyCycles;
1996 unsigned CurrCycle = 0;
1997 for (auto &SU : InputSchedule) {
1998 unsigned ReadyCycle =
1999 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
2000 SumBubbles += ReadyCycle - CurrCycle;
2001#ifndef NDEBUG
2002 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
2003#endif
2004 CurrCycle = ++ReadyCycle;
2005 }
2006#ifndef NDEBUG
2007 LLVM_DEBUG(
2008 printScheduleModel(ReadyCyclesSorted);
2009 dbgs() << "\n\t"
2010 << "Metric: "
2011 << (SumBubbles
2012 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2013 : 1)
2014 << "\n\n");
2015#endif
2016
2017 return ScheduleMetrics(CurrCycle, SumBubbles);
2018}
2019
#ifndef NDEBUG
  std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
      ReadyCyclesSorted;
#endif
  const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
  unsigned SumBubbles = 0;
  DenseMap<unsigned, unsigned> ReadyCycles;
  unsigned CurrCycle = 0;
  // Model issue cycles for every instruction in the DAG's current MI order,
  // skipping instructions that have no SUnit.
  for (auto &MI : DAG) {
    SUnit *SU = DAG.getSUnit(&MI);
    if (!SU)
      continue;
    unsigned ReadyCycle =
        computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
    // Cycles between the current cycle and the ready cycle are bubbles.
    SumBubbles += ReadyCycle - CurrCycle;
#ifndef NDEBUG
    ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
#endif
    // The next instruction issues at the cycle after this one's ready cycle.
    CurrCycle = ++ReadyCycle;
  }
#ifndef NDEBUG
  LLVM_DEBUG(
      printScheduleModel(ReadyCyclesSorted);
      dbgs() << "\n\t"
             << "Metric: "
             << (SumBubbles
                     ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
                     : 1)
             << "\n\n");
#endif

  return ScheduleMetrics(CurrCycle, SumBubbles);
}
2055
2056bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2057 if (WavesAfter < DAG.MinOccupancy)
2058 return true;
2059
2060 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2061 if (DAG.MFI.isDynamicVGPREnabled()) {
2062 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2063 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2064 PressureBefore.getVGPRNum(false));
2065 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2066 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2067 PressureAfter.getVGPRNum(false));
2068 if (BlocksAfter > BlocksBefore)
2069 return true;
2070 }
2071
2072 return false;
2073}
2074
    return false;

    return true;

  // Revert when the region's new schedule may introduce spilling.
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}
2087
  // If RP is not reduced in the unclustered reschedule stage, revert to the
  // old schedule.
  if ((WavesAfter <=
           PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
       mayCauseSpilling(WavesAfter)) ||
    LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
    return true;
  }

  // Do not attempt to relax schedule even more if we are already spilling.
    return false;

  // Compare the latency metric of the region before and after this stage: the
  // reschedule is only kept if the occupancy gain outweighs the metric loss.
  LLVM_DEBUG(
      dbgs()
      << "\n\t *** In shouldRevertScheduling ***\n"
      << " *********** BEFORE UnclusteredHighRPStage ***********\n");
  ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
  LLVM_DEBUG(
      dbgs()
      << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
  unsigned OldMetric = MBefore.getMetric();
  unsigned NewMetric = MAfter.getMetric();
  unsigned WavesBefore = std::min(
      S.getTargetOccupancy(),
      PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
  unsigned Profit =
      ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
       NewMetric) /
  LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
                    << MAfter << "Profit: " << Profit << "\n");
  // Revert unless the occupancy-weighted metric improved.
  return Profit < ScheduleMetrics::ScaleFactor;
}
2126
    return false;

    return true;

  // Revert when the region's new schedule may introduce spilling.
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}
2139
  // When trying to increase occupancy (TargetOcc is set) the stage manages
  // region reverts globally (all or none), so we always return false here.
  // Otherwise, only revert when the new schedule may introduce spilling.
  return !TargetOcc && mayCauseSpilling(WavesAfter);
}
2145
  // Revert only when the new schedule may introduce spilling.
  if (mayCauseSpilling(WavesAfter))
    return true;

  return false;
}
2152
                                                     unsigned WavesAfter) {
  // Revert only when the new schedule may introduce spilling.
  return mayCauseSpilling(WavesAfter);
}
2157
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
  // Spilling becomes likely once occupancy is pinned at the function's
  // minimum waves/EU while the region still exceeds a register-pressure
  // limit.
  if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
    LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
    return true;
  }

  return false;
}
2167
                                  ArrayRef<MachineInstr *> MIOrder) {
  // Restore the region's instructions to the order given in MIOrder, keeping
  // LiveIntervals consistent along the way.
  assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
                                           DAG.Regions[RegionIdx].second)) ==
             MIOrder.size() &&
         "instruction number mismatch");
  if (MIOrder.empty())
    return;

  LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');

  // Reconstruct MI sequence by moving instructions in desired order before
  // the current region's start.
  MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
  for (MachineInstr *MI : MIOrder) {
    // Either move the next MI in order before the end of the region or move the
    // region end past the MI if it is at the correct position.
    MachineBasicBlock::iterator MII = MI->getIterator();
    if (MII != RegionEnd) {
      // Will subsequent splice move MI up past a non-debug instruction?
      bool NonDebugReordered =
          !MI->isDebugInstr() &&
          skipDebugInstructionsForward(RegionEnd, MII) != MII;
      MBB->splice(RegionEnd, MBB, MI);
      // Only update LiveIntervals information if non-debug instructions are
      // reordered. Otherwise debug instructions could cause code generation to
      // change.
      if (NonDebugReordered)
        DAG.LIS->handleMove(*MI, true);
    } else {
      ++RegionEnd;
    }
    // Debug instructions need no operand/liveness fixup.
    if (MI->isDebugInstr()) {
      LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
      continue;
    }

    // Reset read-undef flags and update them later.
    for (MachineOperand &Op : MI->all_defs())
      Op.setIsUndef(false);
    RegisterOperands RegOpers;
    RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
    if (DAG.ShouldTrackLaneMasks) {
      // Adjust liveness and add missing dead+read-undef flags.
      SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
      RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
    } else {
      // Adjust for missing dead-def flags.
      RegOpers.detectDeadDefs(*MI, *DAG.LIS);
    }
    LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
  }

  // The region end doesn't change throughout scheduling since it itself is
  // outside the region (whether that is a MBB end or a terminator MI).
  assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
  DAG.Regions[RegionIdx].first = MIOrder.front();
}
2227
2228bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2229
2230 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2231 return false;
2232 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
2233}
2234
// Speculatively rewrite every candidate MFMA to its AGPR-dst form, recording
// (a) each rewritten instruction and its original opcode in \p RewriteCands,
// (b) registers needing a per-use-block copy in \p CopyForUse, and (c)
// defining instructions needing a copy in \p CopyForDef. Returns true if any
// instruction was rewritten. The opcode/register-class changes made here are
// temporary (so register pressure can be recomputed for the rewritten form)
// and are restored by getRewriteCost().
bool RewriteMFMAFormStage::initHeuristics(
    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  bool Changed = false;

  // Prepare for the heuristics
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isRewriteCandidate(&MI))
        continue;

      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
      assert(ReplacementOp != -1);

      // Remember the original opcode so the rewrite can be undone.
      RewriteCands.push_back({&MI, MI.getOpcode()});
      MI.setDesc(TII->get(ReplacementOp));

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2->isReg()) {
        SmallVector<SlotIndex, 8> Src2ReachingDefs;
        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);

        // For any definition of the src2 register which is non-MFMA, we
        // insert a copy.
        for (SlotIndex RDIdx : Src2ReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
          if (!TII->isMAI(*RD))
            CopyForDef.insert(RD);
        }
      }

      MachineOperand &Dst = MI.getOperand(0);
      SmallVector<MachineOperand *, 8> DstReachingUses;

      findReachingUses(&MI, DAG.LIS, DstReachingUses);

      for (MachineOperand *RUOp : DstReachingUses) {
        if (TII->isMAI(*RUOp->getParent()))
          continue;

        // For any user of the result of the MFMA which is not an MFMA, we
        // insert a copy. For a given register, we will only insert one copy
        // per user block.
        CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());

        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);

        for (SlotIndex RDIndex : DstUsesReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
          if (TII->isMAI(*RD))
            continue;

          // For any definition of the user of the MFMA which is not an MFMA,
          // we insert a copy. We do this to transform all the reaching defs
          // of this use to AGPR. By doing this, we can insert a copy from
          // AGPR to VGPR at the user rather than after the MFMA.
          CopyForDef.insert(RD);
        }
      }

      // Do the rewrite to allow for updated RP calculation.
      const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
      DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
      if (Src2->isReg()) {
        // Have to get src types separately since subregs may cause C and D
        // registers to be different types even though the actual operand is
        // the same size.
        const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
        const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
        DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
      }
      Changed = true;
    }
  }

  return Changed;
}
2315
// Estimate the profitability of the AGPR rewrite: the best (most negative)
// frequency-weighted spill-cost reduction across excess-ArchVGPR regions,
// plus the frequency-weighted cost of the copies that would be inserted.
// Returns a positive cost early if any region would spill more. Also restores
// the opcodes and register classes temporarily changed by initHeuristics().
int64_t RewriteMFMAFormStage::getRewriteCost(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;

  int64_t BestSpillCost = 0;
  int64_t Cost = 0;
  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();

  std::pair<unsigned, unsigned> MaxVectorRegs =
      ST.getMaxNumVectorRegs(MF.getFunction());
  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
  unsigned AGPRThreshold = MaxVectorRegs.second;
  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    if (!RegionsWithExcessArchVGPR[Region])
      continue;

    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    // For the cases we care about (i.e. ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to manageable
    // level. If we find any such region, then the rewrite is potentially
    // beneficial.
    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    uint64_t BlockFreq =
        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
            .getFrequency();

    // Express the region's frequency relative to the entry block as a ratio,
    // tracking which side is the numerator to stay in integer arithmetic.
    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
    uint64_t RelativeFreq = EntryFreq && BlockFreq
                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
                                                       : BlockFreq / EntryFreq)
                                : 1;

    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register.
    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;

    // Also account for the block frequency.
    if (RelativeFreqIsDenom)
      SpillCost /= (int64_t)RelativeFreq;
    else
      SpillCost *= (int64_t)RelativeFreq;

    // If we have increased spilling in any block, just bail.
    if (SpillCost > 0)
      return SpillCost;

    if (SpillCost < BestSpillCost)
      BestSpillCost = SpillCost;
  }

  // Set the cost to the largest decrease in spill cost in order to not double
  // count spill reductions.
  Cost = BestSpillCost;
  assert(Cost <= 0);

  unsigned CopyCost = 0;

  // For each CopyForDef, increase the cost by the register size while
  // accounting for block frequency.
  for (MachineInstr *DefMI : CopyForDef) {
    Register DefReg = DefMI->getOperand(0).getReg();
    uint64_t DefFreq =
        EntryFreq
            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
            : 1;

    const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
    CopyCost += RC->getCopyCost() * DefFreq;
  }

  // Account for CopyForUse copies in each block that the register is used.
  for (auto &[UseBlock, UseRegs] : CopyForUse) {
    uint64_t UseFreq =
        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;

    for (Register UseReg : UseRegs) {
      const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
      CopyCost += RC->getCopyCost() * UseFreq;
    }
  }

  // Reset the classes that were changed to AGPR for better RB analysis.
  // We must do rewriting after copy-insertion, as some defs of the register
  // may require VGPR. Additionally, if we bail out and don't perform the
  // rewrite then these need to be restored anyway.
  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    assert(TII->isMAI(*MI));
    const TargetRegisterClass *ADefRC =
        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
    const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
    MI->setDesc(TII->get(OriginalOpcode));

    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    assert(Src2);
    if (!Src2->isReg())
      continue;

    // Have to get src types separately since subregs may cause C and D
    // registers to be different types even though the actual operand is
    // the same size.
    const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
    const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
    DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
  }

  return Cost + CopyCost;
}
2434
// Perform the VGPR->AGPR MFMA rewrite decided by the heuristics: switch each
// candidate to its AGPR-dst opcode, insert COPYs at the reaching defs of src2
// and at the reaching uses of dst, patch region boundaries crossed by the new
// copies, reclassify the affected registers to AGPR, and finally rebuild
// LiveIntervals and per-region live-in/pressure info.
// NOTE(review): the early `return false` paths (physical-register operands)
// do not undo rewrites already applied in earlier iterations -- confirm
// callers treat a false return as fatal for the stage.
bool RewriteMFMAFormStage::rewrite(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
  // Map region-boundary instructions back to their region index so the
  // boundaries can be updated when copies are inserted at a region's edges.
  DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
  DenseMap<MachineInstr *, unsigned> LastMIToRegion;

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    RegionBoundaries Entry = DAG.Regions[Region];
    if (Entry.first == Entry.second)
      continue;

    FirstMIToRegion[&*Entry.first] = Region;
    if (Entry.second != Entry.first->getParent()->end())
      LastMIToRegion[&*Entry.second] = Region;
  }

  // Rewrite the MFMAs to AGPR, and insert any copies as needed.
  // The general assumption of the algorithm (and the previous cost calculation)
  // is that it is better to insert the copies in the MBB of the def of the src2
  // operands, and in the MBB of the user of the dest operands. This is based on
  // the assumption that the MFMAs are likely to appear in loop bodies, while
  // the src2 and dest operands are live-in / live-out of the loop. Due to this
  // design, the algorithm for finding copy insertion points is more
  // complicated.
  //
  // There are three main cases to handle: 1. the reaching defs of the src2
  // operands, 2. the reaching uses of the dst operands, and 3. the reaching
  // defs of the reaching uses of the dst operand.
  //
  // In the first case, we simply insert copies after each of the reaching
  // definitions. In the second case, we collect all the uses of a given dest
  // and organize them by MBB. Then, we insert 1 copy for each MBB before the
  // earliest use. Since the use may have multiple reaching defs, and since we
  // want to replace the register it is using with the result of the copy, we
  // must handle case 3. In the third case, we simply insert a copy after each
  // of the reaching defs to connect to the copy of the reaching uses of the dst
  // reg. This allows us to avoid inserting copies next to the MFMAs.
  //
  // While inserting the copies, we maintain a map of operands which will use
  // different regs (i.e. the result of the copies). For example, a case 1 src2
  // operand will use the register result of the copies after the reaching defs,
  // as opposed to the original register. Now that we have completed our copy
  // analysis and placement, we can bulk update the registers. We do this
  // separately as to avoid complicating the reachingDef and reachingUse
  // queries.
  //
  // While inserting the copies, we also maintain a list of registers which we
  // will want to reclassify as AGPR. After doing the copy insertion and the
  // register replacement, we can finally do the reclassification. This uses the
  // redef map, as the registers we are interested in reclassifying may be
  // replaced by the result of a copy. We must do this after the copy analysis
  // and placement as we must have an accurate redef map -- otherwise we may end
  // up creating illegal instructions.

  // The original registers of the MFMA that need to be reclassified as AGPR.
  DenseSet<Register> RewriteRegs;
  // The map of an original register in the MFMA to a new register (result of a
  // copy) that it should be replaced with.
  DenseMap<Register, Register> RedefMap;
  // The map of the original MFMA registers to the relevant MFMA operands.
  DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
  // The map of reaching defs for a given register -- to avoid duplicate copies.
  DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
  // The map of reaching uses for a given register by basic block -- to avoid
  // duplicate copies and to calculate per MBB insert pts.
  DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
      ReachingUseTracker;

  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
    if (ReplacementOp == -1)
      continue;
    MI->setDesc(TII->get(ReplacementOp));

    // Case 1: insert copies for the reaching defs of the Src2Reg.
    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    if (Src2->isReg()) {
      Register Src2Reg = Src2->getReg();
      if (!Src2Reg.isVirtual())
        return false;

      Register MappedReg = Src2->getReg();
      SmallVector<SlotIndex, 8> Src2ReachingDefs;
      findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
      SmallSetVector<MachineInstr *, 8> Src2DefsReplace;

      for (SlotIndex RDIndex : Src2ReachingDefs) {
        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
        if (TII->isMAI(*RD))
          continue;

        // If there is a non mai reaching def, then we need a copy.
        Src2DefsReplace.insert(RD);
      }

      if (!Src2DefsReplace.empty()) {
        DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
        if (RI != RedefMap.end()) {
          MappedReg = RI->second;
        } else {
          assert(!ReachingDefCopyMap.contains(Src2Reg));
          const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
          const TargetRegisterClass *VGPRRC =
              SRI->getEquivalentVGPRClass(Src2RC);

          // Track the mapping of the original register to the new register.
          MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
          RedefMap[Src2Reg] = MappedReg;
        }

        // If none exists, create a copy from this reaching def.
        // We may have inserted a copy already in an earlier iteration.
        for (MachineInstr *RD : Src2DefsReplace) {
          // Do not create redundant copies.
          if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
            MachineInstrBuilder VGPRCopy =
                BuildMI(*RD->getParent(), std::next(RD->getIterator()),
                        RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                    .addDef(MappedReg, {}, 0)
                    .addUse(Src2Reg, {}, 0);
            DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);

            // If this reaching def was the last MI in the region, update the
            // region boundaries.
            if (LastMIToRegion.contains(RD)) {
              unsigned UpdateRegion = LastMIToRegion[RD];
              DAG.Regions[UpdateRegion].second = VGPRCopy;
              LastMIToRegion.erase(RD);
            }
          }
        }
      }

      // Track the register for reclassification
      RewriteRegs.insert(Src2Reg);

      // Always insert the operand for replacement. If this corresponds with a
      // chain of tied-def we may not see the VGPR requirement until later.
      ReplaceMap[Src2Reg].insert(Src2);
    }

    // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
    // and after the reaching defs of the reaching uses of the dsts.

    MachineOperand *Dst = &MI->getOperand(0);
    Register DstReg = Dst->getReg();
    if (!DstReg.isVirtual())
      return false;

    Register MappedReg = DstReg;
    SmallVector<MachineOperand *, 8> DstReachingUses;

    SmallVector<MachineOperand *, 8> DstReachingUseCopies;
    SmallVector<MachineInstr *, 8> DstUseDefsReplace;

    findReachingUses(MI, DAG.LIS, DstReachingUses);

    for (MachineOperand *RUOp : DstReachingUses) {
      if (TII->isMAI(*RUOp->getParent()))
        continue;

      // If there is a non mai reaching use, then we need a copy.
      if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
        DstReachingUseCopies.push_back(RUOp);
      SmallVector<SlotIndex, 8> DstUsesReachingDefs;
      findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);

      for (SlotIndex RDIndex : DstUsesReachingDefs) {
        MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
        if (TII->isMAI(*RD))
          continue;

        // If there is a non mai reaching def of this reaching use, then we will
        // need a copy.
        if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
          DstUseDefsReplace.push_back(RD);
      }
    }

    if (!DstUseDefsReplace.empty()) {
      DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
      if (RI != RedefMap.end()) {
        MappedReg = RI->second;
      } else {
        assert(!ReachingDefCopyMap.contains(DstReg));
        const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
        const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);

        // Track the mapping of the original register to the new register.
        MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
        RedefMap[DstReg] = MappedReg;
      }

      // If none exists, create a copy from this reaching def.
      // We may have inserted a copy already in an earlier iteration.
      for (MachineInstr *RD : DstUseDefsReplace) {
        // Do not create redundant copies.
        if (ReachingDefCopyMap[DstReg].insert(RD).second) {
          MachineInstrBuilder VGPRCopy =
              BuildMI(*RD->getParent(), std::next(RD->getIterator()),
                      RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
                  .addDef(MappedReg, {}, 0)
                  .addUse(DstReg, {}, 0);
          DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);

          // If this reaching def was the last MI in the region, update the
          // region boundaries.
          LastMIToRegion.find(RD);
          if (LMI != LastMIToRegion.end()) {
            unsigned UpdateRegion = LMI->second;
            DAG.Regions[UpdateRegion].second = VGPRCopy;
            LastMIToRegion.erase(RD);
          }
        }
      }
    }

    DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
    for (MachineOperand *RU : DstReachingUseCopies) {
      MachineBasicBlock *RUBlock = RU->getParent()->getParent();
      // Just keep track of the reaching use of this register by block. After we
      // have scanned all the MFMAs we can find optimal insert pts.
      if (RUBlock != MI->getParent()) {
        ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
        continue;
      }

      // Special case, the use is in the same block as the MFMA. Insert the copy
      // just before the use.
      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
      MachineInstr *UseInst = RU->getParent();
      MachineInstrBuilder VGPRCopy =
          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
              .addDef(NewUseReg, {}, 0)
              .addUse(DstReg, {}, 0);
      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
      // Since we know this use has only one reaching def, we can replace the
      // use reg.
      RU->setReg(NewUseReg);
      // Track the copy source operand for replacement.
      DstRegSet.insert(&VGPRCopy->getOperand(1));
    }

    // Track the register for reclassification
    RewriteRegs.insert(DstReg);

    // Insert the dst operand for replacement. If this dst is in a chain of
    // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
    // all the corresponding operands need to be replaced.
    DstRegSet.insert(Dst);
  }

  // Handle the copies for dst uses.
  using RUBType =
      std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
  for (RUBType RUBlockEntry : ReachingUseTracker) {
    using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
    for (RUDType RUDst : RUBlockEntry.second) {
      MachineOperand *OpBegin = *RUDst.second.begin();
      SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());

      // Find the earliest use in this block.
      for (MachineOperand *User : RUDst.second) {
        SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
        if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
          InstPt = NewInstPt;
      }

      const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
      const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
      Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
      MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);

      MachineInstrBuilder VGPRCopy =
          BuildMI(*UseInst->getParent(), UseInst->getIterator(),
                  UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
              .addDef(NewUseReg, {}, 0)
              .addUse(RUDst.first, {}, 0);
      DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);

      // If this UseInst was the first MI in the region, update the region
      // boundaries.
      FirstMIToRegion.find(UseInst);
      if (FI != FirstMIToRegion.end()) {
        unsigned UpdateRegion = FI->second;
        DAG.Regions[UpdateRegion].first = VGPRCopy;
        FirstMIToRegion.erase(UseInst);
      }

      // Replace the operand for all users.
      for (MachineOperand *User : RUDst.second) {
        User->setReg(NewUseReg);
      }

      // Track the copy source operand for replacement.
      ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
    }
  }

  // We may have needed to insert copies after the reaching defs of the MFMAs.
  // Replace the original register with the result of the copy for all relevant
  // operands.
  for (std::pair<Register, Register> NewDef : RedefMap) {
    Register OldReg = NewDef.first;
    Register NewReg = NewDef.second;

    // Replace the register for any associated operand in the MFMA chain.
    for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
      ReplaceOp->setReg(NewReg);
  }

  // Finally, do the reclassification of the MFMA registers.
  for (Register RewriteReg : RewriteRegs) {
    Register RegToRewrite = RewriteReg;

    // Be sure to update the replacement register and not the original.
    DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
    if (RI != RedefMap.end())
      RegToRewrite = RI->second;

    const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
    const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);

    DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
  }

  // Bulk update the LIS.
  DAG.LIS->reanalyze(DAG.MF);
  // Liveins may have been modified for cross RC copies
  RegionPressureMap LiveInUpdater(&DAG, false);
  LiveInUpdater.buildLiveRegMap();

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
    DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);

  DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);

  return true;
}
2778
2779unsigned PreRARematStage::getStageTargetOccupancy() const {
2780 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2781}
2782
2783bool PreRARematStage::setObjective() {
2784 const Function &F = MF.getFunction();
2785
2786 // Set up "spilling targets" for all regions.
2787 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2788 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2789 bool HasVectorRegisterExcess = false;
2790 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2791 const GCNRegPressure &RP = DAG.Pressure[I];
2792 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2793 if (!Target.satisfied())
2794 TargetRegions.set(I);
2795 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2796 }
2797
2798 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2799 // In addition to register usage being above addressable limits, occupancy
2800 // below the minimum is considered like "spilling" as well.
2801 TargetOcc = std::nullopt;
2802 } else {
2803 // There is no spilling and room to improve occupancy; set up "increased
2804 // occupancy targets" for all regions.
2805 TargetOcc = DAG.MinOccupancy + 1;
2806 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2807 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2808 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2809 for (auto [I, Target] : enumerate(RPTargets)) {
2810 Target.setTarget(MaxSGPRs, MaxVGPRs);
2811 if (!Target.satisfied())
2812 TargetRegions.set(I);
2813 }
2814 }
2815
2816 return TargetRegions.any();
2817}
2818
2819bool PreRARematStage::collectRematRegs(
2820 const DenseMap<MachineInstr *, unsigned> &MIRegion) {
2821 // We need up-to-date live-out info. to query live-out register masks in
2822 // regions containing rematerializable instructions.
2823 DAG.RegionLiveOuts.buildLiveRegMap();
2824
2825 // Set of registers already marked for potential remterialization; used to
2826 // avoid rematerialization chains.
2827 SmallSet<Register, 4> MarkedRegs;
2828 auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
2829 return MO.isReg() && MarkedRegs.contains(MO.getReg());
2830 };
2831
2832 // Identify rematerializable instructions in the function.
2833 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2834 RegionBoundaries Bounds = DAG.Regions[I];
2835 for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
2836 // The instruction must be rematerializable.
2837 MachineInstr &DefMI = *MI;
2838 if (!isReMaterializable(DefMI))
2839 continue;
2840
2841 // We only support rematerializing virtual registers with one
2842 // definition.
2843 Register Reg = DefMI.getOperand(0).getReg();
2844 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
2845 continue;
2846
2847 // We only care to rematerialize the instruction if it has a single
2848 // non-debug user in a different region.
2849 // FIXME: Allow rematerializations with multiple uses. This should be
2850 // relatively easy to support using the current cost model.
2851 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
2852 if (!UseMI)
2853 continue;
2854 auto UseRegion = MIRegion.find(UseMI);
2855 if (UseRegion == MIRegion.end() || UseRegion->second == I)
2856 continue;
2857
2858 // Do not rematerialize an instruction if it uses or is used by an
2859 // instruction that we have designated for rematerialization.
2860 // FIXME: Allow for rematerialization chains: this requires 1. updating
2861 // remat points to account for uses that are rematerialized, and 2.
2862 // either rematerializing the candidates in careful ordering, or
2863 // deferring the MBB RP walk until the entire chain has been
2864 // rematerialized.
2865 const MachineOperand &UseMO = UseMI->getOperand(0);
2866 if (IsMarkedForRemat(UseMO) ||
2867 llvm::any_of(DefMI.operands(), IsMarkedForRemat))
2868 continue;
2869
2870 // Do not rematerialize an instruction it it uses registers that aren't
2871 // available at its use. This ensures that we are not extending any live
2872 // range while rematerializing.
2873 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
2874 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
2875 *DAG.TII))
2876 continue;
2877
2878 // Add the instruction to the rematerializable list.
2879 MarkedRegs.insert(Reg);
2880 RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
2881 }
2882 }
2883
2884 return !RematRegs.empty();
2885}
2886
2887PreRARematStage::RematReg::RematReg(
2888 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2889 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2890 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2891 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2892 DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
2893
2894 // Mark regions in which the rematerializable register is live.
2895 Register Reg = getReg();
2896 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2897 auto LiveInIt = DAG.LiveIns[I].find(Reg);
2898 if (LiveInIt != DAG.LiveIns[I].end())
2899 LiveIn.set(I);
2900 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I);
2901 if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end())
2902 LiveOut.set(I);
2903 }
2904 Live |= LiveIn;
2905 Live |= LiveOut;
2906 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg);
2907}
2908
2909bool PreRARematStage::RematReg::maybeBeneficial(
2910 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2911 Register Reg = getReg();
2912 for (unsigned I : TargetRegions.set_bits()) {
2913 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2914 return true;
2915 }
2916 return false;
2917}
2918
2919void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2920 MachineInstr *RematMI,
2921 GCNScheduleDAGMILive &DAG) const {
2922 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
2923 if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
2924 Bounds.first = RematMI;
2925 DAG.LIS->InsertMachineInstrInMaps(*RematMI);
2926 DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
2927}
2928
2931 assert(DAG.MLI && "MLI not defined in DAG");
2933 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2934
2935 const unsigned NumRegions = DAG.Regions.size();
2937 MaxFreq = 0;
2938 Regions.reserve(NumRegions);
2939 for (unsigned I = 0; I < NumRegions; ++I) {
2940 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2941 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2942 Regions.push_back(BlockFreq);
2943 if (BlockFreq && BlockFreq < MinFreq)
2944 MinFreq = BlockFreq;
2945 else if (BlockFreq > MaxFreq)
2946 MaxFreq = BlockFreq;
2947 }
2948 if (!MinFreq)
2949 return;
2950
2951 // Scale everything down if frequencies are high.
2952 if (MinFreq >= ScaleFactor * ScaleFactor) {
2953 for (uint64_t &Freq : Regions)
2954 Freq /= ScaleFactor;
2955 MinFreq /= ScaleFactor;
2956 MaxFreq /= ScaleFactor;
2957 }
2958}
2959
2960PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
2962 : Remat(Remat), FreqDiff(getFreqDiff(Freq)) {
2963 RPSave.inc(Remat->getReg(), LaneBitmask::getNone(), Remat->Mask, DAG.MRI);
2964}
2965
2966int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2967 // Get frequencies of defining and using regions. A rematerialization from the
2968 // least frequent region to the most frequent region will yield the greatest
2969 // latency penalty and therefore should get minimum score. Reciprocally, a
2970 // rematerialization in the other direction should get maximum score. Default
2971 // to values that will yield the worst possible score given known frequencies
2972 // in order to penalize rematerializations from or into regions whose
2973 // frequency is unknown.
2974 int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
2975 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2976 if (!UseOrMax)
2977 UseOrMax = Freq.MaxFreq;
2978 return DefOrMin - UseOrMax;
2979}
2980
2981void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2982 ArrayRef<GCNRPTarget> RPTargets,
2983 const FreqInfo &FreqInfo,
2984 bool ReduceSpill) {
2985 MaxFreq = 0;
2986 RegionImpact = 0;
2987 for (unsigned I : TargetRegions.set_bits()) {
2988 if (!Remat->Live[I])
2989 continue;
2990
2991 // The rematerialization must contribute positively in at least one
2992 // register class with usage above the RP target for this region to
2993 // contribute to the score.
2994 const GCNRPTarget &RegionTarget = RPTargets[I];
2995 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
2996 if (!NumRegsBenefit)
2997 continue;
2998
2999 bool UnusedLT = Remat->isUnusedLiveThrough(I);
3000
3001 // Regions in which RP is guaranteed to decrease have more weight.
3002 RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit;
3003
3004 if (ReduceSpill) {
3005 uint64_t Freq = FreqInfo.Regions[I];
3006 if (!UnusedLT) {
3007 // Apply a frequency penalty in regions in which we are not sure that RP
3008 // will decrease.
3009 Freq /= 2;
3010 }
3011 MaxFreq = std::max(MaxFreq, Freq);
3012 }
3013 }
3014}
3015
3016MachineInstr *PreRARematStage::ScoredRemat::rematerialize(
3017 BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets,
3018 GCNScheduleDAGMILive &DAG) const {
3019 const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
3020 MachineInstr &DefMI = *Remat->DefMI;
3021 Register Reg = DefMI.getOperand(0).getReg();
3023
3024 // Rematerialize the register in the region where it is used.
3025 MachineBasicBlock::iterator InsertPos = Remat->UseMI;
3026 TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
3027 MachineInstr *RematMI = &*std::prev(InsertPos);
3028 Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
3029 Remat->insertMI(Remat->UseRegion, RematMI, DAG);
3030
3031#ifdef EXPENSIVE_CHECKS
3032 // All uses are known to be available / live at the remat point. Thus,
3033 // the uses should already be live in to the using region.
3034 for (MachineOperand &MO : DefMI.operands()) {
3035 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
3036 continue;
3037
3038 Register UseReg = MO.getReg();
3039 if (!UseReg.isVirtual())
3040 continue;
3041
3042 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
3043 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
3044 if (LI.hasSubRanges() && MO.getSubReg())
3045 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
3046
3047 LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
3048 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
3049 // If this register has lanes not covered by the LiveIns, be sure they
3050 // do not map to any subrange. ref:
3051 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
3052 if (UncoveredLanes.any()) {
3053 assert(LI.hasSubRanges());
3054 for (LiveInterval::SubRange &SR : LI.subranges())
3055 assert((SR.LaneMask & UncoveredLanes).none());
3056 }
3057 }
3058#endif
3059
3060 // Remove the register from all regions where it is a live-in or live-out
3061 // and adjust RP targets. The save is guaranteed in regions in which the
3062 // register is live-through and unused but optimistic in all other regions
3063 // where the register is live.
3064 for (unsigned I : Remat->Live.set_bits()) {
3065 RPTargets[I].saveRP(RPSave);
3066 DAG.LiveIns[I].erase(Reg);
3067 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3068 if (!Remat->isUnusedLiveThrough(I))
3069 RecomputeRP.set(I);
3070 }
3071
3072 return RematMI;
3073}
3074
3075void PreRARematStage::commitRematerializations() const {
3076 REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
3077 for (const RollbackInfo &Rollback : Rollbacks)
3078 DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
3079}
3080
3081void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) {
3082 for (unsigned I : Regions.set_bits()) {
3083 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3084 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3085 TargetRegions.reset(I);
3086 }
3087 }
3088}
3089
3090bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3091 bool TooOptimistic = false;
3092 for (unsigned I : Regions.set_bits()) {
3093 GCNRPTarget &Target = RPTargets[I];
3094 Target.setRP(DAG.getRealRegPressure(I));
3095
3096 // Since we were optimistic in assessing RP decreases in these regions, we
3097 // may need to remark the target as a target region if RP didn't decrease
3098 // as expected.
3099 if (!TargetRegions[I] && !Target.satisfied()) {
3100 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3101 TooOptimistic = true;
3102 TargetRegions.set(I);
3103 }
3104 }
3105 return TooOptimistic;
3106}
3107
3108// Copied from MachineLICM
3109bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3111 return false;
3112
3113 for (const MachineOperand &MO : MI.all_uses()) {
3114 // We can't remat physreg uses, unless it is a constant or an ignorable
3115 // use (e.g. implicit exec use on VALU instructions)
3116 if (MO.getReg().isPhysical()) {
3117 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3118 continue;
3119 return false;
3120 }
3121 }
3122
3123 return true;
3124}
3125
3127 // We consider that reducing spilling is always beneficial so we never
3128 // rollback rematerializations or revert scheduling in such cases.
3129 if (!TargetOcc)
3130 return;
3131
3132 // When increasing occupancy, it is possible that re-scheduling is not able to
3133 // achieve the target occupancy in all regions, in which case re-scheduling in
3134 // all regions should be reverted.
3135 if (DAG.MinOccupancy >= *TargetOcc) {
3136 commitRematerializations();
3137 return;
3138 }
3139
3140 // It is possible that re-scheduling lowers occupancy over the one achieved
3141 // just through rematerializations, in which case we revert re-scheduling in
3142 // all regions but do not roll back rematerializations.
3143 const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
3144
3145 // When we both need to revert re-scheduling and rollback rematerializations,
3146 // restore rematerialized MIs' original state before reverting so that they
3147 // are treated as non-debug instructions by the revert logic.
3148 if (ShouldRollbackRemats) {
3149 for (const RollbackInfo &Rollback : Rollbacks) {
3150 const auto &[Remat, RematMI, RegMap] = Rollback;
3151 Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
3152 for (const auto &[MOIdx, Reg] : RegMap)
3153 Remat->DefMI->getOperand(MOIdx).setReg(Reg);
3154 }
3155 }
3156
3157 // Revert re-scheduling in all affected regions.
3158 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3159 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3160 << '\n');
3161 DAG.Pressure[RegionIdx] = MaxPressure;
3162 modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
3163 }
3164
3165 if (!ShouldRollbackRemats) {
3166 commitRematerializations();
3167 DAG.setTargetOccupancy(AchievedOcc);
3168 return;
3169 }
3170
3171 // Reset the target occupancy to what it was pre-rematerialization.
3172 DAG.setTargetOccupancy(*TargetOcc - 1);
3173
3174 // Finish rolling back rematerializations, then recompute pressure in all
3175 // affected regions.
3176 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3177 BitVector RecomputeRP(DAG.Regions.size());
3178 DenseSet<Register> RecomputeLI;
3179 for (const RollbackInfo &Rollback : Rollbacks) {
3180 const auto &[Remat, RematMI, RegMap] = Rollback;
3181
3182 // Switch back to using the original register and delete the
3183 // rematerialization.
3184 Register Reg = RematMI->getOperand(0).getReg();
3185 Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
3186 Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
3187 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3188 << "] Deleting rematerialization " << *RematMI);
3189 DAG.deleteMI(Remat->UseRegion, RematMI);
3190
3191 // Re-add the defined register as a live-in/live-out in all regions it used
3192 // to be one in.
3193 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3194 for (unsigned I : Remat->LiveIn.set_bits())
3195 DAG.LiveIns[I].insert(LiveReg);
3196 for (unsigned I : Remat->LiveOut.set_bits())
3197 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3198
3199 RecomputeRP |= Rollback.Remat->Live;
3200 // Regenerate intervals for all register operands of rematerialized MIs as
3201 // slot indices may have changed slightly from before re-scheduling.
3202 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3203 if (MO.isReg() && MO.getReg().isVirtual())
3204 RecomputeLI.insert(MO.getReg());
3205 }
3206 }
3207 for (Register Reg : RecomputeLI) {
3210 }
3211#ifdef EXPENSIVE_CHECKS
3212 // In particular, we want to check for coherent MI/slot order in regions in
3213 // which reverts and/or rollbacks may have happened.
3214 MF.verify();
3215#endif
3216 for (unsigned I : RecomputeRP.set_bits())
3217 DAG.Pressure[I] = DAG.getRealRegPressure(I);
3218
3220}
3221
3222void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3223 // It's not possible for the deleted instruction to be upper region boundary
3224 // since we don't delete region terminators.
3225 if (Regions[RegionIdx].first == MI)
3226 Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
3227 LIS->removeInterval(MI->getOperand(0).getReg());
3229 MI->eraseFromParent();
3230}
3231
3232void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3233 MinOccupancy = TargetOccupancy;
3234 if (MFI.getOccupancy() < TargetOccupancy)
3235 MFI.increaseOccupancy(MF, MinOccupancy);
3236 else
3237 MFI.limitOccupancy(MinOccupancy);
3238}
3239
3241 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3242 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
3243 return SII->isIGLPMutationOnly(MI->getOpcode());
3244 });
3245}
3246
3251
3253 HasIGLPInstrs = hasIGLPInstrs(this);
3254 if (HasIGLPInstrs) {
3255 SavedMutations.clear();
3256 SavedMutations.swap(Mutations);
3258 }
3259
3261}
3262
3264 if (HasIGLPInstrs)
3265 SavedMutations.swap(Mutations);
3266
3268}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true))
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
BitVector & reset()
Definition BitVector.h:411
BitVector & set()
Definition BitVector.h:370
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
size - Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx in block MBB to MIOrder.
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::optional< bool > GCNTrackersOverride
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition MCInstrInfo.h:90
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getBlockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
bool verify(Pass *p=nullptr, const char *Banner=nullptr, raw_ostream *OS=nullptr, bool AbortOnError=true) const
Run the current MachineFunction through the machine code verifier, useful for debugger use.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
GCNRPTracker::LiveRegSet & getLiveRegsForRegionIdx(unsigned RegionIdx)
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void increaseOccupancy(const MachineFunction &MF, unsigned Limit)
void limitOccupancy(const MachineFunction &MF)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool hasReservedResource
Uses a reserved resource.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
ScheduleHazardRecognizer * HazardRec
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
LLVM_ABI std::pair< unsigned, unsigned > getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle)
Compute the next cycle at which the given processor resource can be scheduled.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
const TargetRegisterInfo * TRI
Target processor register info.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool isReMaterializable(const MachineInstr &MI) const
Return true if the instruction would be materializable at a point in the containing function where al...
virtual bool isIgnorableUse(const MachineOperand &MO) const
Given MO is a PhysReg use return if it can be ignored for the purpose of instruction rematerializatio...
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)