LLVM 23.0.0git
GCNSchedStrategy.cpp
Go to the documentation of this file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNHazardRecognizer.h"
29#include "GCNRegPressure.h"
32#include "llvm/ADT/BitVector.h"
33#include "llvm/ADT/STLExtras.h"
40#include "llvm/MC/LaneBitmask.h"
41#include "llvm/MC/MCSchedule.h"
44
45#define DEBUG_TYPE "machine-scheduler"
46
47using namespace llvm;
48
50 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
51 cl::desc("Disable unclustered high register pressure "
52 "reduction scheduling stage."),
53 cl::init(false));
54
56 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
57 cl::desc("Disable clustered low occupancy "
58 "rescheduling for ILP scheduling stage."),
59 cl::init(false));
60
62 "amdgpu-schedule-metric-bias", cl::Hidden,
64 "Sets the bias which adds weight to occupancy vs latency. Set it to "
65 "100 to chase the occupancy only."),
66 cl::init(10));
67
68static cl::opt<bool>
69 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
70 cl::desc("Relax occupancy targets for kernels which are memory "
71 "bound (amdgpu-membound-threshold), or "
72 "Wave Limited (amdgpu-limit-wave-threshold)."),
73 cl::init(false));
74
76 "amdgpu-use-amdgpu-trackers", cl::Hidden,
77 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
78 cl::init(false));
79
81 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
83 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
84 cl::init(256));
85
86#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
87#define DUMP_MAX_REG_PRESSURE
89 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
90 cl::desc("Print a list of live registers along with their def/uses at the "
91 "point of maximum register pressure before scheduling."),
92 cl::init(false));
93
95 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
96 cl::desc("Print a list of live registers along with their def/uses at the "
97 "point of maximum register pressure after scheduling."),
98 cl::init(false));
99#endif
100
102 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
103 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
104
105const unsigned ScheduleMetrics::ScaleFactor = 100;
106
113
116
117 MF = &DAG->MF;
118
119 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
120
122 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
124 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
125
127 // Set the initial TargetOccupnacy to the maximum occupancy that we can
128 // achieve for this function. This effectively sets a lower bound on the
129 // 'Critical' register limits in the scheduler.
130 // Allow for lower occupancy targets if kernel is wave limited or memory
131 // bound, and using the relaxed occupancy feature.
135 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
136
137 if (!KnownExcessRP) {
138 VGPRCriticalLimit = std::min(
139 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
141 } else {
142 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
143 // returns a reasonably small number for targets with lots of VGPRs, such
144 // as GFX10 and GFX11.
145 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
146 "VGPRCriticalLimit calculation method.\n");
147 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
148 unsigned Granule =
149 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
150 unsigned Addressable =
151 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
152 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
153 VGPRBudget = std::max(VGPRBudget, Granule);
154 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
155 }
156
157 // Subtract error margin and bias from register limits and avoid overflow.
162 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
163 << ", VGPRExcessLimit = " << VGPRExcessLimit
164 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
165 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
166}
167
168/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
169/// current register pressure.
170///
171/// This works for the common case, but it has a few exceptions that have been
172/// observed through trial and error:
173/// - Explicit physical register operands
174/// - Subregister definitions
175///
176/// In both of those cases, PressureDiff doesn't represent the actual pressure,
177/// and querying LiveIntervals through the RegPressureTracker is needed to get
178/// an accurate value.
179///
180/// We should eventually only use PressureDiff for maximum performance, but this
181/// already allows 80% of SUs to take the fast path without changing scheduling
182/// at all. Further changes would either change scheduling, or require a lot
183/// more logic to recover an accurate pressure estimate from the PressureDiffs.
184static bool canUsePressureDiffs(const SUnit &SU) {
185 if (!SU.isInstr())
186 return false;
187
188 // Cannot use pressure diffs for subregister defs or with physregs, it's
189 // imprecise in both cases.
190 for (const auto &Op : SU.getInstr()->operands()) {
191 if (!Op.isReg() || Op.isImplicit())
192 continue;
193 if (Op.getReg().isPhysical() ||
194 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
195 return false;
196 }
197 return true;
198}
199
201 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
202 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
204 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
205 // getDownwardPressure() and getUpwardPressure() make temporary changes to
206 // the tracker, so we need to pass those function a non-const copy.
207 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
208 if (!useGCNTrackers()) {
209 AtTop
210 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
211 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
212
213 return;
214 }
215
216 // GCNTrackers
217 Pressure.resize(4, 0);
218 MachineInstr *MI = SU->getInstr();
219 GCNRegPressure NewPressure;
220 if (AtTop) {
221 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
222 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
223 } else {
224 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
225 TempUpwardTracker.recede(*MI);
226 NewPressure = TempUpwardTracker.getPressure();
227 }
228 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
229 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
230 NewPressure.getArchVGPRNum();
231 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
232}
233
235 SUnit *SU) const {
// NOTE(review): the opening of this member's signature (scrape lines 233-235)
// is missing from this extract, so its name is unknown here. From the body it
// takes a SchedBoundary (`Zone`) plus the SUnit below and returns an unsigned
// stall-cycle count — confirm the exact signature against the full source.
//
// Combines two stall sources for \p SU at the current top-of-zone cycle:
// (a) unbuffered processor-resource availability from the machine model, and
// (b) GCN hazard-recognizer wait states (post-RA configuration only).
236 // Only implemented for top-down scheduling currently.
237 if (!Zone.isTop() || !SU)
238 return 0;
239
240 MachineInstr *MI = SU->getInstr();
241 unsigned CurrCycle = Zone.getCurrCycle();
242 unsigned Stall = 0;
243
244 // Query SchedModel for resource stalls (unbuffered resources).
245 if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
246 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
247 for (const MCWriteProcResEntry &PE :
248 make_range(SchedModel->getWriteProcResBegin(SC),
249 SchedModel->getWriteProcResEnd(SC))) {
250 unsigned NextAvail =
251 Zone.getNextResourceCycle(SC, PE.ProcResourceIdx, PE.ReleaseAtCycle,
252 PE.AcquireAtCycle)
253 .first;
// Keep the worst (largest) stall over all written processor resources.
254 if (NextAvail > CurrCycle)
255 Stall = std::max(Stall, NextAvail - CurrCycle);
256 }
257 }
258
259 // Query HazardRecognizer for sequence-dependent hazard penalties.
260 // AMDGPU currently installs GCNHazardRecognizer for MI scheduling only in
261 // the post-RA configuration without vreg liveness.
262 if (!DAG->hasVRegLiveness() && Zone.HazardRec &&
263 Zone.HazardRec->isEnabled()) {
264 auto *HR = static_cast<GCNHazardRecognizer *>(Zone.HazardRec);
265 Stall = std::max(Stall, HR->getHazardWaitStates(MI));
266 }
267
268 return Stall;
269 }
270
272 bool AtTop,
273 const RegPressureTracker &RPTracker,
274 const SIRegisterInfo *SRI,
275 unsigned SGPRPressure,
276 unsigned VGPRPressure, bool IsBottomUp) {
277 Cand.SU = SU;
278 Cand.AtTop = AtTop;
279
280 if (!DAG->isTrackingPressure())
281 return;
282
283 Pressure.clear();
284 MaxPressure.clear();
285
286 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
287 // possible over querying the RegPressureTracker.
288 //
289 // RegPressureTracker will make a lot of LIS queries which are very
290 // expensive, it is considered a slow function in this context.
291 //
292 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
293 // trivial lookup into an array. It is pretty much free.
294 //
295 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
296 // PressureDiffs.
297 if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
298 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
300 } else {
301 // Reserve 4 slots.
302 Pressure.resize(4, 0);
303 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
304 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
305
306 for (const auto &Diff : DAG->getPressureDiff(SU)) {
307 if (!Diff.isValid())
308 continue;
309 // PressureDiffs is always bottom-up so if we're working top-down we need
310 // to invert its sign.
311 Pressure[Diff.getPSet()] +=
312 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
313 }
314
315#ifdef EXPENSIVE_CHECKS
316 std::vector<unsigned> CheckPressure, CheckMaxPressure;
317 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
319 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
320 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
321 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
322 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
323 errs() << "Register Pressure is inaccurate when calculated through "
324 "PressureDiff\n"
325 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
326 << ", expected "
327 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
328 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
329 << ", expected "
330 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
331 report_fatal_error("inaccurate register pressure calculation");
332 }
333#endif
334 }
335
336 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
337 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
338
339 // If two instructions increase the pressure of different register sets
340 // by the same amount, the generic scheduler will prefer to schedule the
341 // instruction that increases the set with the least amount of registers,
342 // which in our case would be SGPRs. This is rarely what we want, so
343 // when we report excess/critical register pressure, we do it either
344 // only for VGPRs or only for SGPRs.
345
346 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
347 const unsigned MaxVGPRPressureInc = 16;
348 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
349 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
350
351 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
352 // to increase the likelihood we don't go over the limits. We should improve
353 // the analysis to look through dependencies to find the path with the least
354 // register pressure.
355
356 // We only need to update the RPDelta for instructions that increase register
357 // pressure. Instructions that decrease or keep reg pressure the same will be
358 // marked as RegExcess in tryCandidate() when they are compared with
359 // instructions that increase the register pressure.
360 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
361 HasHighPressure = true;
362 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
363 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
364 }
365
366 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
367 HasHighPressure = true;
368 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
369 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
370 }
371
372 // Register pressure is considered 'CRITICAL' if it is approaching a value
373 // that would reduce the wave occupancy for the execution unit. When
374 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
375 // has the same cost, so we don't need to prefer one over the other.
376
377 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
378 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
379
380 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
381 HasHighPressure = true;
382 if (SGPRDelta > VGPRDelta) {
383 Cand.RPDelta.CriticalMax =
384 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
385 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
386 } else {
387 Cand.RPDelta.CriticalMax =
388 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
389 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
390 }
391 }
392}
393
395 const TargetSchedModel *SchedModel) {
396 bool HasBufferedModel =
397 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
398 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
399 return Combined <= PendingQueueLimit && HasBufferedModel;
400}
401
403 const TargetSchedModel *SchedModel) {
404 // pickOnlyChoice() releases pending instructions and checks for new hazards.
405 SUnit *OnlyChoice = Zone.pickOnlyChoice();
406 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
407 return OnlyChoice;
408
409 return nullptr;
410}
411
413 const SchedCandidate &Preferred) {
414 LLVM_DEBUG({
415 dbgs() << "Prefer:\t\t";
416 DAG->dumpNode(*Preferred.SU);
417
418 if (Current.SU) {
419 dbgs() << "Not:\t";
420 DAG->dumpNode(*Current.SU);
421 }
422
423 dbgs() << "Reason:\t\t";
424 traceCandidate(Preferred);
425 });
426}
427
428// This function is mostly cut and pasted from
429// GenericScheduler::pickNodeFromQueue()
431 const CandPolicy &ZonePolicy,
432 const RegPressureTracker &RPTracker,
433 SchedCandidate &Cand, bool &IsPending,
434 bool IsBottomUp) {
435 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
437 unsigned SGPRPressure = 0;
438 unsigned VGPRPressure = 0;
439 IsPending = false;
440 if (DAG->isTrackingPressure()) {
441 if (!useGCNTrackers()) {
442 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
443 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
444 } else {
445 GCNRPTracker *T = IsBottomUp
446 ? static_cast<GCNRPTracker *>(&UpwardTracker)
447 : static_cast<GCNRPTracker *>(&DownwardTracker);
448 SGPRPressure = T->getPressure().getSGPRNum();
449 VGPRPressure = T->getPressure().getArchVGPRNum();
450 }
451 }
452 LLVM_DEBUG(dbgs() << "Available Q:\n");
453 ReadyQueue &AQ = Zone.Available;
454 for (SUnit *SU : AQ) {
455
456 SchedCandidate TryCand(ZonePolicy);
457 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
458 VGPRPressure, IsBottomUp);
459 // Pass SchedBoundary only when comparing nodes from the same boundary.
460 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
461 tryCandidate(Cand, TryCand, ZoneArg);
462 if (TryCand.Reason != NoCand) {
463 // Initialize resource delta if needed in case future heuristics query it.
464 if (TryCand.ResDelta == SchedResourceDelta())
465 TryCand.initResourceDelta(Zone.DAG, SchedModel);
466 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
467 Cand.setBest(TryCand);
468 } else {
469 printCandidateDecision(TryCand, Cand);
470 }
471 }
472
473 if (!shouldCheckPending(Zone, SchedModel))
474 return;
475
476 LLVM_DEBUG(dbgs() << "Pending Q:\n");
477 ReadyQueue &PQ = Zone.Pending;
478 for (SUnit *SU : PQ) {
479
480 SchedCandidate TryCand(ZonePolicy);
481 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
482 VGPRPressure, IsBottomUp);
483 // Pass SchedBoundary only when comparing nodes from the same boundary.
484 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
485 tryPendingCandidate(Cand, TryCand, ZoneArg);
486 if (TryCand.Reason != NoCand) {
487 // Initialize resource delta if needed in case future heuristics query it.
488 if (TryCand.ResDelta == SchedResourceDelta())
489 TryCand.initResourceDelta(Zone.DAG, SchedModel);
490 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
491 IsPending = true;
492 Cand.setBest(TryCand);
493 } else {
494 printCandidateDecision(TryCand, Cand);
495 }
496 }
497}
498
499// This function is mostly cut and pasted from
500// GenericScheduler::pickNodeBidirectional()
502 bool &PickedPending) {
503 // Schedule as far as possible in the direction of no choice. This is most
504 // efficient, but also provides the best heuristics for CriticalPSets.
505 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
506 IsTopNode = false;
507 return SU;
508 }
509 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
510 IsTopNode = true;
511 return SU;
512 }
513 // Set the bottom-up policy based on the state of the current bottom zone
514 // and the instructions outside the zone, including the top zone.
515 CandPolicy BotPolicy;
516 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
517 // Set the top-down policy based on the state of the current top zone and
518 // the instructions outside the zone, including the bottom zone.
519 CandPolicy TopPolicy;
520 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
521
522 bool BotPending = false;
523 // See if BotCand is still valid (because we previously scheduled from Top).
524 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
525 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
526 BotCand.Policy != BotPolicy) {
527 BotCand.reset(CandPolicy());
528 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
529 BotPending,
530 /*IsBottomUp=*/true);
531 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
532 } else {
534#ifndef NDEBUG
535 if (VerifyScheduling) {
536 SchedCandidate TCand;
537 TCand.reset(CandPolicy());
538 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
539 BotPending,
540 /*IsBottomUp=*/true);
541 assert(TCand.SU == BotCand.SU &&
542 "Last pick result should correspond to re-picking right now");
543 }
544#endif
545 }
546
547 bool TopPending = false;
548 // Check if the top Q has a better candidate.
549 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
550 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
551 TopCand.Policy != TopPolicy) {
552 TopCand.reset(CandPolicy());
553 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
554 TopPending,
555 /*IsBottomUp=*/false);
556 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
557 } else {
559#ifndef NDEBUG
560 if (VerifyScheduling) {
561 SchedCandidate TCand;
562 TCand.reset(CandPolicy());
563 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
564 TopPending,
565 /*IsBottomUp=*/false);
566 assert(TCand.SU == TopCand.SU &&
567 "Last pick result should correspond to re-picking right now");
568 }
569#endif
570 }
571
572 // Pick best from BotCand and TopCand.
573 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
574 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
575 SchedCandidate Cand = BotPending ? TopCand : BotCand;
576 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
577 PickedPending = BotPending && TopPending;
578
579 TryCand.Reason = NoCand;
580 if (BotPending || TopPending) {
581 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
582 } else {
583 tryCandidate(Cand, TryCand, nullptr);
584 }
585
586 if (TryCand.Reason != NoCand) {
587 Cand.setBest(TryCand);
588 }
589
590 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
591
592 IsTopNode = Cand.AtTop;
593 return Cand.SU;
594}
595
596// This function is mostly cut and pasted from
597// GenericScheduler::pickNode()
599 if (DAG->top() == DAG->bottom()) {
600 assert(Top.Available.empty() && Top.Pending.empty() &&
601 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
602 return nullptr;
603 }
604 bool PickedPending;
605 SUnit *SU;
606 do {
607 PickedPending = false;
608 if (RegionPolicy.OnlyTopDown) {
610 if (!SU) {
611 CandPolicy NoPolicy;
612 TopCand.reset(NoPolicy);
613 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
614 PickedPending,
615 /*IsBottomUp=*/false);
616 assert(TopCand.Reason != NoCand && "failed to find a candidate");
617 SU = TopCand.SU;
618 }
619 IsTopNode = true;
620 } else if (RegionPolicy.OnlyBottomUp) {
622 if (!SU) {
623 CandPolicy NoPolicy;
624 BotCand.reset(NoPolicy);
625 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
626 PickedPending,
627 /*IsBottomUp=*/true);
628 assert(BotCand.Reason != NoCand && "failed to find a candidate");
629 SU = BotCand.SU;
630 }
631 IsTopNode = false;
632 } else {
633 SU = pickNodeBidirectional(IsTopNode, PickedPending);
634 }
635 } while (SU->isScheduled);
636
637 if (PickedPending) {
638 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
639 SchedBoundary &Zone = IsTopNode ? Top : Bot;
640 unsigned CurrentCycle = Zone.getCurrCycle();
641 if (ReadyCycle > CurrentCycle)
642 Zone.bumpCycle(ReadyCycle);
643
644 // FIXME: checkHazard() doesn't give information about which cycle the
645 // hazard will resolve so just keep bumping the cycle by 1. This could be
646 // made more efficient if checkHazard() returned more details.
647 while (Zone.checkHazard(SU))
648 Zone.bumpCycle(Zone.getCurrCycle() + 1);
649
650 Zone.releasePending();
651 }
652
653 if (SU->isTopReady())
654 Top.removeReady(SU);
655 if (SU->isBottomReady())
656 Bot.removeReady(SU);
657
658 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
659 << *SU->getInstr());
660 return SU;
661}
662
663void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
664 if (useGCNTrackers()) {
665 MachineInstr *MI = SU->getInstr();
666 IsTopNode ? (void)DownwardTracker.advance(MI, false)
667 : UpwardTracker.recede(*MI);
668 }
669
670 return GenericScheduler::schedNode(SU, IsTopNode);
671}
672
677
680 if (!CurrentStage)
681 CurrentStage = SchedStages.begin();
682 else
683 CurrentStage++;
684
685 return CurrentStage != SchedStages.end();
686}
687
690 return std::next(CurrentStage) != SchedStages.end();
691}
692
694 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
695 return *std::next(CurrentStage);
696}
697
699 SchedCandidate &TryCand,
700 SchedBoundary *Zone) const {
701 // Initialize the candidate if needed.
702 if (!Cand.isValid()) {
703 TryCand.Reason = NodeOrder;
704 return true;
705 }
706
707 // Bias PhysReg Defs and copies to their uses and defined respectively.
708 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
709 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
710 return TryCand.Reason != NoCand;
711
712 // Avoid exceeding the target's limit.
713 if (DAG->isTrackingPressure() &&
714 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
715 RegExcess, TRI, DAG->MF))
716 return TryCand.Reason != NoCand;
717
718 // Avoid increasing the max critical pressure in the scheduled region.
719 if (DAG->isTrackingPressure() &&
721 TryCand, Cand, RegCritical, TRI, DAG->MF))
722 return TryCand.Reason != NoCand;
723
724 bool SameBoundary = Zone != nullptr;
725 if (SameBoundary) {
728 TryCand, Cand, ResourceReduce))
729 return TryCand.Reason != NoCand;
731 Cand.ResDelta.DemandedResources, TryCand, Cand,
733 return TryCand.Reason != NoCand;
734 }
735
736 return false;
737}
738
751
756
758 SchedCandidate &TryCand,
759 SchedBoundary *Zone) const {
760 // Initialize the candidate if needed.
761 if (!Cand.isValid()) {
762 TryCand.Reason = NodeOrder;
763 return true;
764 }
765
766 // Avoid spilling by exceeding the register limit.
767 if (DAG->isTrackingPressure() &&
768 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
769 RegExcess, TRI, DAG->MF))
770 return TryCand.Reason != NoCand;
771
772 // Bias PhysReg Defs and copies to their uses and defined respectively.
773 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
774 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
775 return TryCand.Reason != NoCand;
776
777 bool SameBoundary = Zone != nullptr;
778 if (SameBoundary) {
779 // Prioritize instructions that read unbuffered resources by stall cycles.
780 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
781 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
782 return TryCand.Reason != NoCand;
783
784 // Avoid critical resource consumption and balance the schedule.
787 TryCand, Cand, ResourceReduce))
788 return TryCand.Reason != NoCand;
790 Cand.ResDelta.DemandedResources, TryCand, Cand,
792 return TryCand.Reason != NoCand;
793
794 // Unconditionally try to reduce latency.
795 if (tryLatency(TryCand, Cand, *Zone))
796 return TryCand.Reason != NoCand;
797
798 // Weak edges are for clustering and other constraints.
799 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
800 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
801 return TryCand.Reason != NoCand;
802 }
803
804 // Keep clustered nodes together to encourage downstream peephole
805 // optimizations which may reduce resource requirements.
806 //
807 // This is a best effort to set things up for a post-RA pass. Optimizations
808 // like generating loads of multiple registers should ideally be done within
809 // the scheduler pass by combining the loads during DAG postprocessing.
810 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
811 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
812 bool CandIsClusterSucc =
813 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
814 bool TryCandIsClusterSucc =
815 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
816 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
817 Cluster))
818 return TryCand.Reason != NoCand;
819
820 // Avoid increasing the max critical pressure in the scheduled region.
821 if (DAG->isTrackingPressure() &&
823 TryCand, Cand, RegCritical, TRI, DAG->MF))
824 return TryCand.Reason != NoCand;
825
826 // Avoid increasing the max pressure of the entire region.
827 if (DAG->isTrackingPressure() &&
828 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
829 Cand, RegMax, TRI, DAG->MF))
830 return TryCand.Reason != NoCand;
831
832 if (SameBoundary) {
833 // Fall through to original instruction order.
834 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
835 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
836 TryCand.Reason = NodeOrder;
837 return true;
838 }
839 }
840 return false;
841}
842
848
849/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
850/// much as possible. This is achieved by:
851// 1. Prioritize clustered operations before stall latency heuristic.
852// 2. Prioritize long-latency-load before stall latency heuristic.
853///
854/// \param Cand provides the policy and current best candidate.
855/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
856/// \param Zone describes the scheduled zone that we are extending, or nullptr
857/// if Cand is from a different zone than TryCand.
858/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
860 SchedCandidate &TryCand,
861 SchedBoundary *Zone) const {
862 // Initialize the candidate if needed.
863 if (!Cand.isValid()) {
864 TryCand.Reason = NodeOrder;
865 return true;
866 }
867
868 // Bias PhysReg Defs and copies to their uses and defined respectively.
869 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
870 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
871 return TryCand.Reason != NoCand;
872
873 if (DAG->isTrackingPressure()) {
874 // Avoid exceeding the target's limit.
875 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
876 RegExcess, TRI, DAG->MF))
877 return TryCand.Reason != NoCand;
878
879 // Avoid increasing the max critical pressure in the scheduled region.
881 TryCand, Cand, RegCritical, TRI, DAG->MF))
882 return TryCand.Reason != NoCand;
883 }
884
885 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
886 // get more benefit from clausing these memory instructions.
887 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
888 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
889 bool CandIsClusterSucc =
890 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
891 bool TryCandIsClusterSucc =
892 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
893 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
894 Cluster))
895 return TryCand.Reason != NoCand;
896
897 // We only compare a subset of features when comparing nodes between
898 // Top and Bottom boundary. Some properties are simply incomparable, in many
899 // other instances we should only override the other boundary if something
900 // is a clear good pick on one boundary. Skip heuristics that are more
901 // "tie-breaking" in nature.
902 bool SameBoundary = Zone != nullptr;
903 if (SameBoundary) {
904 // For loops that are acyclic path limited, aggressively schedule for
905 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
906 // heuristics to take precedence.
907 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
908 tryLatency(TryCand, Cand, *Zone))
909 return TryCand.Reason != NoCand;
910
911 // MaxMemoryClause-specific: Prioritize long latency memory load
912 // instructions in top-bottom order to hide more latency. The mayLoad check
913 // is used to exclude store-like instructions, which we do not want to
914 // scheduler them too early.
915 bool TryMayLoad =
916 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
917 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
918
919 if (TryMayLoad || CandMayLoad) {
920 bool TryLongLatency =
921 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
922 bool CandLongLatency =
923 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
924
925 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
926 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
927 Cand, Stall))
928 return TryCand.Reason != NoCand;
929 }
930 // Prioritize instructions that read unbuffered resources by stall cycles.
931 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
932 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
933 return TryCand.Reason != NoCand;
934 }
935
936 if (SameBoundary) {
937 // Weak edges are for clustering and other constraints.
938 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
939 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
940 return TryCand.Reason != NoCand;
941 }
942
943 // Avoid increasing the max pressure of the entire region.
944 if (DAG->isTrackingPressure() &&
945 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
946 Cand, RegMax, TRI, DAG->MF))
947 return TryCand.Reason != NoCand;
948
949 if (SameBoundary) {
950 // Avoid critical resource consumption and balance the schedule.
953 TryCand, Cand, ResourceReduce))
954 return TryCand.Reason != NoCand;
956 Cand.ResDelta.DemandedResources, TryCand, Cand,
958 return TryCand.Reason != NoCand;
959
960 // Avoid serializing long latency dependence chains.
961 // For acyclic path limited loops, latency was already checked above.
962 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
963 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
964 return TryCand.Reason != NoCand;
965
966 // Fall through to original instruction order.
967 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
968 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
969 TryCand.Reason = NodeOrder;
970 return true;
971 }
972 }
973
974 return false;
975}
976
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
      RegionLiveOuts(this, /*IsLiveOut=*/true) {

  // We want regions with a single MI to be scheduled so that we can reason
  // about them correctly during scheduling stages that move MIs between regions
  // (e.g., rematerialization).
  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  // RelaxedOcc lets the minimum occupancy target drop below the starting
  // occupancy, down to what the function's attributes still allow.
  if (RelaxedOcc) {
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}
996
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  // Factory: instantiate the concrete GCNSchedStage subclass matching the
  // requested stage ID.
  switch (SchedStageID) {
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
    return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
    return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
                                                              *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}
1019
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  // Record the boundaries of the current region for later stage-based
  // scheduling.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}
1025
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  // An empty region has no instructions; its pressure is exactly the
  // pressure of its live-in set.
  if (Regions[RegionIdx].first == Regions[RegionIdx].second)
    return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
  // Walk the tracker across the whole region and return the maximum
  // pressure observed.
  RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
                    &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}
1035
                                          MachineBasicBlock::iterator RegionEnd) {
  assert(RegionBegin != RegionEnd && "Region must not be empty");
  // Return the last non-debug instruction in [RegionBegin, RegionEnd).
  return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
}
1041
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has the only successor then live-ins of that successor are
  // live-outs of the current block. We can reuse calculated live set if the
  // successor will be sent to scheduling past current block.

  // However, due to the bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Workaround that by sticking to one-to-one relationship
  // i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      // Only cache live-outs for a successor that is scheduled after this
      // block (layout order by slot index).
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;
  // CurRegion is now the last region whose instructions belong to MBB.

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    // Reuse the block live-ins cached by the single predecessor (see above).
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    // Otherwise start tracking at the region's first instruction using the
    // precomputed per-region live-in map.
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  // Walk the block top-down, recording each region's live-ins and max
  // pressure; stop once the region at RegionIdx has been processed.
  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      if (CurRegion-- == RegionIdx)
        break;
      auto &Rgn = Regions[CurRegion];
      NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
    }
    RPTracker.advanceBeforeNext();
    RPTracker.advanceToNext();
  }

  // Advance to the end of the block and cache its live-outs as the single
  // successor's live-ins so they can be reused later.
  if (OnlySucc) {
    if (I != MBB->end()) {
      RPTracker.advanceBeforeNext();
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}
1116
GCNScheduleDAGMILive::getRegionLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionFirstMIs;
  RegionFirstMIs.reserve(Regions.size());
  // Collect the first instruction of each region, in reverse region order.
  for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
    RegionFirstMIs.push_back(

  // Live registers immediately before each region's first MI are precisely
  // that region's live-ins.
  return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
1128
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionLastMIs;
  RegionLastMIs.reserve(Regions.size());
  // Collect the last non-debug instruction of each non-empty region, in
  // reverse region order.
  for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
    // Skip empty regions.
    if (RegionBegin == RegionEnd)
      continue;
    RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
  }
  // Live registers immediately after each region's last MI are that region's
  // live-outs.
  return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
1142
  IdxToInstruction.clear();

  // Query the DAG for per-region live-outs or live-ins depending on which
  // side this map tracks.
  RegionLiveRegMap =
      IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
  // Map each non-empty region index to the instruction that keys its entry
  // in RegionLiveRegMap (last MI for live-outs, first MI for live-ins).
  for (unsigned I = 0; I < DAG->Regions.size(); I++) {
    auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
    // Skip empty regions.
    if (RegionBegin == RegionEnd)
      continue;
    MachineInstr *RegionKey =
        IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
    IdxToInstruction[I] = RegionKey;
  }
}
1158
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  // Size all per-region bookkeeping to the number of recorded regions and
  // clear the per-region flags before the stages run.
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}
1174
void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  // Precompute live-in (and, for GCN trackers, live-out) maps once before
  // any stage runs.
  if (!Regions.empty()) {
    BBLiveInMap = getRegionLiveInMap();
    if (S.useGCNTrackers())
      RegionLiveOuts.buildLiveRegMap();
  }

#ifdef DUMP_MAX_REG_PRESSURE
    LIS->dump();
  }
#endif

  // Run every enabled stage over every recorded region.
  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      // Seed the strategy's RP trackers with this region's live-ins
      // (downward) and live-outs (upward).
      if (S.useGCNTrackers()) {
        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
        GCNRPTracker::LiveRegSet *RegionLiveIns =
            &LiveIns[Stage->getRegionIdx()];

        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
            ->reset(MRI, *RegionLiveIns);
        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
            ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
                             Stage->getRegionIdx()));
      }

      Stage->finalizeGCNRegion();
      Stage->advanceRegion();
      exitRegion();
    }

    Stage->finalizeGCNSchedStage();
  }

#ifdef DUMP_MAX_REG_PRESSURE
    LIS->dump();
  }
#endif
}
1238
#ifndef NDEBUG
  // Pretty-print the scheduling stage name for debug output.
  switch (StageID) {
    OS << "Max Occupancy Initial Schedule";
    break;
    OS << "Instruction Rewriting Reschedule";
    break;
    OS << "Unclustered High Register Pressure Reschedule";
    break;
    OS << "Clustered Low Occupancy Reschedule";
    break;
    OS << "Pre-RA Rematerialize";
    break;
    OS << "Max ILP Initial Schedule";
    break;
    OS << "Max memory clause Initial Schedule";
    break;
  }

  return OS;
}
#endif
1268
1272
  // A stage cannot run without liveness information.
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}
1280
1281void RewriteMFMAFormStage::findReachingDefs(
1282 MachineOperand &UseMO, LiveIntervals *LIS,
1283 SmallVectorImpl<SlotIndex> &DefIdxs) {
1284 MachineInstr *UseMI = UseMO.getParent();
1285 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1286 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1287
1288 // If the def is not a PHI, then it must be the only reaching def.
1289 if (!VNI->isPHIDef()) {
1290 DefIdxs.push_back(VNI->def);
1291 return;
1292 }
1293
1294 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1295 SmallVector<MachineBasicBlock *, 8> Worklist;
1296
1297 // Mark the predecessor blocks for traversal
1298 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1299 Worklist.push_back(PredMBB);
1300 Visited.insert(PredMBB);
1301 }
1302
1303 while (!Worklist.empty()) {
1304 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1305
1306 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1307 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1308
1309 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1310
1311 // If there is a def in this block, then add it to the list. This is the
1312 // reaching def of this path.
1313 if (!VNI->isPHIDef()) {
1314 DefIdxs.push_back(VNI->def);
1315 continue;
1316 }
1317
1318 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1319 if (Visited.insert(PredMBB).second)
1320 Worklist.push_back(PredMBB);
1321 }
1322 }
1323}
1324
void RewriteMFMAFormStage::findReachingUses(
    SmallVectorImpl<MachineOperand *> &ReachingUses) {
  SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
  // Scan every non-debug use of DefMI's defined register and keep the uses
  // whose reaching-def set contains DefMI itself.
  for (MachineOperand &UseMO :
       DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
    SmallVector<SlotIndex, 8> ReachingDefIndexes;
    findReachingDefs(UseMO, LIS, ReachingDefIndexes);

    // If we find a use that contains this DefMI in its reachingDefs, then it is
    // a reaching use.
    if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
          return SlotIndex::isSameInstr(RDIdx, DefIdx);
        }))
      ReachingUses.push_back(&UseMO);
  }
}
1342
  // We only need to run this pass if the architecture supports AGPRs.
  // Additionally, we don't use AGPRs at occupancy levels above 1 so there
  // is no need for this pass in that case, either.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
    return false;

  // Flag every region whose arch-VGPR pressure exceeds the addressable
  // arch-VGPR budget; only those make the rewrite worth considering.
  RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
  RegionsWithExcessArchVGPR.reset();
  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
      RegionsWithExcessArchVGPR[Region] = true;
  }

  if (RegionsWithExcessArchVGPR.none())
    return false;

  TII = ST.getInstrInfo();
  SRI = ST.getRegisterInfo();

  std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;

  if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
    return false;

  // Estimate the net benefit of rewriting; positive cost means the rewrite
  // would hurt.
  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);

  // If we haven't found the beneficial conditions, prefer the VGPR form which
  // may result in less cross RC copies.
  if (Cost > 0)
    return false;

  return rewrite(RewriteCands);
}
1381
    return false;

    return false;

  // Nothing to do when no region is under register pressure.
  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase occupancy target in the region.
  TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
                            ? InitialOccupancy + 1
                            : InitialOccupancy;
  // No region has been scheduled by this stage yet.
  IsAnyRegionScheduled = false;
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << TempTargetOccupancy << ".\n");

  return true;
}
1414
    return false;

    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has not
  // been dropped. All regions will have already been scheduled with the ideal
  // occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  // Re-run scheduling (with clustering) at the lowest occupancy recorded so
  // far.
  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}
1433
/// Allows to easily filter for this stage's debug output.
#define REMAT_PREFIX "[PreRARemat] "
/// Emits \p X to debug output, prefixed with REMAT_PREFIX.
#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1437
1438#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1439Printable PreRARematStage::ScoredRemat::print() const {
1440 return Printable([&](raw_ostream &OS) {
1441 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1442 });
1443}
1444#endif
1445
  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions inbetween the defs and region we sinked the def to. Will need to be
  // fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
    return false;

  // Maps all MIs (except lone terminators, which are not part of any region) to
  // their parent region. Non-lone terminators are considered part of the region
  // they delimitate.
  DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());

  // Before performing any IR modification record the parent region of each MI
  // and the parent MBB of each region.
  const unsigned NumRegions = DAG.Regions.size();
  for (unsigned I = 0; I < NumRegions; ++I) {
    RegionBoundaries Region = DAG.Regions[I];
    for (auto MI = Region.first; MI != Region.second; ++MI)
      MIRegion.insert({&*MI, I});
    MachineBasicBlock *ParentMBB = Region.first->getParent();
    if (Region.second != ParentMBB->end())
      MIRegion.insert({&*Region.second, I});
  }

#ifndef NDEBUG
  // Debug helpers to print the current RP targets and a candidate's span of
  // affected regions.
  auto PrintTargetRegions = [&]() -> void {
    if (TargetRegions.none()) {
      dbgs() << REMAT_PREFIX << "No target regions\n";
      return;
    }
    dbgs() << REMAT_PREFIX << "Target regions:\n";
    for (unsigned I : TargetRegions.set_bits())
      dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
  };
  auto PrintCandidate = [&](const ScoredRemat &Cand) -> Printable {
    return Printable([&, Cand](raw_ostream &OS) {
      // Concatenate all region numbers in which the register is unused and
      // live-through.
      const RematReg &Remat = *Cand.Remat;
      bool HasLiveThroughRegion = false;
      OS << '[' << Remat.DefRegion << " -";
      for (unsigned I = 0; I < NumRegions; ++I) {
        if (!Cand.UnpredictableRPSave[I]) {
          if (HasLiveThroughRegion) {
            OS << ',';
          } else {
            OS << "- ";
            HasLiveThroughRegion = true;
          }
          OS << I;
        }
      }
      if (HasLiveThroughRegion)
        OS << " -";
      OS << "-> " << Remat.UseRegion << "] ";
      Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
                         /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
    });
  };
#endif

  // Set an objective for the stage based on current RP in each region.
  REMAT_DEBUG({
    dbgs() << "Analyzing ";
    MF.getFunction().printAsOperand(dbgs(), false);
    dbgs() << ": ";
  });
  if (!setObjective()) {
    LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
                      << MFI.getMaxWavesPerEU() << '\n');
    return false;
  }
  LLVM_DEBUG({
    if (TargetOcc) {
      dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
    } else {
      dbgs() << "reduce spilling (minimum target occupancy is "
             << MFI.getMinWavesPerEU() << ")\n";
    }
    PrintTargetRegions();
  });

  // Collect all rematerializable registers in the function, then create a
  // corresponding scored rematerialization candidate for each one.
  if (!collectRematRegs(MIRegion)) {
    REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
    return false;
  }
  // Score every rematerializable register; only candidates with a non-null
  // score participate in the rounds below.
  const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
  SmallVector<ScoredRemat, 8> Candidates(RematRegs.size());
  SmallVector<unsigned> CandidateOrder;
  for (auto [I, Remat] : enumerate(RematRegs)) {
    ScoredRemat &Candidate = Candidates[I];
    Candidate.init(&Remat, FreqInfo, DAG);
    Candidate.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
    if (!Candidate.hasNullScore())
      CandidateOrder.push_back(I);
  }

  REMAT_DEBUG({
    dbgs() << "Rematerializable registers:\n";
    for (const ScoredRemat &Cand : Candidates)
      dbgs() << REMAT_PREFIX << " " << PrintCandidate(Cand) << '\n';
    dbgs() << REMAT_PREFIX << "Region frequencies\n";
    for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
      dbgs() << REMAT_PREFIX << " [" << I << "] ";
      if (Freq)
        dbgs() << Freq;
      else
        dbgs() << "unknown ";
      dbgs() << " | " << *DAG.Regions[I].first;
    }
  });

  // Rematerialize registers in successive rounds until all RP targets are
  // satisifed or until we run out of rematerialization candidates.
  BitVector RecomputeRP(DAG.Regions.size());
  for (;;) {
    RecomputeRP.reset();

    // Sort candidates in increasing score order.
    sort(CandidateOrder, [&](unsigned LHSIndex, unsigned RHSIndex) {
      return Candidates[LHSIndex] < Candidates[RHSIndex];
    });

    REMAT_DEBUG({
      dbgs() << "==== NEW REMAT ROUND ====\n"
             << REMAT_PREFIX
             << "Candidates with non-null score, in rematerialization order:\n";
      for (const ScoredRemat &Cand : reverse(Candidates)) {
        dbgs() << REMAT_PREFIX << " " << Cand.print() << " | "
               << PrintCandidate(Cand) << '\n';
      }
      PrintTargetRegions();
    });

    // Rematerialize registers in decreasing score order until we estimate
    // that all RP targets are satisfied or until rematerialization candidates
    // are no longer useful to decrease RP.
    while (!CandidateOrder.empty()) {
      const ScoredRemat &Cand = Candidates[CandidateOrder.back()];
      // When previous rematerializations in this round have already satisfied
      // RP targets in all regions this rematerialization can impact, we have a
      // good indication that our scores have diverged significantly from
      // reality, in which case we interrupt this round and re-score. This also
      // ensures that every rematerialization we perform is possibly impactful
      // in at least one target region.
      if (!Cand.maybeBeneficial(TargetRegions, RPTargets)) {
        REMAT_DEBUG(dbgs() << "Interrupt round on stale score for "
                           << Cand.print() << " | " << *Cand.Remat->DefMI);
        break;
      }
      CandidateOrder.pop_back();
      RematReg &Remat = *Cand.Remat;

      // Remove the register from all regions where it is a live-in or live-out
      // and rematerialize it.
      REMAT_DEBUG(dbgs() << "** REMAT " << PrintCandidate(Cand) << '\n');
      removeFromLiveMaps(Remat.getReg(), Cand.LiveIn, Cand.LiveOut);
      MachineInstr *RematMI = Cand.rematerialize(DAG);

      // Every rematerialization we do here is likely to move the instruction
      // into a higher frequency region, increasing the total sum latency of the
      // instruction itself. This is acceptable if we are eliminating a spill in
      // the process, but when the goal is increasing occupancy we get nothing
      // out of rematerialization if occupancy is not increased in the end; in
      // such cases we want to roll back the rematerialization.
      if (TargetOcc) {
        RollbackInfo &Rollback =
            Rollbacks.emplace_back(&Remat, Cand.LiveIn, Cand.LiveOut);
        Rollback.RematMI = RematMI;
        // Make the original MI a debug value so that it does not influence
        // scheduling and replace all read registers with a sentinel register to
        // prevent operands to appear in use-lists of other MIs during LIS
        // updates. Store mappings between operand indices and original
        // registers for potential rollback.
        Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE));
        for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
          if (MO.isReg() && MO.readsReg()) {
            Rollback.RegMap.insert({Idx, MO.getReg()});
            MO.setReg(Register());
          }
        }
      } else {
        // Just delete the original instruction if it cannot be rolled back.
        DAG.deleteMI(Remat.DefRegion, Remat.DefMI);
      }

      // Adjust RP targets. The save is guaranteed in regions in which the
      // register is live-through and unused but optimistic in all other regions
      // where the register is live.
      updateRPTargets(Cand.Live, Cand.RPSave);
      RecomputeRP |= Cand.UnpredictableRPSave;
      RescheduleRegions |= Cand.Live;
      if (!TargetRegions.any()) {
        REMAT_DEBUG(dbgs() << "All targets cleared, verifying...\n");
        break;
      }
    }

    if (!updateAndVerifyRPTargets(RecomputeRP) && !TargetRegions.any()) {
      REMAT_DEBUG(dbgs() << "Objectives achieved!\n");
      break;
    }

    // Update the score of remaining candidates and filter out those that have
    // become useless from the vector. Candidates never become useful after
    // having been useless for a round, so we can freely drop them without
    // losing any future rematerialization opportunity.
    unsigned NumUsefulCandidates = 0;
    for (unsigned CandIdx : CandidateOrder) {
      ScoredRemat &Candidate = Candidates[CandIdx];
      Candidate.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
      if (!Candidate.hasNullScore())
        CandidateOrder[NumUsefulCandidates++] = CandIdx;
    }
    if (NumUsefulCandidates == 0) {
      REMAT_DEBUG(dbgs() << "Stop on exhausted rematerialization candidates\n");
      break;
    }
    CandidateOrder.truncate(NumUsefulCandidates);
  }

  if (RescheduleRegions.none())
    return false;

  // Commit all pressure changes to the DAG and compute minimum achieved
  // occupancy in impacted regions.
  REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
  unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
  for (unsigned I : RescheduleRegions.set_bits()) {
    DAG.Pressure[I] = RPTargets[I].getCurrentRP();
    REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
                       << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
                       << " (" << RPTargets[I] << ")\n");
  }
  // The minimum occupancy over all regions bounds the whole function.
  AchievedOcc = MFI.getMaxWavesPerEU();
  for (const GCNRegPressure &RP : DAG.Pressure) {
    AchievedOcc =
        std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
  }

  REMAT_DEBUG({
    dbgs() << "Retrying function scheduling with new min. occupancy of "
           << AchievedOcc << " from rematerializing (original was "
           << DAG.MinOccupancy;
    if (TargetOcc)
      dbgs() << ", target was " << *TargetOcc;
    dbgs() << ")\n";
  });

  DAG.setTargetOccupancy(getStageTargetOccupancy());
  return true;
}
1702
  // Flush per-block scheduling state in the DAG before the stage ends.
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
1707
  // Restore the original DAG mutations and drop the temporary register
  // pressure biases applied for this stage.
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    assert(IsAnyRegionScheduled);
                      << " stage successfully increased occupancy to "
                      << DAG.MinOccupancy << '\n');
  } else if (!IsAnyRegionScheduled) {
    assert(DAG.MinOccupancy == InitialOccupancy);
                      << ": No regions scheduled, min occupancy stays at "
                      << DAG.MinOccupancy << ", MFI occupancy stays at "
                      << MFI.getOccupancy() << ".\n");
  }

}
1726
  // Skip empty scheduling region.
  if (DAG.begin() == DAG.end())
    return false;

  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip regions with 1 schedulable instruction.
  if (DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
    const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      // Remember regions that contain IGLP mutation instructions; they need
      // mutation handling when (re)scheduled.
      if (SII->isIGLPMutationOnly(I.getOpcode()))
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  // Swap in IGLP-aware mutations for regions holding IGLP instructions.
  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
  }

  return true;
}
1792
  // Only reschedule regions that have excess register pressure (i.e. spilling)
  // or had minimum occupancy at the beginning of the stage (as long as
  // rescheduling of previous regions did not make occupancy drop back down to
  // the initial minimum).
  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
  // If no region has been scheduled yet, the DAG has not yet been updated with
  // the occupancy target. So retrieve it from the temporary.
  unsigned CurrentTargetOccupancy =
      IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
  // Skip regions without excess RP whose occupancy cannot benefit from
  // rescheduling at the current target.
  if (!DAG.RegionsWithExcessRP[RegionIdx] &&
      (CurrentTargetOccupancy <= InitialOccupancy ||
       DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
           InitialOccupancy))
    return false;

  bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
  // If this is the first region scheduled during this stage, make the target
  // occupancy changes in the DAG and MFI.
  if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
    IsAnyRegionScheduled = true;
    if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
      DAG.setTargetOccupancy(TempTargetOccupancy);
  }
  return IsSchedulingThisRegion;
}
1819
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

}
1831
  // Only re-schedule regions impacted by rematerialization, and only while
  // no stage-wide revert has been requested.
  return !RevertAllRegions && RescheduleRegions[RegionIdx] &&
}
1836
  // Finish the previous block (if any) before switching to the new one.
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage real RP will be collected after scheduling.
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
1850
  // Record the (possibly changed) region boundaries back into the DAG.
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.

  // Restore the mutations swapped out for IGLP handling in initGCNRegion.
  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
    SavedMutations.swap(DAG.Mutations);
}
1864
  // When the goal is to increase occupancy, all regions must reach the target
  // occupancy for rematerializations to be possibly useful, otherwise we will
  // just hurt latency for no benefit. If minimum occupancy drops below the
  // target there is no point in trying to re-schedule further regions.
  if (!TargetOcc)
    return;
  // Record enough state to restore this region's pre-stage schedule on
  // rollback.
  RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
  if (DAG.MinOccupancy < *TargetOcc) {
    REMAT_DEBUG(dbgs() << "Region " << RegionIdx
                       << " cannot meet occupancy target, interrupting "
                          "re-scheduling in all regions\n");
    RevertAllRegions = true;
  }
}
1881
1883 // Check the results of scheduling.
1884 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1885
1886 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1887 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1888
1889 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1890
1891 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1892 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1893 DAG.Pressure[RegionIdx] = PressureAfter;
1894
1895 // Early out if we have achieved the occupancy target.
1896 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1897 return;
1898 }
1899
1900 unsigned TargetOccupancy = std::min(
1901 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1902 unsigned WavesAfter = std::min(
1903 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1904 unsigned WavesBefore = std::min(
1905 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1906 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1907 << ", after " << WavesAfter << ".\n");
1908
1909 // We may not be able to keep the current target occupancy because of the just
1910 // scheduled region. We might still be able to revert scheduling if the
1911 // occupancy before was higher, or if the current schedule has register
1912 // pressure higher than the excess limits which could lead to more spilling.
1913 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1914
1915 // Allow memory bound functions to drop to 4 waves if not limited by an
1916 // attribute.
1917 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1918 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1919 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1920 << MFI.getMinAllowedOccupancy() << " waves\n");
1921 NewOccupancy = WavesAfter;
1922 }
1923
1924 if (NewOccupancy < DAG.MinOccupancy) {
1925 DAG.MinOccupancy = NewOccupancy;
1926 MFI.limitOccupancy(DAG.MinOccupancy);
1927 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1928 << DAG.MinOccupancy << ".\n");
1929 }
1930 // The maximum number of arch VGPR on non-unified register file, or the
1931 // maximum VGPR + AGPR in the unified register file case.
1932 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1933 // The maximum number of arch VGPR for both unified and non-unified register
1934 // file.
1935 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1936 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1937
1938 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1939 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1940 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1941 PressureAfter.getSGPRNum() > MaxSGPRs) {
1942 DAG.RegionsWithHighRP[RegionIdx] = true;
1943 DAG.RegionsWithExcessRP[RegionIdx] = true;
1944 }
1945
1946 // Revert if this region's schedule would cause a drop in occupancy or
1947 // spilling.
1948 if (shouldRevertScheduling(WavesAfter)) {
1950 std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
1951 } else {
1952 DAG.Pressure[RegionIdx] = PressureAfter;
1953 }
1954}
1955
1956unsigned
1957GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1958 DenseMap<unsigned, unsigned> &ReadyCycles,
1959 const TargetSchedModel &SM) {
1960 unsigned ReadyCycle = CurrCycle;
1961 for (auto &D : SU.Preds) {
1962 if (D.isAssignedRegDep()) {
1963 MachineInstr *DefMI = D.getSUnit()->getInstr();
1964 unsigned Latency = SM.computeInstrLatency(DefMI);
1965 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1966 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1967 }
1968 }
1969 ReadyCycles[SU.NodeNum] = ReadyCycle;
1970 return ReadyCycle;
1971}
1972
1973#ifndef NDEBUG
1975 bool operator()(std::pair<MachineInstr *, unsigned> A,
1976 std::pair<MachineInstr *, unsigned> B) const {
1977 return A.second < B.second;
1978 }
1979};
1980
// Debug-only dump of a schedule's modelled issue cycles for one basic block.
// Prints each instruction with its ready cycle (sorted by cycle via
// EarlierIssuingCycle) and flags any gap of more than one cycle between
// consecutive instructions as a pipeline "bubble".
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                               EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  // All entries belong to the same MBB; take its number from the first one.
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction "
            "             "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    // A jump of more than one cycle between consecutive entries means the
    // machine would have stalled for (I.second - IPrev) cycles.
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
1999#endif
2000
2001ScheduleMetrics
2002GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
2003#ifndef NDEBUG
2004 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2005 ReadyCyclesSorted;
2006#endif
2007 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2008 unsigned SumBubbles = 0;
2009 DenseMap<unsigned, unsigned> ReadyCycles;
2010 unsigned CurrCycle = 0;
2011 for (auto &SU : InputSchedule) {
2012 unsigned ReadyCycle =
2013 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
2014 SumBubbles += ReadyCycle - CurrCycle;
2015#ifndef NDEBUG
2016 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
2017#endif
2018 CurrCycle = ++ReadyCycle;
2019 }
2020#ifndef NDEBUG
2021 LLVM_DEBUG(
2022 printScheduleModel(ReadyCyclesSorted);
2023 dbgs() << "\n\t"
2024 << "Metric: "
2025 << (SumBubbles
2026 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2027 : 1)
2028 << "\n\n");
2029#endif
2030
2031 return ScheduleMetrics(CurrCycle, SumBubbles);
2032}
2033
2036#ifndef NDEBUG
2037 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
2038 ReadyCyclesSorted;
2039#endif
2040 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
2041 unsigned SumBubbles = 0;
2042 DenseMap<unsigned, unsigned> ReadyCycles;
2043 unsigned CurrCycle = 0;
2044 for (auto &MI : DAG) {
2045 SUnit *SU = DAG.getSUnit(&MI);
2046 if (!SU)
2047 continue;
2048 unsigned ReadyCycle =
2049 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
2050 SumBubbles += ReadyCycle - CurrCycle;
2051#ifndef NDEBUG
2052 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
2053#endif
2054 CurrCycle = ++ReadyCycle;
2055 }
2056#ifndef NDEBUG
2057 LLVM_DEBUG(
2058 printScheduleModel(ReadyCyclesSorted);
2059 dbgs() << "\n\t"
2060 << "Metric: "
2061 << (SumBubbles
2062 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2063 : 1)
2064 << "\n\n");
2065#endif
2066
2067 return ScheduleMetrics(CurrCycle, SumBubbles);
2068}
2069
2070bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2071 if (WavesAfter < DAG.MinOccupancy)
2072 return true;
2073
2074 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2075 if (DAG.MFI.isDynamicVGPREnabled()) {
2076 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2077 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2078 PressureBefore.getVGPRNum(false));
2079 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2080 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2081 PressureAfter.getVGPRNum(false));
2082 if (BlocksAfter > BlocksBefore)
2083 return true;
2084 }
2085
2086 return false;
2087}
2088
2091 return false;
2092
2094 return true;
2095
2096 if (mayCauseSpilling(WavesAfter))
2097 return true;
2098
2099 return false;
2100}
2101
2103 // If RP is not reduced in the unclustered reschedule stage, revert to the
2104 // old schedule.
2105 if ((WavesAfter <=
2106 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
2107 mayCauseSpilling(WavesAfter)) ||
2109 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2110 return true;
2111 }
2112
2113 // Do not attempt to relax schedule even more if we are already spilling.
2115 return false;
2116
2117 LLVM_DEBUG(
2118 dbgs()
2119 << "\n\t *** In shouldRevertScheduling ***\n"
2120 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2121 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
2122 LLVM_DEBUG(
2123 dbgs()
2124 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2126 unsigned OldMetric = MBefore.getMetric();
2127 unsigned NewMetric = MAfter.getMetric();
2128 unsigned WavesBefore = std::min(
2129 S.getTargetOccupancy(),
2130 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
2131 unsigned Profit =
2132 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2134 NewMetric) /
2136 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2137 << MAfter << "Profit: " << Profit << "\n");
2138 return Profit < ScheduleMetrics::ScaleFactor;
2139}
2140
2143 return false;
2144
2146 return true;
2147
2148 if (mayCauseSpilling(WavesAfter))
2149 return true;
2150
2151 return false;
2152}
2153
2155 // When trying to increase occupancy (TargetOcc == true) the stage manages
2156 // region reverts globally (all or none), so we always return false here.
2157 return !TargetOcc && mayCauseSpilling(WavesAfter);
2158}
2159
2161 if (mayCauseSpilling(WavesAfter))
2162 return true;
2163
2164 return false;
2165}
2166
2168 unsigned WavesAfter) {
2169 return mayCauseSpilling(WavesAfter);
2170}
2171
2172bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2173 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2175 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2176 return true;
2177 }
2178
2179 return false;
2180}
2181
2183 ArrayRef<MachineInstr *> MIOrder) {
2184 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2185 DAG.Regions[RegionIdx].second)) ==
2186 MIOrder.size() &&
2187 "instruction number mismatch");
2188 if (MIOrder.empty())
2189 return;
2190
2191 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2192
2193 // Reconstruct MI sequence by moving instructions in desired order before
2194 // the current region's start.
2195 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2196 MachineBasicBlock *MBB = MIOrder.front()->getParent();
2197 for (MachineInstr *MI : MIOrder) {
2198 // Either move the next MI in order before the end of the region or move the
2199 // region end past the MI if it is at the correct position.
2200 MachineBasicBlock::iterator MII = MI->getIterator();
2201 if (MII != RegionEnd) {
2202 // Will subsequent splice move MI up past a non-debug instruction?
2203 bool NonDebugReordered =
2204 !MI->isDebugInstr() &&
2205 skipDebugInstructionsForward(RegionEnd, MII) != MII;
2206 MBB->splice(RegionEnd, MBB, MI);
2207 // Only update LiveIntervals information if non-debug instructions are
2208 // reordered. Otherwise debug instructions could cause code generation to
2209 // change.
2210 if (NonDebugReordered)
2211 DAG.LIS->handleMove(*MI, true);
2212 } else {
2213 ++RegionEnd;
2214 }
2215 if (MI->isDebugInstr()) {
2216 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2217 continue;
2218 }
2219
2220 // Reset read-undef flags and update them later.
2221 for (MachineOperand &Op : MI->all_defs())
2222 Op.setIsUndef(false);
2223 RegisterOperands RegOpers;
2224 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
2225 if (DAG.ShouldTrackLaneMasks) {
2226 // Adjust liveness and add missing dead+read-undef flags.
2227 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
2228 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
2229 } else {
2230 // Adjust for missing dead-def flags.
2231 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
2232 }
2233 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2234 }
2235
2236 // The region end doesn't change throughout scheduling since it itself is
2237 // outside the region (whether that is a MBB end or a terminator MI).
2238 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2239 DAG.Regions[RegionIdx].first = MIOrder.front();
2240}
2241
2242bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2243
2244 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2245 return false;
2246 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
2247}
2248
// Prepare the MFMA VGPR->AGPR rewrite heuristics: tentatively switch every
// rewrite-candidate MFMA to its AGPR-dst form (recording the original opcode
// in \p RewriteCands so it can be undone), retag the dst/src2 register
// classes to AGPR so register pressure can be recomputed, and collect where
// cross-class copies would be required:
//  - \p CopyForDef: non-MFMA instructions that define an MFMA src2 (or define
//    a non-MFMA user of an MFMA dst) and would need a copy after them;
//  - \p CopyForUse: per-MBB sets of registers whose non-MFMA uses of an MFMA
//    dst would need a copy before them.
// Returns true if any instruction was rewritten.
// NOTE: MI.setDesc() is intentionally done before the getNamedOperand(...,
// src2) lookup; the named-operand query resolves against the replacement
// descriptor.
bool RewriteMFMAFormStage::initHeuristics(
    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  bool Changed = false;

  // Prepare for the heuristics
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isRewriteCandidate(&MI))
        continue;

      // isRewriteCandidate guarantees an AGPR-dst twin opcode exists.
      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
      assert(ReplacementOp != -1);

      // Remember the original opcode so the rewrite can be reverted later
      // (e.g. in getRewriteCost), then switch to the AGPR form.
      RewriteCands.push_back({&MI, MI.getOpcode()});
      MI.setDesc(TII->get(ReplacementOp));

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2->isReg()) {
        SmallVector<SlotIndex, 8> Src2ReachingDefs;
        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);

        // For any definition of the src2 register which is non-MFMA, we
        // insert a copy.
        for (SlotIndex RDIdx : Src2ReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
          if (!TII->isMAI(*RD))
            CopyForDef.insert(RD);
        }
      }

      MachineOperand &Dst = MI.getOperand(0);
      SmallVector<MachineOperand *, 8> DstReachingUses;

      findReachingUses(&MI, DAG.LIS, DstReachingUses);

      for (MachineOperand *RUOp : DstReachingUses) {
        // MFMA-to-MFMA chains stay in AGPR form and need no copy.
        if (TII->isMAI(*RUOp->getParent()))
          continue;

        // For any user of the result of the MFMA which is not an MFMA, we
        // insert a copy. For a given register, we will only insert one copy
        // per user block.
        CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());

        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);

        for (SlotIndex RDIndex : DstUsesReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
          if (TII->isMAI(*RD))
            continue;

          // For any definition of the user of the MFMA which is not an MFMA,
          // we insert a copy. We do this to transform all the reaching defs
          // of this use to AGPR. By doing this, we can insert a copy from
          // AGPR to VGPR at the user rather than after the MFMA.
          CopyForDef.insert(RD);
        }
      }

      // Do the rewrite to allow for updated RP calculation.
      const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
      DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
      if (Src2->isReg()) {
        // Have to get src types separately since subregs may cause C and D
        // registers to be different types even though the actual operand is
        // the same size.
        const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
        const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
        DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
      }
      Changed = true;
    }
  }

  return Changed;
}
2329
// Estimate the net cost of the tentative MFMA AGPR rewrite performed by
// initHeuristics. Negative means profitable. The cost is the best (largest)
// block-frequency-weighted reduction in VGPR spill cost across regions with
// excess ArchVGPR pressure, plus the frequency-weighted cost of all
// cross-class copies that would have to be inserted (\p CopyForDef /
// \p CopyForUse). Bails out early with a positive cost if any region's
// spilling would increase. As a side effect, restores the original opcodes
// and VGPR register classes recorded in \p RewriteCands, since copy insertion
// (or bailing out) requires the pre-rewrite form.
int64_t RewriteMFMAFormStage::getRewriteCost(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;

  int64_t BestSpillCost = 0;
  int64_t Cost = 0;
  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();

  std::pair<unsigned, unsigned> MaxVectorRegs =
      ST.getMaxNumVectorRegs(MF.getFunction());
  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
  unsigned AGPRThreshold = MaxVectorRegs.second;
  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    // Only regions with excess ArchVGPR pressure can benefit from the
    // rewrite; skip the rest.
    if (!RegionsWithExcessArchVGPR[Region])
      continue;

    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    // For the cases we care about (i.e. ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to manageable
    // level. If we find any such region, then the rewrite is potentially
    // beneficial.
    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    uint64_t BlockFreq =
        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
            .getFrequency();

    // Express the block frequency as a ratio to the entry block, keeping
    // track of which side is the numerator to stay in integer arithmetic.
    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
    uint64_t RelativeFreq = EntryFreq && BlockFreq
                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
                                                       : BlockFreq / EntryFreq)
                                : 1;

    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register,
    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;

    // Also account for the block frequency.
    if (RelativeFreqIsDenom)
      SpillCost /= (int64_t)RelativeFreq;
    else
      SpillCost *= (int64_t)RelativeFreq;

    // If we have increased spilling in any block, just bail.
    if (SpillCost > 0)
      return SpillCost;

    if (SpillCost < BestSpillCost)
      BestSpillCost = SpillCost;
  }

  // Set the cost to the largest decrease in spill cost in order to not double
  // count spill reductions.
  Cost = BestSpillCost;
  assert(Cost <= 0);

  unsigned CopyCost = 0;

  // For each CopyForDef, increase the cost by the register size while
  // accounting for block frequency.
  // NOTE(review): TargetRegisterClass::getCopyCost() can be negative to mark
  // a class as prohibitively expensive to copy; that would wrap in this
  // unsigned accumulation — confirm candidates can never hit such a class.
  for (MachineInstr *DefMI : CopyForDef) {
    Register DefReg = DefMI->getOperand(0).getReg();
    uint64_t DefFreq =
        EntryFreq
            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
            : 1;

    const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
    CopyCost += RC->getCopyCost() * DefFreq;
  }

  // Account for CopyForUse copies in each block that the register is used.
  for (auto &[UseBlock, UseRegs] : CopyForUse) {
    uint64_t UseFreq =
        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;

    for (Register UseReg : UseRegs) {
      const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
      CopyCost += RC->getCopyCost() * UseFreq;
    }
  }

  // Reset the classes that were changed to AGPR for better RB analysis.
  // We must do rewriting after copy-insertion, as some defs of the register
  // may require VGPR. Additionally, if we bail out and don't perform the
  // rewrite then these need to be restored anyway.
  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    assert(TII->isMAI(*MI));
    // Undo the dst regclass change made in initHeuristics.
    const TargetRegisterClass *ADefRC =
        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
    const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
    MI->setDesc(TII->get(OriginalOpcode));

    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    assert(Src2);
    if (!Src2->isReg())
      continue;

    // Have to get src types separately since subregs may cause C and D
    // registers to be different types even though the actual operand is
    // the same size.
    const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
    const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
    DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
  }

  return Cost + CopyCost;
}
2448
2449bool RewriteMFMAFormStage::rewrite(
2450 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2451 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2452 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2453
2454 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2455 RegionBoundaries Entry = DAG.Regions[Region];
2456 if (Entry.first == Entry.second)
2457 continue;
2458
2459 FirstMIToRegion[&*Entry.first] = Region;
2460 if (Entry.second != Entry.first->getParent()->end())
2461 LastMIToRegion[&*Entry.second] = Region;
2462 }
2463
2464 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2465 // The general assumption of the algorithm (and the previous cost calculation)
2466 // is that it is better to insert the copies in the MBB of the def of the src2
2467 // operands, and in the MBB of the user of the dest operands. This is based on
2468 // the assumption that the MFMAs are likely to appear in loop bodies, while
2469 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2470 // design, the algorithm for finding copy insertion points is more
2471 // complicated.
2472 //
2473 // There are three main cases to handle: 1. the reaching defs of the src2
2474 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2475 // defs of the reaching uses of the dst operand.
2476 //
2477 // In the first case, we simply insert copies after each of the reaching
2478 // definitions. In the second case, we collect all the uses of a given dest
2479 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2480 // earliest use. Since the use may have multiple reaching defs, and since we
2481 // want to replace the register it is using with the result of the copy, we
2482 // must handle case 3. In the third case, we simply insert a copy after each
2483 // of the reaching defs to connect to the copy of the reaching uses of the dst
2484 // reg. This allows us to avoid inserting copies next to the MFMAs.
2485 //
2486 // While inserting the copies, we maintain a map of operands which will use
2487 // different regs (i.e. the result of the copies). For example, a case 1 src2
2488 // operand will use the register result of the copies after the reaching defs,
2489 // as opposed to the original register. Now that we have completed our copy
2490 // analysis and placement, we can bulk update the registers. We do this
2491 // separately as to avoid complicating the reachingDef and reachingUse
2492 // queries.
2493 //
2494 // While inserting the copies, we also maintain a list or registers which we
2495 // will want to reclassify as AGPR. After doing the copy insertion and the
2496 // register replacement, we can finally do the reclassification. This uses the
2497 // redef map, as the registers we are interested in reclassifying may be
2498 // replaced by the result of a copy. We must do this after the copy analysis
2499 // and placement as we must have an accurate redef map -- otherwise we may end
2500 // up creating illegal instructions.
2501
2502 // The original registers of the MFMA that need to be reclassified as AGPR.
2503 DenseSet<Register> RewriteRegs;
2504 // The map of an original register in the MFMA to a new register (result of a
2505 // copy) that it should be replaced with.
2506 DenseMap<Register, Register> RedefMap;
2507 // The map of the original MFMA registers to the relevant MFMA operands.
2508 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2509 // The map of reaching defs for a given register -- to avoid duplicate copies.
2510 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2511 // The map of reaching uses for a given register by basic block -- to avoid
2512 // duplicate copies and to calculate per MBB insert pts.
2513 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2514 ReachingUseTracker;
2515
2516 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2517 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2518 if (ReplacementOp == -1)
2519 continue;
2520 MI->setDesc(TII->get(ReplacementOp));
2521
2522 // Case 1: insert copies for the reaching defs of the Src2Reg.
2523 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2524 if (Src2->isReg()) {
2525 Register Src2Reg = Src2->getReg();
2526 if (!Src2Reg.isVirtual())
2527 return false;
2528
2529 Register MappedReg = Src2->getReg();
2530 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2531 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2532 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2533
2534 for (SlotIndex RDIndex : Src2ReachingDefs) {
2535 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2536 if (TII->isMAI(*RD))
2537 continue;
2538
2539 // If there is a non mai reaching def, then we need a copy.
2540 Src2DefsReplace.insert(RD);
2541 }
2542
2543 if (!Src2DefsReplace.empty()) {
2544 DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
2545 if (RI != RedefMap.end()) {
2546 MappedReg = RI->second;
2547 } else {
2548 assert(!ReachingDefCopyMap.contains(Src2Reg));
2549 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2550 const TargetRegisterClass *VGPRRC =
2551 SRI->getEquivalentVGPRClass(Src2RC);
2552
2553 // Track the mapping of the original register to the new register.
2554 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2555 RedefMap[Src2Reg] = MappedReg;
2556 }
2557
2558 // If none exists, create a copy from this reaching def.
2559 // We may have inserted a copy already in an earlier iteration.
2560 for (MachineInstr *RD : Src2DefsReplace) {
2561 // Do not create redundant copies.
2562 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2563 MachineInstrBuilder VGPRCopy =
2564 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2565 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2566 .addDef(MappedReg, {}, 0)
2567 .addUse(Src2Reg, {}, 0);
2568 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2569
2570 // If this reaching def was the last MI in the region, update the
2571 // region boundaries.
2572 if (LastMIToRegion.contains(RD)) {
2573 unsigned UpdateRegion = LastMIToRegion[RD];
2574 DAG.Regions[UpdateRegion].second = VGPRCopy;
2575 LastMIToRegion.erase(RD);
2576 }
2577 }
2578 }
2579 }
2580
2581 // Track the register for reclassification
2582 RewriteRegs.insert(Src2Reg);
2583
2584 // Always insert the operand for replacement. If this corresponds with a
2585 // chain of tied-def we may not see the VGPR requirement until later.
2586 ReplaceMap[Src2Reg].insert(Src2);
2587 }
2588
2589 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2590 // and after the reaching defs of the reaching uses of the dsts.
2591
2592 MachineOperand *Dst = &MI->getOperand(0);
2593 Register DstReg = Dst->getReg();
2594 if (!DstReg.isVirtual())
2595 return false;
2596
2597 Register MappedReg = DstReg;
2598 SmallVector<MachineOperand *, 8> DstReachingUses;
2599
2600 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2601 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2602
2603 findReachingUses(MI, DAG.LIS, DstReachingUses);
2604
2605 for (MachineOperand *RUOp : DstReachingUses) {
2606 if (TII->isMAI(*RUOp->getParent()))
2607 continue;
2608
2609 // If there is a non mai reaching use, then we need a copy.
2610 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2611 DstReachingUseCopies.push_back(RUOp);
2612 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2613 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2614
2615 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2616 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2617 if (TII->isMAI(*RD))
2618 continue;
2619
2620 // If there is a non mai reaching def of this reaching use, then we will
2621 // need a copy.
2622 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2623 DstUseDefsReplace.push_back(RD);
2624 }
2625 }
2626
2627 if (!DstUseDefsReplace.empty()) {
2628 DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
2629 if (RI != RedefMap.end()) {
2630 MappedReg = RI->second;
2631 } else {
2632 assert(!ReachingDefCopyMap.contains(DstReg));
2633 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2634 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2635
2636 // Track the mapping of the original register to the new register.
2637 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2638 RedefMap[DstReg] = MappedReg;
2639 }
2640
2641 // If none exists, create a copy from this reaching def.
2642 // We may have inserted a copy already in an earlier iteration.
2643 for (MachineInstr *RD : DstUseDefsReplace) {
2644 // Do not create reundant copies.
2645 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2646 MachineInstrBuilder VGPRCopy =
2647 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2648 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2649 .addDef(MappedReg, {}, 0)
2650 .addUse(DstReg, {}, 0);
2651 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2652
2653 // If this reaching def was the last MI in the region, update the
2654 // region boundaries.
2656 LastMIToRegion.find(RD);
2657 if (LMI != LastMIToRegion.end()) {
2658 unsigned UpdateRegion = LMI->second;
2659 DAG.Regions[UpdateRegion].second = VGPRCopy;
2660 LastMIToRegion.erase(RD);
2661 }
2662 }
2663 }
2664 }
2665
2666 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2667 for (MachineOperand *RU : DstReachingUseCopies) {
2668 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2669 // Just keep track of the reaching use of this register by block. After we
2670 // have scanned all the MFMAs we can find optimal insert pts.
2671 if (RUBlock != MI->getParent()) {
2672 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2673 continue;
2674 }
2675
2676 // Special case, the use is in the same block as the MFMA. Insert the copy
2677 // just before the use.
2678 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2679 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2680 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2681 MachineInstr *UseInst = RU->getParent();
2682 MachineInstrBuilder VGPRCopy =
2683 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2684 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2685 .addDef(NewUseReg, {}, 0)
2686 .addUse(DstReg, {}, 0);
2687 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2688 // Since we know this use has only one reaching def, we can replace the
2689 // use reg.
2690 RU->setReg(NewUseReg);
2691 // Track the copy source operand for r eplacement.
2692 DstRegSet.insert(&VGPRCopy->getOperand(1));
2693 }
2694
2695 // Track the register for reclassification
2696 RewriteRegs.insert(DstReg);
2697
2698 // Insert the dst operand for replacement. If this dst is in a chain of
2699 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2700 // all the correspond operands need to be replaced.
2701 DstRegSet.insert(Dst);
2702 }
2703
2704 // Handle the copies for dst uses.
2705 using RUBType =
2706 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2707 for (RUBType RUBlockEntry : ReachingUseTracker) {
2708 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2709 for (RUDType RUDst : RUBlockEntry.second) {
2710 MachineOperand *OpBegin = *RUDst.second.begin();
2711 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2712
2713 // Find the earliest use in this block.
2714 for (MachineOperand *User : RUDst.second) {
2715 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2716 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2717 InstPt = NewInstPt;
2718 }
2719
2720 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2721 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2722 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2723 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2724
2725 MachineInstrBuilder VGPRCopy =
2726 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2727 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2728 .addDef(NewUseReg, {}, 0)
2729 .addUse(RUDst.first, {}, 0);
2730 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2731
2732 // If this UseInst was the first MI in the region, update the region
2733 // boundaries.
2735 FirstMIToRegion.find(UseInst);
2736 if (FI != FirstMIToRegion.end()) {
2737 unsigned UpdateRegion = FI->second;
2738 DAG.Regions[UpdateRegion].first = VGPRCopy;
2739 FirstMIToRegion.erase(UseInst);
2740 }
2741
2742 // Replace the operand for all users.
2743 for (MachineOperand *User : RUDst.second) {
2744 User->setReg(NewUseReg);
2745 }
2746
2747 // Track the copy source operand for replacement.
2748 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2749 }
2750 }
2751
2752 // We may have needed to insert copies after the reaching defs of the MFMAs.
2753 // Replace the original register with the result of the copy for all relevant
2754 // operands.
2755 for (std::pair<Register, Register> NewDef : RedefMap) {
2756 Register OldReg = NewDef.first;
2757 Register NewReg = NewDef.second;
2758
2759 // Replace the register for any associated operand in the MFMA chain.
2760 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2761 ReplaceOp->setReg(NewReg);
2762 }
2763
2764 // Finally, do the reclassification of the MFMA registers.
2765 for (Register RewriteReg : RewriteRegs) {
2766 Register RegToRewrite = RewriteReg;
2767
2768 // Be sure to update the replacement register and not the original.
2769 DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
2770 if (RI != RedefMap.end())
2771 RegToRewrite = RI->second;
2772
2773 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2774 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2775
2776 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2777 }
2778
2779 // Bulk update the LIS.
2780 DAG.LIS->reanalyze(DAG.MF);
2781 // Liveins may have been modified for cross RC copies
2782 RegionPressureMap LiveInUpdater(&DAG, false);
2783 LiveInUpdater.buildLiveRegMap();
2784
2785 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2786 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2787
2788 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2789
2790 return true;
2791}
2792
2793unsigned PreRARematStage::getStageTargetOccupancy() const {
2794 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2795}
2796
2797bool PreRARematStage::setObjective() {
2798 const Function &F = MF.getFunction();
2799
2800 // Set up "spilling targets" for all regions.
2801 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2802 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2803 bool HasVectorRegisterExcess = false;
2804 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2805 const GCNRegPressure &RP = DAG.Pressure[I];
2806 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2807 if (!Target.satisfied())
2808 TargetRegions.set(I);
2809 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2810 }
2811
2812 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2813 // In addition to register usage being above addressable limits, occupancy
2814 // below the minimum is considered like "spilling" as well.
2815 TargetOcc = std::nullopt;
2816 } else {
2817 // There is no spilling and room to improve occupancy; set up "increased
2818 // occupancy targets" for all regions.
2819 TargetOcc = DAG.MinOccupancy + 1;
2820 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2821 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2822 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2823 for (auto [I, Target] : enumerate(RPTargets)) {
2824 Target.setTarget(MaxSGPRs, MaxVGPRs);
2825 if (!Target.satisfied())
2826 TargetRegions.set(I);
2827 }
2828 }
2829
2830 return TargetRegions.any();
2831}
2832
/// Scans all scheduling regions for single-def, single-use registers whose
/// defining instruction can be rematerialized next to its user in another
/// region, recording candidates in RematRegs. Returns true if at least one
/// candidate was found.
bool PreRARematStage::collectRematRegs(
    const DenseMap<MachineInstr *, unsigned> &MIRegion) {
  // We need up-to-date live-out info to query live-out register masks in
  // regions containing rematerializable instructions.
  DAG.RegionLiveOuts.buildLiveRegMap();

  // Set of registers already marked for potential rematerialization; used to
  // avoid rematerialization chains.
  SmallSet<Register, 4> MarkedRegs;
  auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
    return MO.isReg() && MarkedRegs.contains(MO.getReg());
  };

  // Identify rematerializable instructions in the function.
  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
    RegionBoundaries Bounds = DAG.Regions[I];
    for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
      // The instruction must be rematerializable.
      MachineInstr &DefMI = *MI;
      if (!isReMaterializable(DefMI))
        continue;

      // We only support rematerializing virtual registers with one
      // definition.
      Register Reg = DefMI.getOperand(0).getReg();
      if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
        continue;

      // We only care to rematerialize the instruction if it has a single
      // non-debug user in a different region.
      // FIXME: Allow rematerializations with multiple uses. This should be
      // relatively easy to support using the current cost model.
      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
      if (!UseMI)
        continue;
      auto UseRegion = MIRegion.find(UseMI);
      if (UseRegion == MIRegion.end() || UseRegion->second == I)
        continue;

      // Do not rematerialize an instruction if it uses or is used by an
      // instruction that we have designated for rematerialization.
      // FIXME: Allow for rematerialization chains: this requires 1. updating
      // remat points to account for uses that are rematerialized, and 2.
      // either rematerializing the candidates in careful ordering, or
      // deferring the MBB RP walk until the entire chain has been
      // rematerialized.
      const MachineOperand &UseMO = UseMI->getOperand(0);
      if (IsMarkedForRemat(UseMO) ||
          llvm::any_of(DefMI.operands(), IsMarkedForRemat))
        continue;

      // Do not rematerialize an instruction if it uses registers that aren't
      // available at its use. This ensures that we are not extending any live
      // range while rematerializing.
      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
      if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
                                              *DAG.TII))
        continue;

      // Add the instruction to the rematerializable list.
      MarkedRegs.insert(Reg);
      RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
    }
  }

  return !RematRegs.empty();
}
2900
/// Records the defining/using instructions and regions of a rematerialization
/// candidate, plus the lane mask under which the defined register is live out
/// of its defining region (taken from the region live-out map).
PreRARematStage::RematReg::RematReg(
    MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
    const DenseMap<MachineInstr *, unsigned> &MIRegion)
    : DefMI(DefMI), UseMI(UseMI), DefRegion(MIRegion.at(DefMI)),
      UseRegion(MIRegion.at(UseMI)),
      Mask(DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(getReg())) {
}
2908
2909bool PreRARematStage::ScoredRemat::maybeBeneficial(
2910 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2911 for (unsigned I : TargetRegions.set_bits()) {
2912 if (Live[I] && RPTargets[I].isSaveBeneficial(RPSave))
2913 return true;
2914 }
2915 return false;
2916}
2917
2918void PreRARematStage::ScoredRemat::insertMI(unsigned RegionIdx,
2919 MachineInstr *RematMI,
2920 GCNScheduleDAGMILive &DAG) const {
2921 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
2922 if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
2923 Bounds.first = RematMI;
2924 DAG.LIS->InsertMachineInstrInMaps(*RematMI);
2925 DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
2926}
2927
  // Collect per-region block frequencies and the min/max over regions with a
  // known (non-zero) frequency. Requires loop info to build the block
  // frequency analysis.
  assert(DAG.MLI && "MLI not defined in DAG");
  MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);

  const unsigned NumRegions = DAG.Regions.size();
  MaxFreq = 0;
  Regions.reserve(NumRegions);
  for (unsigned I = 0; I < NumRegions; ++I) {
    // Frequency of the basic block containing the region's first MI.
    MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
    uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
    Regions.push_back(BlockFreq);
    // NOTE(review): a frequency that lowers MinFreq never updates MaxFreq in
    // the same iteration, so with uniform non-zero frequencies MaxFreq can
    // stay 0 — confirm this is intended.
    if (BlockFreq && BlockFreq < MinFreq)
      MinFreq = BlockFreq;
    else if (BlockFreq > MaxFreq)
      MaxFreq = BlockFreq;
  }
  // A zero MinFreq means some region has unknown frequency; skip scaling.
  if (!MinFreq)
    return;

  // Scale everything down if frequencies are high.
  if (MinFreq >= ScaleFactor * ScaleFactor) {
    for (uint64_t &Freq : Regions)
      Freq /= ScaleFactor;
    MinFreq /= ScaleFactor;
    MaxFreq /= ScaleFactor;
  }
}
2958
/// Initializes the candidate's per-region liveness bitvectors, its guaranteed
/// register-pressure saving, and the frequency difference used for scoring.
void PreRARematStage::ScoredRemat::init(RematReg *Remat, const FreqInfo &Freq,
  this->Remat = Remat;
  const unsigned NumRegions = DAG.Regions.size();
  LiveIn.resize(NumRegions);
  LiveOut.resize(NumRegions);
  Live.resize(NumRegions);
  UnpredictableRPSave.resize(NumRegions);

  // Mark regions in which the rematerializable register is live.
  Register DefReg = Remat->getReg();
  for (unsigned I = 0, E = NumRegions; I != E; ++I) {
    if (DAG.LiveIns[I].contains(DefReg))
      LiveIn.set(I);
    if (DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).contains(DefReg))
      LiveOut.set(I);

    // If the register is both unused and live-through in the region, the
    // latter's RP is guaranteed to decrease. Conversely, a region where the
    // register is not live-through, or the using region itself, has an
    // unpredictable saving.
    if (!LiveIn[I] || !LiveOut[I] || I == Remat->UseRegion)
      UnpredictableRPSave.set(I);
  }
  Live |= LiveIn;
  Live |= LiveOut;
  RPSave.inc(DefReg, LaneBitmask::getNone(), Remat->Mask, DAG.MRI);

  // Get frequencies of defining and using regions. A rematerialization from the
  // least frequent region to the most frequent region will yield the greatest
  // latency penalty and therefore should get minimum score. Reciprocally, a
  // rematerialization in the other direction should get maximum score. Default
  // to values that will yield the worst possible score given known frequencies
  // in order to penalize rematerializations from or into regions whose
  // frequency is unknown.
  int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
  int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
  if (!UseOrMax)
    UseOrMax = Freq.MaxFreq;
  FreqDiff = DefOrMin - UseOrMax;
}
2998
2999void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
3000 ArrayRef<GCNRPTarget> RPTargets,
3001 const FreqInfo &FreqInfo,
3002 bool ReduceSpill) {
3003 MaxFreq = 0;
3004 RegionImpact = 0;
3005 for (unsigned I : TargetRegions.set_bits()) {
3006 if (!Live[I])
3007 continue;
3008
3009 // The rematerialization must contribute positively in at least one
3010 // register class with usage above the RP target for this region to
3011 // contribute to the score.
3012 const GCNRPTarget &RegionTarget = RPTargets[I];
3013 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
3014 if (!NumRegsBenefit)
3015 continue;
3016
3017 // Regions in which RP is guaranteed to decrease have more weight.
3018 RegionImpact += (UnpredictableRPSave[I] ? 1 : 2) * NumRegsBenefit;
3019
3020 if (ReduceSpill) {
3021 uint64_t Freq = FreqInfo.Regions[I];
3022 if (UnpredictableRPSave[I]) {
3023 // Apply a frequency penalty in regions in which we are not sure that RP
3024 // will decrease.
3025 Freq /= 2;
3026 }
3027 MaxFreq = std::max(MaxFreq, Freq);
3028 }
3029 }
3030}
3031
/// Rematerializes the candidate's single-def register immediately before its
/// lone user, rewires the user to the new register, and updates region
/// boundaries plus live-interval bookkeeping. Returns the new instruction.
MachineInstr *
PreRARematStage::ScoredRemat::rematerialize(GCNScheduleDAGMILive &DAG) const {
  const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
  MachineInstr &DefMI = *Remat->DefMI;
  Register Reg = DefMI.getOperand(0).getReg();
  Register NewReg = DAG.MRI.cloneVirtualRegister(Reg);

  // Rematerialize the register in the region where it is used.
  MachineBasicBlock::iterator InsertPos = Remat->UseMI;
  TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
  // The new instruction sits immediately before the insertion point.
  MachineInstr *RematMI = &*std::prev(InsertPos);
  Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
  insertMI(Remat->UseRegion, RematMI, DAG);

#ifdef EXPENSIVE_CHECKS
  // All uses are known to be available / live at the remat point. Thus,
  // the uses should already be live in to the using region.
  for (MachineOperand &MO : DefMI.operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
      continue;

    Register UseReg = MO.getReg();
    if (!UseReg.isVirtual())
      continue;

    LiveInterval &LI = DAG.LIS->getInterval(UseReg);
    LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
    if (LI.hasSubRanges() && MO.getSubReg())
      LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());

    LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
    LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
    // If this register has lanes not covered by the LiveIns, be sure they
    // do not map to any subrange. ref:
    // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
    if (UncoveredLanes.any()) {
      assert(LI.hasSubRanges());
      for (LiveInterval::SubRange &SR : LI.subranges())
        assert((SR.LaneMask & UncoveredLanes).none());
    }
  }
#endif
  return RematMI;
}
3076
3077void PreRARematStage::commitRematerializations() const {
3078 REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
3079 for (const RollbackInfo &Rollback : Rollbacks)
3080 DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
3081}
3082
3083void PreRARematStage::updateRPTargets(const BitVector &Regions,
3084 const GCNRegPressure &RPSave) {
3085 for (unsigned I : Regions.set_bits()) {
3086 RPTargets[I].saveRP(RPSave);
3087 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3088 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3089 TargetRegions.reset(I);
3090 }
3091 }
3092}
3093
3094bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3095 bool TooOptimistic = false;
3096 for (unsigned I : Regions.set_bits()) {
3097 GCNRPTarget &Target = RPTargets[I];
3098 Target.setRP(DAG.getRealRegPressure(I));
3099
3100 // Since we were optimistic in assessing RP decreases in these regions, we
3101 // may need to remark the target as a target region if RP didn't decrease
3102 // as expected.
3103 if (!TargetRegions[I] && !Target.satisfied()) {
3104 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3105 TooOptimistic = true;
3106 TargetRegions.set(I);
3107 }
3108 }
3109 return TooOptimistic;
3110}
3111
3112// Copied from MachineLICM
3113bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3114 if (!DAG.TII->isReMaterializable(MI))
3115 return false;
3116
3117 for (const MachineOperand &MO : MI.all_uses()) {
3118 // We can't remat physreg uses, unless it is a constant or an ignorable
3119 // use (e.g. implicit exec use on VALU instructions)
3120 if (MO.getReg().isPhysical()) {
3121 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3122 continue;
3123 return false;
3124 }
3125 }
3126
3127 return true;
3128}
3129
3130void PreRARematStage::removeFromLiveMaps(Register Reg, const BitVector &LiveIn,
3131 const BitVector &LiveOut) {
3132 assert(LiveIn.size() == DAG.Regions.size() &&
3133 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3134 for (unsigned I : LiveIn.set_bits())
3135 DAG.LiveIns[I].erase(Reg);
3136 for (unsigned I : LiveOut.set_bits())
3137 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3138}
3139
3140void PreRARematStage::addToLiveMaps(Register Reg, LaneBitmask Mask,
3141 const BitVector &LiveIn,
3142 const BitVector &LiveOut) {
3143 assert(LiveIn.size() == DAG.Regions.size() &&
3144 LiveOut.size() == DAG.Regions.size() && "region num mismatch");
3145 std::pair<Register, LaneBitmask> LiveReg(Reg, Mask);
3146 for (unsigned I : LiveIn.set_bits())
3147 DAG.LiveIns[I].insert(LiveReg);
3148 for (unsigned I : LiveOut.set_bits())
3149 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3150}
3151
  // We consider that reducing spilling is always beneficial so we never
  // roll back rematerializations or revert scheduling in such cases.
  if (!TargetOcc)
    return;

  // When increasing occupancy, it is possible that re-scheduling is not able to
  // achieve the target occupancy in all regions, in which case re-scheduling in
  // all regions should be reverted.
  if (DAG.MinOccupancy >= *TargetOcc) {
    commitRematerializations();
    return;
  }

  // It is possible that re-scheduling lowers occupancy over the one achieved
  // just through rematerializations, in which case we revert re-scheduling in
  // all regions but do not roll back rematerializations.
  const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;

  // When we both need to revert re-scheduling and rollback rematerializations,
  // restore rematerialized MIs' original state before reverting so that they
  // are treated as non-debug instructions by the revert logic.
  if (ShouldRollbackRemats) {
    for (const RollbackInfo &Rollback : Rollbacks) {
      const RematReg *Remat = Rollback.Remat;
      MachineInstr *RematMI = Rollback.RematMI;
      // Restore the original opcode and register operands recorded at remat
      // time.
      Rollback.Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
      for (const auto &[MOIdx, Reg] : Rollback.RegMap)
        Remat->DefMI->getOperand(MOIdx).setReg(Reg);
    }
  }

  // Revert re-scheduling in all affected regions.
  for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
    REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
                       << '\n');
    DAG.Pressure[RegionIdx] = MaxPressure;
    modifyRegionSchedule(RegionIdx, OrigMIOrder);
  }

  if (!ShouldRollbackRemats) {
    commitRematerializations();
    DAG.setTargetOccupancy(AchievedOcc);
    return;
  }

  // Reset the target occupancy to what it was pre-rematerialization.
  DAG.setTargetOccupancy(*TargetOcc - 1);

  // Finish rolling back rematerializations, then recompute pressure in all
  // affected regions.
  REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
  DenseSet<Register> RecomputeLI;
  for (const RollbackInfo &Rollback : Rollbacks) {
    const RematReg *Remat = Rollback.Remat;
    MachineInstr *RematMI = Rollback.RematMI;

    // Switch back to using the original register and delete the
    // rematerialization.
    Register Reg = RematMI->getOperand(0).getReg();
    Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
    Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
    REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
                       << "] Deleting rematerialization " << *RematMI);
    DAG.deleteMI(Remat->UseRegion, RematMI);
    addToLiveMaps(OriginalReg, Remat->Mask, Rollback.LiveIn, Rollback.LiveOut);

    // Regenerate intervals for all register operands of rematerialized MIs as
    // slot indices may have changed slightly from before re-scheduling.
    for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
      if (MO.isReg() && MO.getReg().isVirtual())
        RecomputeLI.insert(MO.getReg());
    }
  }
  for (Register Reg : RecomputeLI) {
    DAG.LIS->removeInterval(Reg);
    DAG.LIS->createAndComputeVirtRegInterval(Reg);
  }
#ifdef EXPENSIVE_CHECKS
  // In particular, we want to check for coherent MI/slot order in regions in
  // which reverts and/or rollbacks may have happened.
  MF.verify();
#endif
  // Recompute real pressure for every region touched by rescheduling.
  for (unsigned I : RescheduleRegions.set_bits())
    DAG.Pressure[I] = DAG.getRealRegPressure(I);

}
3240
/// Deletes MI from region RegionIdx, adjusting the region's lower boundary
/// and dropping the live interval of MI's defined register.
/// NOTE(review): assumes operand 0 is the instruction's sole register def —
/// consistent with how callers in this stage use it; confirm for new callers.
void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
  // It's not possible for the deleted instruction to be upper region boundary
  // since we don't delete region terminators.
  if (Regions[RegionIdx].first == MI)
    Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
  LIS->removeInterval(MI->getOperand(0).getReg());
  MI->eraseFromParent();
}
3250
3251void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3252 MinOccupancy = TargetOccupancy;
3253 if (MFI.getOccupancy() < TargetOccupancy)
3254 MFI.increaseOccupancy(MF, MinOccupancy);
3255 else
3256 MFI.limitOccupancy(MinOccupancy);
3257}
3258
  // Scan every instruction in the DAG for opcodes that are IGLP-mutation-only
  // per the target instruction info.
  const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
  return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
    return SII->isIGLPMutationOnly(MI->getOpcode());
  });
}
3265
3270
  // When the region contains IGLP mutation-only instructions, stash the
  // currently-registered mutations; finalizeSchedule() swaps them back via
  // the same SavedMutations/Mutations exchange.
  HasIGLPInstrs = hasIGLPInstrs(this);
  if (HasIGLPInstrs) {
    SavedMutations.clear();
    SavedMutations.swap(Mutations);
  }

}
3281
  // Restore the mutations that schedule() swapped out when IGLP instructions
  // were detected.
  if (HasIGLPInstrs)
    SavedMutations.swap(Mutations);

}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true))
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
size - Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
void modifyRegionSchedule(unsigned RegionIdx, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx to MIOrder.
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::optional< bool > GCNTrackersOverride
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool hasReservedResource
Uses a reserved resource.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
ScheduleHazardRecognizer * HazardRec
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
LLVM_ABI std::pair< unsigned, unsigned > getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle)
Compute the next cycle at which the given processor resource can be scheduled.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
Identify one of the processor resource kinds consumed by a particular scheduling class for the specif...
Definition MCSchedule.h:68
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)