LLVM 23.0.0git
GCNSchedStrategy.cpp
Go to the documentation of this file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/STLExtras.h"
40#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCSchedule.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
// NOTE(review): the extraction of this listing dropped the
// `static cl::opt<Type> Name(` header line of most of the option
// declarations below (only RelaxedOcc's survived). The surviving argument
// lists -- option name, description, default -- are preserved byte-for-byte;
// restore the headers from the upstream file before compiling.

// Option: disables the unclustered high-register-pressure rescheduling stage.
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(false));
55
// Option: disables the clustered low-occupancy rescheduling (ILP) stage.
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(false));
61
// Option: weight of occupancy vs latency in the schedule metric (default 10;
// 100 means occupancy only).
63 "amdgpu-schedule-metric-bias", cl::Hidden,
65 "Sets the bias which adds weight to occupancy vs latency. Set it to "
66 "100 to chase the occupancy only."),
67 cl::init(10));
68
// Option: allow lower occupancy targets for memory-bound / wave-limited
// kernels.
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(false));
75
// Option: use AMDGPU-specific RP trackers instead of the generic ones.
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(false));
80
// Option: size threshold above which the pending queue is not inspected.
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(256));
86
// Debug-only options for dumping max-register-pressure usage.
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
// Option: gates the MFMA-form rewrite scheduling stage (disabled by default).
103 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
104 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true));
105
// Scale applied to the occupancy/latency schedule metric (percent basis).
106const unsigned ScheduleMetrics::ScaleFactor = 100;
107
// GCNSchedStrategy::initialize (fragment): computes the SGPR/VGPR excess and
// critical register-pressure limits used by the candidate heuristics.
// NOTE(review): extraction dropped several lines here -- the function header,
// the base-class initialize call, and the left-hand sides of the
// SGPRExcessLimit / VGPRExcessLimit / SGPRCriticalLimit assignments (original
// lines 108-117, 122, 124, 127, 133-135, 141, 159-162). Remaining code is
// kept byte-identical; restore the missing lines from upstream.
114
117
118 MF = &DAG->MF;
119
120 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
121
// Allocatable SGPR/VGPR counts form the hard excess limits (assignment
// targets lost in extraction).
123 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
125 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
126
128 // Set the initial TargetOccupancy to the maximum occupancy that we can
129 // achieve for this function. This effectively sets a lower bound on the
130 // 'Critical' register limits in the scheduler.
131 // Allow for lower occupancy targets if kernel is wave limited or memory
132 // bound, and using the relaxed occupancy feature.
136 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
137
138 if (!KnownExcessRP) {
139 VGPRCriticalLimit = std::min(
140 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
142 } else {
143 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
144 // returns a reasonably small number for targets with lots of VGPRs, such
145 // as GFX10 and GFX11.
146 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
147 "VGPRCriticalLimit calculation method.\n");
// Budget: addressable VGPRs divided across waves, rounded down to the
// allocation granule, but never below one granule.
148 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
149 unsigned Granule =
150 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
151 unsigned Addressable =
152 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
153 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
154 VGPRBudget = std::max(VGPRBudget, Granule);
155 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
156 }
157
158 // Subtract error margin and bias from register limits and avoid overflow.
163 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
164 << ", VGPRExcessLimit = " << VGPRExcessLimit
165 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
166 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
167}
168
169/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
170/// current register pressure.
171///
172/// This works for the common case, but it has a few exceptions that have been
173/// observed through trial and error:
174/// - Explicit physical register operands
175/// - Subregister definitions
176///
177/// In both of those cases, PressureDiff doesn't represent the actual pressure,
178/// and querying LiveIntervals through the RegPressureTracker is needed to get
179/// an accurate value.
180///
181/// We should eventually only use PressureDiff for maximum performance, but this
182/// already allows 80% of SUs to take the fast path without changing scheduling
183/// at all. Further changes would either change scheduling, or require a lot
184/// more logic to recover an accurate pressure estimate from the PressureDiffs.
185static bool canUsePressureDiffs(const SUnit &SU) {
186 if (!SU.isInstr())
187 return false;
188
189 // Cannot use pressure diffs for subregister defs or with physregs, it's
190 // imprecise in both cases.
191 for (const auto &Op : SU.getInstr()->operands()) {
192 if (!Op.isReg() || Op.isImplicit())
193 continue;
194 if (Op.getReg().isPhysical() ||
195 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
196 return false;
197 }
198 return true;
199}
200
// Computes the register pressure after scheduling \p SU at the top (AtTop)
// or bottom of the current zone, using either the generic RegPressureTracker
// or, when useGCNTrackers() is set, temporary copies of the GCN up/down
// trackers. NOTE(review): the first signature line (original 201) was lost
// in extraction.
202 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
203 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
205 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
206 // getDownwardPressure() and getUpwardPressure() make temporary changes to
207 // the tracker, so we need to pass those functions a non-const copy.
208 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
209 if (!useGCNTrackers()) {
210 AtTop
211 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
212 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
213
214 return;
215 }
216
217 // GCNTrackers
218 Pressure.resize(4, 0);
219 MachineInstr *MI = SU->getInstr();
220 GCNRegPressure NewPressure;
// Copy-construct a scratch tracker so the real one is not mutated.
221 if (AtTop) {
222 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
223 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
224 } else {
225 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
226 TempUpwardTracker.recede(*MI);
227 NewPressure = TempUpwardTracker.getPressure();
228 }
// Translate the GCN pressure into the generic pressure-set slots.
229 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
230 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
231 NewPressure.getArchVGPRNum();
232 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
233}
234
// Initializes a scheduling candidate for \p SU and fills in its RPDelta
// (excess / critical pressure changes) from either the cached PressureDiffs
// or the pressure trackers. NOTE(review): extraction dropped the first
// signature line (original 235) and the trailing argument lines of the two
// getRegisterPressures calls (originals 263 and 282); remaining code is kept
// byte-identical.
236 bool AtTop,
237 const RegPressureTracker &RPTracker,
238 const SIRegisterInfo *SRI,
239 unsigned SGPRPressure,
240 unsigned VGPRPressure, bool IsBottomUp) {
241 Cand.SU = SU;
242 Cand.AtTop = AtTop;
243
244 if (!DAG->isTrackingPressure())
245 return;
246
247 Pressure.clear();
248 MaxPressure.clear();
249
250 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
251 // possible over querying the RegPressureTracker.
252 //
253 // RegPressureTracker will make a lot of LIS queries which are very
254 // expensive, it is considered a slow function in this context.
255 //
256 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
257 // trivial lookup into an array. It is pretty much free.
258 //
259 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
260 // PressureDiffs.
261 if (AtTop || !canUsePressureDiffs(*SU) || useGCNTrackers()) {
262 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
264 } else {
265 // Reserve 4 slots.
266 Pressure.resize(4, 0);
267 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
268 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
269
270 for (const auto &Diff : DAG->getPressureDiff(SU)) {
271 if (!Diff.isValid())
272 continue;
273 // PressureDiffs is always bottom-up so if we're working top-down we need
274 // to invert its sign.
275 Pressure[Diff.getPSet()] +=
276 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
277 }
278
279#ifdef EXPENSIVE_CHECKS
280 std::vector<unsigned> CheckPressure, CheckMaxPressure;
281 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
283 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
284 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
285 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
286 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
287 errs() << "Register Pressure is inaccurate when calculated through "
288 "PressureDiff\n"
289 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
290 << ", expected "
291 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
292 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
293 << ", expected "
294 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
295 report_fatal_error("inaccurate register pressure calculation");
296 }
297#endif
298 }
299
300 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
301 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
302
303 // If two instructions increase the pressure of different register sets
304 // by the same amount, the generic scheduler will prefer to schedule the
305 // instruction that increases the set with the least amount of registers,
306 // which in our case would be SGPRs. This is rarely what we want, so
307 // when we report excess/critical register pressure, we do it either
308 // only for VGPRs or only for SGPRs.
309
310 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
311 const unsigned MaxVGPRPressureInc = 16;
312 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
313 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
314
315 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
316 // to increase the likelihood we don't go over the limits. We should improve
317 // the analysis to look through dependencies to find the path with the least
318 // register pressure.
319
320 // We only need to update the RPDelta for instructions that increase register
321 // pressure. Instructions that decrease or keep reg pressure the same will be
322 // marked as RegExcess in tryCandidate() when they are compared with
323 // instructions that increase the register pressure.
324 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
325 HasHighPressure = true;
326 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
327 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
328 }
329
330 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
331 HasHighPressure = true;
332 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
333 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
334 }
335
336 // Register pressure is considered 'CRITICAL' if it is approaching a value
337 // that would reduce the wave occupancy for the execution unit. When
338 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
339 // has the same cost, so we don't need to prefer one over the other.
340
341 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
342 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
343
344 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
345 HasHighPressure = true;
346 if (SGPRDelta > VGPRDelta) {
347 Cand.RPDelta.CriticalMax =
348 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
349 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
350 } else {
351 Cand.RPDelta.CriticalMax =
352 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
353 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
354 }
355 }
356}
357
// Returns true when the pending queue should also be inspected: the combined
// Available+Pending size is within PendingQueueLimit and the machine model
// has a buffered (in-order lookahead) scheduling model.
// NOTE(review): the first signature line (original 358) was lost in
// extraction.
359 const TargetSchedModel *SchedModel) {
360 bool HasBufferedModel =
361 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
362 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
363 return Combined <= PendingQueueLimit && HasBufferedModel;
364}
365
// Wraps SchedBoundary::pickOnlyChoice(): returns its result unless the
// pending queue must also be considered (in which case nullptr forces a full
// candidate comparison). NOTE(review): the first signature line (original
// 366) was lost in extraction.
367 const TargetSchedModel *SchedModel) {
368 // pickOnlyChoice() releases pending instructions and checks for new hazards.
369 SUnit *OnlyChoice = Zone.pickOnlyChoice();
370 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
371 return OnlyChoice;
372
373 return nullptr;
374}
375
// Debug helper: dumps the preferred candidate, the rejected one (if any),
// and the reason for the preference. Entire body is LLVM_DEBUG-guarded, so
// it is a no-op in release builds. NOTE(review): the first signature line
// (original 376) was lost in extraction.
377 const SchedCandidate &Preferred) {
378 LLVM_DEBUG({
379 dbgs() << "Prefer:\t\t";
380 DAG->dumpNode(*Preferred.SU);
381
382 if (Current.SU) {
383 dbgs() << "Not:\t";
384 DAG->dumpNode(*Current.SU);
385 }
386
387 dbgs() << "Reason:\t\t";
388 traceCandidate(Preferred);
389 });
390}
391
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeFromQueue().
// Scans the zone's Available queue (and, when shouldCheckPending() allows,
// the Pending queue) and accumulates the best candidate into \p Cand,
// setting \p IsPending when the winner came from the pending queue.
// NOTE(review): the first signature line (original 394) was lost in
// extraction.
395 const CandPolicy &ZonePolicy,
396 const RegPressureTracker &RPTracker,
397 SchedCandidate &Cand, bool &IsPending,
398 bool IsBottomUp) {
399 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
401 unsigned SGPRPressure = 0;
402 unsigned VGPRPressure = 0;
403 IsPending = false;
404 if (DAG->isTrackingPressure()) {
405 if (!useGCNTrackers()) {
406 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
407 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
408 } else {
// With GCN trackers: read the current pressure from the tracker matching
// the scheduling direction.
409 GCNRPTracker *T = IsBottomUp
410 ? static_cast<GCNRPTracker *>(&UpwardTracker)
411 : static_cast<GCNRPTracker *>(&DownwardTracker);
412 SGPRPressure = T->getPressure().getSGPRNum();
413 VGPRPressure = T->getPressure().getArchVGPRNum();
414 }
415 }
416 LLVM_DEBUG(dbgs() << "Available Q:\n");
417 ReadyQueue &AQ = Zone.Available;
418 for (SUnit *SU : AQ) {
419
420 SchedCandidate TryCand(ZonePolicy);
421 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
422 VGPRPressure, IsBottomUp);
423 // Pass SchedBoundary only when comparing nodes from the same boundary.
424 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
425 tryCandidate(Cand, TryCand, ZoneArg);
426 if (TryCand.Reason != NoCand) {
427 // Initialize resource delta if needed in case future heuristics query it.
428 if (TryCand.ResDelta == SchedResourceDelta())
429 TryCand.initResourceDelta(Zone.DAG, SchedModel);
430 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
431 Cand.setBest(TryCand);
432 } else {
433 printCandidateDecision(TryCand, Cand);
434 }
435 }
436
437 if (!shouldCheckPending(Zone, SchedModel))
438 return;
439
// Second pass over Pending uses the stricter tryPendingCandidate()
// comparison and records that the pick came from the pending queue.
440 LLVM_DEBUG(dbgs() << "Pending Q:\n");
441 ReadyQueue &PQ = Zone.Pending;
442 for (SUnit *SU : PQ) {
443
444 SchedCandidate TryCand(ZonePolicy);
445 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
446 VGPRPressure, IsBottomUp);
447 // Pass SchedBoundary only when comparing nodes from the same boundary.
448 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
449 tryPendingCandidate(Cand, TryCand, ZoneArg);
450 if (TryCand.Reason != NoCand) {
451 // Initialize resource delta if needed in case future heuristics query it.
452 if (TryCand.ResDelta == SchedResourceDelta())
453 TryCand.initResourceDelta(Zone.DAG, SchedModel);
454 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
455 IsPending = true;
456 Cand.setBest(TryCand);
457 } else {
458 printCandidateDecision(TryCand, Cand);
459 }
460 }
461}
462
// This function is mostly cut and pasted from
// GenericScheduler::pickNodeBidirectional().
// Picks the best node from either boundary, preferring forced ("only
// choice") picks, then comparing the best Bot and Top candidates; sets
// \p PickedPending when the winning candidate came from a pending queue.
// NOTE(review): the signature line (original 465) and two debug-print lines
// (originals 497, 522) were lost in extraction.
466 bool &PickedPending) {
467 // Schedule as far as possible in the direction of no choice. This is most
468 // efficient, but also provides the best heuristics for CriticalPSets.
469 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
470 IsTopNode = false;
471 return SU;
472 }
473 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
474 IsTopNode = true;
475 return SU;
476 }
477 // Set the bottom-up policy based on the state of the current bottom zone
478 // and the instructions outside the zone, including the top zone.
479 CandPolicy BotPolicy;
480 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
481 // Set the top-down policy based on the state of the current top zone and
482 // the instructions outside the zone, including the bottom zone.
483 CandPolicy TopPolicy;
484 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
485
486 bool BotPending = false;
487 // See if BotCand is still valid (because we previously scheduled from Top).
488 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
489 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
490 BotCand.Policy != BotPolicy) {
491 BotCand.reset(CandPolicy());
492 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
493 BotPending,
494 /*IsBottomUp=*/true);
495 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
496 } else {
// Re-pick and verify that the cached candidate is still the best one.
498#ifndef NDEBUG
499 if (VerifyScheduling) {
500 SchedCandidate TCand;
501 TCand.reset(CandPolicy());
502 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
503 BotPending,
504 /*IsBottomUp=*/true);
505 assert(TCand.SU == BotCand.SU &&
506 "Last pick result should correspond to re-picking right now");
507 }
508#endif
509 }
510
511 bool TopPending = false;
512 // Check if the top Q has a better candidate.
513 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
514 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
515 TopCand.Policy != TopPolicy) {
516 TopCand.reset(CandPolicy());
517 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
518 TopPending,
519 /*IsBottomUp=*/false);
520 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
521 } else {
523#ifndef NDEBUG
524 if (VerifyScheduling) {
525 SchedCandidate TCand;
526 TCand.reset(CandPolicy());
527 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
528 TopPending,
529 /*IsBottomUp=*/false);
530 assert(TCand.SU == TopCand.SU &&
531 "Last pick result should correspond to re-picking right now");
532 }
533#endif
534 }
535
536 // Pick best from BotCand and TopCand.
537 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
538 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
// A pending candidate only beats a non-pending one via the stricter
// tryPendingCandidate(); the non-pending candidate starts as the incumbent.
539 SchedCandidate Cand = BotPending ? TopCand : BotCand;
540 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
541 PickedPending = BotPending && TopPending;
542
543 TryCand.Reason = NoCand;
544 if (BotPending || TopPending) {
// NOTE(review): this passes TopCand, but TryCand was just computed as the
// pending-side candidate above and upstream uses TryCand here -- when
// BotPending is true this compares the wrong candidate (and TopCand's
// Reason was not reset). Verify against llvm-project GCNSchedStrategy.cpp.
545 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
546 } else {
547 tryCandidate(Cand, TryCand, nullptr);
548 }
549
550 if (TryCand.Reason != NoCand) {
551 Cand.setBest(TryCand);
552 }
553
554 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
555
556 IsTopNode = Cand.AtTop;
557 return Cand.SU;
558}
559
// This function is mostly cut and pasted from
// GenericScheduler::pickNode().
// Top-level pick: honors OnlyTopDown / OnlyBottomUp region policy,
// otherwise picks bidirectionally; when the pick came from a pending queue
// it advances the zone's cycle past the node's ready cycle and any hazards.
// NOTE(review): the signature line (original 562) and the two
// `SU = <Zone>.pickOnlyChoice();` lines (originals 573, 585) were lost in
// extraction.
563 if (DAG->top() == DAG->bottom()) {
564 assert(Top.Available.empty() && Top.Pending.empty() &&
565 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
566 return nullptr;
567 }
568 bool PickedPending;
569 SUnit *SU;
570 do {
571 PickedPending = false;
572 if (RegionPolicy.OnlyTopDown) {
574 if (!SU) {
575 CandPolicy NoPolicy;
576 TopCand.reset(NoPolicy);
577 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
578 PickedPending,
579 /*IsBottomUp=*/false);
580 assert(TopCand.Reason != NoCand && "failed to find a candidate");
581 SU = TopCand.SU;
582 }
583 IsTopNode = true;
584 } else if (RegionPolicy.OnlyBottomUp) {
586 if (!SU) {
587 CandPolicy NoPolicy;
588 BotCand.reset(NoPolicy);
589 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
590 PickedPending,
591 /*IsBottomUp=*/true);
592 assert(BotCand.Reason != NoCand && "failed to find a candidate");
593 SU = BotCand.SU;
594 }
595 IsTopNode = false;
596 } else {
597 SU = pickNodeBidirectional(IsTopNode, PickedPending);
598 }
599 } while (SU->isScheduled);
600
// A pending pick is not ready at the current cycle: bump the zone's cycle
// to the node's ready cycle and past any hazards, then release pending.
601 if (PickedPending) {
602 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
603 SchedBoundary &Zone = IsTopNode ? Top : Bot;
604 unsigned CurrentCycle = Zone.getCurrCycle();
605 if (ReadyCycle > CurrentCycle)
606 Zone.bumpCycle(ReadyCycle);
607
608 // FIXME: checkHazard() doesn't give information about which cycle the
609 // hazard will resolve so just keep bumping the cycle by 1. This could be
610 // made more efficient if checkHazard() returned more details.
611 while (Zone.checkHazard(SU))
612 Zone.bumpCycle(Zone.getCurrCycle() + 1);
613
614 Zone.releasePending();
615 }
616
617 if (SU->isTopReady())
618 Top.removeReady(SU);
619 if (SU->isBottomReady())
620 Bot.removeReady(SU);
621
622 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
623 << *SU->getInstr());
624 return SU;
625}
626
627void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
628 if (useGCNTrackers()) {
629 MachineInstr *MI = SU->getInstr();
630 IsTopNode ? (void)DownwardTracker.advance(MI, false)
631 : UpwardTracker.recede(*MI);
632 }
633
634 return GenericScheduler::schedNode(SU, IsTopNode);
635}
636
// Advances CurrentStage to the first stage on the first call, otherwise to
// the next stage; returns true while a stage remains to run.
// NOTE(review): the function header (originals 642-643) was lost in
// extraction -- presumably GCNSchedStrategy::advanceStage(); confirm
// against upstream.
641
644 if (!CurrentStage)
645 CurrentStage = SchedStages.begin();
646 else
647 CurrentStage++;
648
649 return CurrentStage != SchedStages.end();
650}
651
// Returns true when another scheduling stage follows the current one.
// NOTE(review): the signature line (original 652-653) was lost in
// extraction.
654 return std::next(CurrentStage) != SchedStages.end();
655}
656
// Returns the stage following the current one; asserts that one exists.
// NOTE(review): the signature line (original 657) was lost in extraction.
658 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
659 return *std::next(CurrentStage);
660}
661
// Reduced candidate comparison used for nodes taken from the Pending queue:
// only PhysReg bias, excess/critical pressure, and resource heuristics are
// applied. NOTE(review): extraction dropped the signature line (original
// 662) and the leading lines of the two tryPressure/tryLess comparisons
// inside the SameBoundary section (originals 684, 690-691, 694, 696).
663 SchedCandidate &TryCand,
664 SchedBoundary *Zone) const {
665 // Initialize the candidate if needed.
666 if (!Cand.isValid()) {
667 TryCand.Reason = NodeOrder;
668 return true;
669 }
670
671 // Bias PhysReg Defs and copies to their uses and defined respectively.
672 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
673 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
674 return TryCand.Reason != NoCand;
675
676 // Avoid exceeding the target's limit.
677 if (DAG->isTrackingPressure() &&
678 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
679 RegExcess, TRI, DAG->MF))
680 return TryCand.Reason != NoCand;
681
682 // Avoid increasing the max critical pressure in the scheduled region.
683 if (DAG->isTrackingPressure() &&
685 TryCand, Cand, RegCritical, TRI, DAG->MF))
686 return TryCand.Reason != NoCand;
687
688 bool SameBoundary = Zone != nullptr;
689 if (SameBoundary) {
692 TryCand, Cand, ResourceReduce))
693 return TryCand.Reason != NoCand;
695 Cand.ResDelta.DemandedResources, TryCand, Cand,
697 return TryCand.Reason != NoCand;
698 }
699
700 return false;
701}
702
715
720
// tryCandidate override that unconditionally prioritizes latency reduction
// ahead of clustering and max-pressure heuristics (presumably
// GCNMaxILPSchedStrategy::tryCandidate -- the signature line, original 721,
// was lost in extraction; confirm against upstream). Extraction also
// dropped the leading lines of the resource comparisons (originals 749-750,
// 753, 755) and of one tryPressure call (original 786).
722 SchedCandidate &TryCand,
723 SchedBoundary *Zone) const {
724 // Initialize the candidate if needed.
725 if (!Cand.isValid()) {
726 TryCand.Reason = NodeOrder;
727 return true;
728 }
729
730 // Avoid spilling by exceeding the register limit.
731 if (DAG->isTrackingPressure() &&
732 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
733 RegExcess, TRI, DAG->MF))
734 return TryCand.Reason != NoCand;
735
736 // Bias PhysReg Defs and copies to their uses and defined respectively.
737 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
738 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
739 return TryCand.Reason != NoCand;
740
741 bool SameBoundary = Zone != nullptr;
742 if (SameBoundary) {
743 // Prioritize instructions that read unbuffered resources by stall cycles.
744 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
745 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
746 return TryCand.Reason != NoCand;
747
748 // Avoid critical resource consumption and balance the schedule.
751 TryCand, Cand, ResourceReduce))
752 return TryCand.Reason != NoCand;
754 Cand.ResDelta.DemandedResources, TryCand, Cand,
756 return TryCand.Reason != NoCand;
757
758 // Unconditionally try to reduce latency.
759 if (tryLatency(TryCand, Cand, *Zone))
760 return TryCand.Reason != NoCand;
761
762 // Weak edges are for clustering and other constraints.
763 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
764 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
765 return TryCand.Reason != NoCand;
766 }
767
768 // Keep clustered nodes together to encourage downstream peephole
769 // optimizations which may reduce resource requirements.
770 //
771 // This is a best effort to set things up for a post-RA pass. Optimizations
772 // like generating loads of multiple registers should ideally be done within
773 // the scheduler pass by combining the loads during DAG postprocessing.
774 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
775 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
776 bool CandIsClusterSucc =
777 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
778 bool TryCandIsClusterSucc =
779 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
780 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
781 Cluster))
782 return TryCand.Reason != NoCand;
783
784 // Avoid increasing the max critical pressure in the scheduled region.
785 if (DAG->isTrackingPressure() &&
787 TryCand, Cand, RegCritical, TRI, DAG->MF))
788 return TryCand.Reason != NoCand;
789
790 // Avoid increasing the max pressure of the entire region.
791 if (DAG->isTrackingPressure() &&
792 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
793 Cand, RegMax, TRI, DAG->MF))
794 return TryCand.Reason != NoCand;
795
796 if (SameBoundary) {
797 // Fall through to original instruction order.
798 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
799 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
800 TryCand.Reason = NodeOrder;
801 return true;
802 }
803 }
804 return false;
805}
806
812
813/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
814/// much as possible. This is achieved by:
815/// 1. Prioritize clustered operations before stall latency heuristic.
816/// 2. Prioritize long-latency-load before stall latency heuristic.
817///
818/// \param Cand provides the policy and current best candidate.
819/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
820/// \param Zone describes the scheduled zone that we are extending, or nullptr
821/// if Cand is from a different zone than TryCand.
822/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
// NOTE(review): extraction dropped the signature line (original 823) and
// the leading lines of three comparisons (originals 844, 915-916, 919, 921).
824 SchedCandidate &TryCand,
825 SchedBoundary *Zone) const {
826 // Initialize the candidate if needed.
827 if (!Cand.isValid()) {
828 TryCand.Reason = NodeOrder;
829 return true;
830 }
831
832 // Bias PhysReg Defs and copies to their uses and defined respectively.
833 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
834 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
835 return TryCand.Reason != NoCand;
836
837 if (DAG->isTrackingPressure()) {
838 // Avoid exceeding the target's limit.
839 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
840 RegExcess, TRI, DAG->MF))
841 return TryCand.Reason != NoCand;
842
843 // Avoid increasing the max critical pressure in the scheduled region.
845 TryCand, Cand, RegCritical, TRI, DAG->MF))
846 return TryCand.Reason != NoCand;
847 }
848
849 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
850 // get more benefit from clausing these memory instructions.
851 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
852 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
853 bool CandIsClusterSucc =
854 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
855 bool TryCandIsClusterSucc =
856 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
857 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
858 Cluster))
859 return TryCand.Reason != NoCand;
860
861 // We only compare a subset of features when comparing nodes between
862 // Top and Bottom boundary. Some properties are simply incomparable, in many
863 // other instances we should only override the other boundary if something
864 // is a clear good pick on one boundary. Skip heuristics that are more
865 // "tie-breaking" in nature.
866 bool SameBoundary = Zone != nullptr;
867 if (SameBoundary) {
868 // For loops that are acyclic path limited, aggressively schedule for
869 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
870 // heuristics to take precedence.
871 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
872 tryLatency(TryCand, Cand, *Zone))
873 return TryCand.Reason != NoCand;
874
875 // MaxMemoryClause-specific: Prioritize long latency memory load
876 // instructions in top-bottom order to hide more latency. The mayLoad check
877 // is used to exclude store-like instructions, which we do not want to
878 // scheduler them too early.
879 bool TryMayLoad =
880 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
881 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
882
883 if (TryMayLoad || CandMayLoad) {
884 bool TryLongLatency =
885 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
886 bool CandLongLatency =
887 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
888
889 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
890 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
891 Cand, Stall))
892 return TryCand.Reason != NoCand;
893 }
894 // Prioritize instructions that read unbuffered resources by stall cycles.
895 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
896 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
897 return TryCand.Reason != NoCand;
898 }
899
900 if (SameBoundary) {
901 // Weak edges are for clustering and other constraints.
902 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
903 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
904 return TryCand.Reason != NoCand;
905 }
906
907 // Avoid increasing the max pressure of the entire region.
908 if (DAG->isTrackingPressure() &&
909 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
910 Cand, RegMax, TRI, DAG->MF))
911 return TryCand.Reason != NoCand;
912
913 if (SameBoundary) {
914 // Avoid critical resource consumption and balance the schedule.
917 TryCand, Cand, ResourceReduce))
918 return TryCand.Reason != NoCand;
920 Cand.ResDelta.DemandedResources, TryCand, Cand,
922 return TryCand.Reason != NoCand;
923
924 // Avoid serializing long latency dependence chains.
925 // For acyclic path limited loops, latency was already checked above.
926 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
927 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
928 return TryCand.Reason != NoCand;
929
930 // Fall through to original instruction order.
931 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
932 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
933 TryCand.Reason = NodeOrder;
934 return true;
935 }
936 }
937
938 return false;
939}
940
    // NOTE(review): the constructor header line ("GCNScheduleDAGMILive::
    // GCNScheduleDAGMILive(") was lost in this capture; the parameter list and
    // init list below are verbatim.
    MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
    : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
      MFI(*MF.getInfo<SIMachineFunctionInfo>()),
      StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
      RegionLiveOuts(this, /*IsLiveOut=*/true) {

  // We want regions with a single MI to be scheduled so that we can reason
  // about them correctly during scheduling stages that move MIs between regions
  // (e.g., rematerialization).
  // NOTE(review): the statement this comment documents appears to have been
  // elided by the capture — confirm against upstream.
  LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
  if (RelaxedOcc) {
    // With relaxed occupancy, allow starting from the function's minimum
    // allowed occupancy rather than the current occupancy.
    MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
    if (MinOccupancy != StartingOccupancy)
      LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
                        << ".\n");
  }
}
960
// Factory: instantiate the concrete scheduling stage object for the given
// stage ID. Each stage keeps a back-reference to this DAG.
std::unique_ptr<GCNSchedStage>
GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
  switch (SchedStageID) {
    // NOTE(review): the "case GCNSchedStageID::..." labels were lost in this
    // capture; each make_unique below corresponds to one enumerator, in the
    // order: occupancy-initial, MFMA rewrite, unclustered high-RP, clustered
    // low-occupancy, pre-RA remat, ILP-initial, memory-clause-initial.
    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
    return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
    return std::make_unique<PreRARematStage>(SchedStageID, *this);
    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
    return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
                                                              *this);
  }

  llvm_unreachable("Unknown SchedStageID.");
}
983
  // NOTE(review): the enclosing signature ("void GCNScheduleDAGMILive::
  // schedule()" per the comment below) was lost in this capture.
  // Collect all scheduling regions. The actual scheduling is performed in
  // GCNScheduleDAGMILive::finalizeSchedule.
  Regions.push_back(std::pair(RegionBegin, RegionEnd));
}
989
// Recompute the actual register pressure of a region from live intervals.
// NOTE(review): the return-type line and the tracker declaration line were
// lost in this capture; RPTracker is presumably a GCNUpwardRPTracker declared
// just above the advance() call — confirm against upstream.
GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
  // An empty region's pressure is just the pressure of its live-in set.
  if (Regions[RegionIdx].first == Regions[RegionIdx].second)
    return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
  RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
                    &LiveIns[RegionIdx]);
  return RPTracker.moveMaxPressure();
}
999
                   // NOTE(review): the first line of this static helper's
                   // signature (return type, name, first parameter) was lost
                   // in this capture.
                   MachineBasicBlock::iterator RegionEnd) {
  // Return the last non-debug instruction of a non-empty region.
  assert(RegionBegin != RegionEnd && "Region must not be empty");
  return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
}
1005
// Compute live-in sets and max register pressure for every region of MBB in a
// single downward walk over the block, caching the block's live-out set for a
// unique successor so the successor's walk can start from it.
void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
                                                const MachineBasicBlock *MBB) {
  GCNDownwardRPTracker RPTracker(*LIS);

  // If the block has the only successor then live-ins of that successor are
  // live-outs of the current block. We can reuse calculated live set if the
  // successor will be sent to scheduling past current block.

  // However, due to the bug in LiveInterval analysis it may happen that two
  // predecessors of the same successor block have different lane bitmasks for
  // a live-out register. Workaround that by sticking to one-to-one relationship
  // i.e. one predecessor with one successor block.
  const MachineBasicBlock *OnlySucc = nullptr;
  if (MBB->succ_size() == 1) {
    auto *Candidate = *MBB->succ_begin();
    if (!Candidate->empty() && Candidate->pred_size() == 1) {
      SlotIndexes *Ind = LIS->getSlotIndexes();
      // Only cache for successors laid out after this block, since blocks are
      // visited in layout order.
      if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
        OnlySucc = Candidate;
    }
  }

  // Scheduler sends regions from the end of the block upwards.
  // Regions of one block are therefore recorded in reverse program order:
  // advance CurRegion to the last recorded region still inside MBB, which is
  // the region closest to the block's start.
  size_t CurRegion = RegionIdx;
  for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
    if (Regions[CurRegion].first->getParent() != MBB)
      break;
  --CurRegion;

  auto I = MBB->begin();
  auto LiveInIt = MBBLiveIns.find(MBB);
  auto &Rgn = Regions[CurRegion];
  auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
  if (LiveInIt != MBBLiveIns.end()) {
    // Reuse the live set cached by the (single) predecessor and track from the
    // top of the block. The cache entry is consumed here.
    auto LiveIn = std::move(LiveInIt->second);
    RPTracker.reset(*MBB->begin(), &LiveIn);
    MBBLiveIns.erase(LiveInIt);
  } else {
    // No cached live-ins: start tracking at the first region's first
    // instruction using the precomputed per-region live-in map.
    I = Rgn.first;
    auto LRS = BBLiveInMap.lookup(NonDbgMI);
#ifdef EXPENSIVE_CHECKS
    assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
#endif
    RPTracker.reset(*I, &LRS);
  }

  // Walk the block downwards. When the tracker reaches a region's first
  // instruction, snapshot that region's live-ins; when it reaches the region's
  // end, record the max pressure seen since the snapshot.
  for (;;) {
    I = RPTracker.getNext();

    if (Regions[CurRegion].first == I || NonDbgMI == I) {
      LiveIns[CurRegion] = RPTracker.getLiveRegs();
      RPTracker.clearMaxPressure();
    }

    if (Regions[CurRegion].second == I) {
      Pressure[CurRegion] = RPTracker.moveMaxPressure();
      // Stop once the requested region (lowest index in this block) is done.
      if (CurRegion-- == RegionIdx)
        break;
      auto &Rgn = Regions[CurRegion];
      NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
    }
    RPTracker.advanceBeforeNext();
    RPTracker.advanceToNext();
  }

  if (OnlySucc) {
    // Finish tracking to the block end and hand the live-out set to the
    // unique successor as its live-in cache entry.
    if (I != MBB->end()) {
      RPTracker.advanceBeforeNext();
      RPTracker.advanceToNext();
      RPTracker.advance(MBB->end());
    }
    MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
  }
}
1080
// Build a map from each region's first MI to the live register set before it.
// NOTE(review): the return-type line and the push_back argument line (the
// skipDebugInstructionsForward(...) expression) were lost in this capture.
GCNScheduleDAGMILive::getRegionLiveInMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionFirstMIs;
  RegionFirstMIs.reserve(Regions.size());
  // Regions are visited in reverse so getLiveRegMap sees increasing slot
  // indexes.
  for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
    RegionFirstMIs.push_back(

  return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
}
1092
// Build a map from each non-empty region's last MI to the live register set
// after it. NOTE(review): the return-type line was lost in this capture.
GCNScheduleDAGMILive::getRegionLiveOutMap() const {
  assert(!Regions.empty());
  std::vector<MachineInstr *> RegionLastMIs;
  RegionLastMIs.reserve(Regions.size());
  for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
    // Skip empty regions.
    if (RegionBegin == RegionEnd)
      continue;
    RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
  }
  return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
}
1106
  // NOTE(review): this is the body of RegionPressureMap::buildLiveRegMap();
  // the signature line was lost in this capture.
  // Rebuild both the MI -> live-set map and the region-index -> key-MI map,
  // keyed on last MI (live-outs) or first MI (live-ins) per IsLiveOut.
  IdxToInstruction.clear();

  RegionLiveRegMap =
      IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
  for (unsigned I = 0; I < DAG->Regions.size(); I++) {
    auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
    // Skip empty regions.
    if (RegionBegin == RegionEnd)
      continue;
    MachineInstr *RegionKey =
        IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
    IdxToInstruction[I] = RegionKey;
  }
}
1122
  // NOTE(review): this is the body of GCNScheduleDAGMILive::finalizeSchedule();
  // the signature line was lost in this capture.
  // Start actual scheduling here. This function is called by the base
  // MachineScheduler after all regions have been recorded by
  // GCNScheduleDAGMILive::schedule().
  // Size all per-region side tables to the number of recorded regions and
  // clear the region flag bitvectors before running the stages.
  LiveIns.resize(Regions.size());
  Pressure.resize(Regions.size());
  RegionsWithHighRP.resize(Regions.size());
  RegionsWithExcessRP.resize(Regions.size());
  RegionsWithIGLPInstrs.resize(Regions.size());
  RegionsWithHighRP.reset();
  RegionsWithExcessRP.reset();
  RegionsWithIGLPInstrs.reset();

  runSchedStages();
}
1138
// Drive all scheduling stages over all recorded regions: for each stage that
// initializes successfully, (re-)schedule every region it does not skip.
// NOTE(review): a few lines were lost in this capture — the guarded statements
// opening both DUMP_MAX_REG_PRESSURE blocks, and the per-region call that
// performs the actual scheduling between initGCNRegion() and
// finalizeGCNRegion().
void GCNScheduleDAGMILive::runSchedStages() {
  LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");

  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
  if (!Regions.empty()) {
    BBLiveInMap = getRegionLiveInMap();
    if (S.useGCNTrackers())
      RegionLiveOuts.buildLiveRegMap();
  }

#ifdef DUMP_MAX_REG_PRESSURE
    LIS->dump();
  }
#endif

  while (S.advanceStage()) {
    auto Stage = createSchedStage(S.getCurrentStage());
    if (!Stage->initGCNSchedStage())
      continue;

    for (auto Region : Regions) {
      RegionBegin = Region.first;
      RegionEnd = Region.second;
      // Setup for scheduling the region and check whether it should be skipped.
      if (!Stage->initGCNRegion()) {
        Stage->advanceRegion();
        exitRegion();
        continue;
      }

      if (S.useGCNTrackers()) {
        // Seed the strategy's RP trackers with this region's live-in and
        // live-out sets before scheduling.
        GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
        GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
        GCNRPTracker::LiveRegSet *RegionLiveIns =
            &LiveIns[Stage->getRegionIdx()];

        reinterpret_cast<GCNRPTracker *>(DownwardTracker)
            ->reset(MRI, *RegionLiveIns);
        reinterpret_cast<GCNRPTracker *>(UpwardTracker)
            ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
                             Stage->getRegionIdx()));
      }

      Stage->finalizeGCNRegion();
      Stage->advanceRegion();
      exitRegion();
    }

    Stage->finalizeGCNSchedStage();
  }

#ifdef DUMP_MAX_REG_PRESSURE
    LIS->dump();
  }
#endif
}
1202
#ifndef NDEBUG
// Debug-only pretty printer for GCNSchedStageID values.
// NOTE(review): the operator<< signature line and the "case
// GCNSchedStageID::..." labels were lost in this capture; each string below
// corresponds to one enumerator.
  switch (StageID) {
    OS << "Max Occupancy Initial Schedule";
    break;
    OS << "Instruction Rewriting Reschedule";
    break;
    OS << "Unclustered High Register Pressure Reschedule";
    break;
    OS << "Clustered Low Occupancy Reschedule";
    break;
    OS << "Pre-RA Rematerialize";
    break;
    OS << "Max ILP Initial Schedule";
    break;
    OS << "Max memory clause Initial Schedule";
    break;
  }

  return OS;
}
#endif
1232
1236
  // NOTE(review): this is the body of GCNSchedStage::initGCNSchedStage(); the
  // signature (and the GCNSchedStage constructor preceding it) were lost in
  // this capture. A stage can only run when live-interval analysis is
  // available.
  if (!DAG.LIS)
    return false;

  LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
  return true;
}
1244
1245void RewriteMFMAFormStage::findReachingDefs(
1246 MachineOperand &UseMO, LiveIntervals *LIS,
1247 SmallVectorImpl<SlotIndex> &DefIdxs) {
1248 MachineInstr *UseMI = UseMO.getParent();
1249 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1250 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1251
1252 // If the def is not a PHI, then it must be the only reaching def.
1253 if (!VNI->isPHIDef()) {
1254 DefIdxs.push_back(VNI->def);
1255 return;
1256 }
1257
1258 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1259 SmallVector<MachineBasicBlock *, 8> Worklist;
1260
1261 // Mark the predecessor blocks for traversal
1262 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1263 Worklist.push_back(PredMBB);
1264 Visited.insert(PredMBB);
1265 }
1266
1267 while (!Worklist.empty()) {
1268 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1269
1270 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1271 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1272
1273 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1274
1275 // If there is a def in this block, then add it to the list. This is the
1276 // reaching def of this path.
1277 if (!VNI->isPHIDef()) {
1278 DefIdxs.push_back(VNI->def);
1279 continue;
1280 }
1281
1282 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1283 if (Visited.insert(PredMBB).second)
1284 Worklist.push_back(PredMBB);
1285 }
1286 }
1287}
1288
// Collect every non-debug use operand of DefMI's def register that DefMI can
// reach (i.e. uses whose reaching-def set contains DefMI).
// NOTE(review): the first parameter-list line (presumably "MachineInstr
// *DefMI, LiveIntervals *LIS,") was lost in this capture.
void RewriteMFMAFormStage::findReachingUses(
    SmallVectorImpl<MachineOperand *> &ReachingUses) {
  SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
  for (MachineOperand &UseMO :
       DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
    SmallVector<SlotIndex, 8> ReachingDefIndexes;
    findReachingDefs(UseMO, LIS, ReachingDefIndexes);

    // If we find a use that contains this DefMI in its reachingDefs, then it is
    // a reaching use.
    if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
          return SlotIndex::isSameInstr(RDIdx, DefIdx);
        }))
      ReachingUses.push_back(&UseMO);
  }
}
1306
  // NOTE(review): this is the body of RewriteMFMAFormStage::
  // initGCNSchedStage(); the signature, the per-region pressure read inside
  // the loop, and the CopyForUse/CopyForDef declarations were lost in this
  // capture.
  // We only need to run this pass if the architecture supports AGPRs.
  // Additionally, we don't use AGPRs at occupancy levels above 1 so there
  // is no need for this pass in that case, either.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
    return false;

  // Flag every region whose ArchVGPR pressure exceeds the addressable limit.
  RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
  RegionsWithExcessArchVGPR.reset();
  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
      RegionsWithExcessArchVGPR[Region] = true;
  }

  if (RegionsWithExcessArchVGPR.none())
    return false;

  TII = ST.getInstrInfo();
  SRI = ST.getRegisterInfo();

  std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;

  if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
    return false;

  int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);

  // If we haven't found the beneficial conditions, prefer the VGPR form which
  // may result in less cross RC copies.
  if (Cost > 0)
    return false;

  return rewrite(RewriteCands);
}
1345
    // NOTE(review): this is the body of UnclusteredHighRPStage::
    // initGCNSchedStage(); the signature, the two guard conditions preceding
    // the early returns below, and the mutation argument passed to
    // addMutation() were lost in this capture.
    return false;

    return false;

  // Nothing to do when no region is pressure-limited.
  if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
    return false;

  SavedMutations.swap(DAG.Mutations);
  DAG.addMutation(

  InitialOccupancy = DAG.MinOccupancy;
  // Aggressively try to reduce register pressure in the unclustered high RP
  // stage. Temporarily increase occupancy target in the region.
  TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
                            ? InitialOccupancy + 1
                            : InitialOccupancy;
  IsAnyRegionScheduled = false;
  S.SGPRLimitBias = S.HighRPSGPRBias;
  S.VGPRLimitBias = S.HighRPVGPRBias;

  LLVM_DEBUG(
      dbgs()
      << "Retrying function scheduling without clustering. "
         "Aggressively try to reduce register pressure to achieve occupancy "
      << TempTargetOccupancy << ".\n");

  return true;
}
1378
    // NOTE(review): this is the body of ClusteredLowOccStage::
    // initGCNSchedStage(); the signature and the two guard conditions
    // preceding the early returns below were lost in this capture.
    return false;

    return false;

  // Don't bother trying to improve ILP in lower RP regions if occupancy has not
  // been dropped. All regions will have already been scheduled with the ideal
  // occupancy targets.
  if (DAG.StartingOccupancy <= DAG.MinOccupancy)
    return false;

  LLVM_DEBUG(
      dbgs() << "Retrying function scheduling with lowest recorded occupancy "
             << DAG.MinOccupancy << ".\n");
  return true;
}
1397
1398/// Allows to easily filter for this stage's debug output.
1399#define REMAT_PREFIX "[PreRARemat] "
1400#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1401
1402#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1403Printable PreRARematStage::ScoredRemat::print() const {
1404 return Printable([&](raw_ostream &OS) {
1405 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1406 });
1407}
1408#endif
1409
  // NOTE(review): this is the body of PreRARematStage::initGCNSchedStage();
  // the signature line was lost in this capture.
  // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
  // regions in between the defs and region we sunk the def to. Will need to be
  // fixed if there is another pass after this pass.
  assert(!S.hasNextStage());

  if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
    return false;

  // Maps all MIs (except lone terminators, which are not part of any region) to
  // their parent region. Non-lone terminators are considered part of the region
  // they delimitate.
  DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());

  // Before performing any IR modification record the parent region of each MI
  // and the parent MBB of each region.
  const unsigned NumRegions = DAG.Regions.size();
  for (unsigned I = 0; I < NumRegions; ++I) {
    RegionBoundaries Region = DAG.Regions[I];
    for (auto MI = Region.first; MI != Region.second; ++MI)
      MIRegion.insert({&*MI, I});
    MachineBasicBlock *ParentMBB = Region.first->getParent();
    if (Region.second != ParentMBB->end())
      MIRegion.insert({&*Region.second, I});
    RegionBB.push_back(ParentMBB);
  }

#ifndef NDEBUG
  // Debug helper: dump the set of regions whose RP target is not yet met.
  auto PrintTargetRegions = [&]() -> void {
    if (TargetRegions.none()) {
      dbgs() << REMAT_PREFIX << "No target regions\n";
      return;
    }
    dbgs() << REMAT_PREFIX << "Target regions:\n";
    for (unsigned I : TargetRegions.set_bits())
      dbgs() << REMAT_PREFIX << "  [" << I << "] " << RPTargets[I] << '\n';
  };
  // Debug helper: render a rematerializable register as
  // "[DefRegion -- live-through regions --> UseRegion] <def MI>".
  auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
    return Printable([&, Remat](raw_ostream &OS) {
      // Concatenate all region numbers in which the register is unused and
      // live-through.
      bool HasLiveThroughRegion = false;
      OS << '[' << Remat.DefRegion << " -";
      for (unsigned I = 0; I < NumRegions; ++I) {
        if (Remat.isUnusedLiveThrough(I)) {
          if (HasLiveThroughRegion) {
            OS << ',';
          } else {
            OS << "- ";
            HasLiveThroughRegion = true;
          }
          OS << I;
        }
      }
      if (HasLiveThroughRegion)
        OS << " -";
      OS << "-> " << Remat.UseRegion << "] ";
      Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
                         /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
    });
  };
#endif

  // Set an objective for the stage based on current RP in each region.
  REMAT_DEBUG({
    dbgs() << "Analyzing ";
    MF.getFunction().printAsOperand(dbgs(), false);
    dbgs() << ": ";
  });
  if (!setObjective()) {
    LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
                      << MFI.getMaxWavesPerEU() << '\n');
    return false;
  }
  LLVM_DEBUG({
    if (TargetOcc) {
      dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
    } else {
      dbgs() << "reduce spilling (minimum target occupancy is "
             << MFI.getMinWavesPerEU() << ")\n";
    }
    PrintTargetRegions();
  });

  if (!collectRematRegs(MIRegion)) {
    REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
    return false;
  }
  const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
  REMAT_DEBUG({
    dbgs() << "Rematerializable registers:\n";
    for (const RematReg &Remat : RematRegs)
      dbgs() << REMAT_PREFIX << "  " << PrintRematReg(Remat) << '\n';
    dbgs() << REMAT_PREFIX << "Region frequencies\n";
    for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
      dbgs() << REMAT_PREFIX << "  [" << I << "] ";
      if (Freq)
        dbgs() << Freq;
      else
        dbgs() << "unknown ";
      dbgs() << " | " << *DAG.Regions[I].first;
    }
  });

  SmallVector<ScoredRemat> ScoredRemats;
  for (RematReg &Remat : RematRegs)
    ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);

// Rematerialize registers in successive rounds until all RP targets are
// satisfied or until we run out of rematerialization candidates.
#ifndef NDEBUG
  unsigned RoundNum = 0;
#endif
  BitVector RecomputeRP(NumRegions);
  do {
    assert(!ScoredRemats.empty() && "no more remat candidates");

    // (Re-)Score and (re-)sort all remats in increasing score order.
    for (ScoredRemat &Remat : ScoredRemats)
      Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
    sort(ScoredRemats);

    REMAT_DEBUG({
      dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
             << REMAT_PREFIX
             << "Candidates with non-null score, in rematerialization order:\n";
      for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
        if (RematDecision.hasNullScore())
          break;
        dbgs() << REMAT_PREFIX << "  " << RematDecision.print() << " | "
               << *RematDecision.Remat->DefMI;
      }
      PrintTargetRegions();
    });

    RecomputeRP.reset();
    unsigned RematIdx = ScoredRemats.size();

    // Rematerialize registers in decreasing score order until we estimate
    // that all RP targets are satisfied or until rematerialization candidates
    // are no longer useful to decrease RP.
    for (; RematIdx && TargetRegions.any(); --RematIdx) {
      const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
      // Stop rematerializing on encountering a null score. Since scores
      // monotonically decrease as we rematerialize, we know there is nothing
      // useful left to do in such cases, even if we were to re-score.
      if (Candidate.hasNullScore()) {
        RematIdx = 0;
        break;
      }

      RematReg &Remat = *Candidate.Remat;
      // When previous rematerializations in this round have already satisfied
      // RP targets in all regions this rematerialization can impact, we have a
      // good indication that our scores have diverged significantly from
      // reality, in which case we interrupt this round and re-score. This also
      // ensures that every rematerialization we perform is possibly impactful
      // in at least one target region.
      if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
        break;

      REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
      MachineInstr *RematMI =
          Candidate.rematerialize(RecomputeRP, RPTargets, DAG);
      RescheduleRegions |= Remat.Live;

      // Every rematerialization we do here is likely to move the instruction
      // into a higher frequency region, increasing the total sum latency of the
      // instruction itself. This is acceptable if we are eliminating a spill in
      // the process, but when the goal is increasing occupancy we get nothing
      // out of rematerialization if occupancy is not increased in the end; in
      // such cases we want to roll back the rematerialization.
      if (TargetOcc) {
        RollbackInfo &Rollback = Rollbacks.emplace_back(&Remat);
        Rollback.RematMI = RematMI;
        // Make the original MI a debug value so that it does not influence
        // scheduling and replace all read registers with a sentinel register to
        // prevent operands from appearing in use-lists of other MIs during LIS
        // updates. Store mappings between operand indices and original
        // registers for potential rollback.
        Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE));
        for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
          if (MO.isReg() && MO.readsReg()) {
            Rollback.RegMap.insert({Idx, MO.getReg()});
            MO.setReg(Register());
          }
        }
      } else {
        // Just delete the original instruction if it cannot be rolled back.
        DAG.deleteMI(Remat.DefRegion, Remat.DefMI);
      }

      unsetSatisfiedRPTargets(Remat.Live);
    }

    REMAT_DEBUG({
      if (!TargetRegions.any()) {
        dbgs() << "** Interrupt round on all targets achieved\n";
      } else if (RematIdx) {
        dbgs() << "** Interrupt round on stale score for "
               << *ScoredRemats[RematIdx - 1].Remat->DefMI;
      } else {
        dbgs() << "** Stop on exhausted rematerialization candidates\n";
      }
    });

    // Peel off registers we already rematerialized from the vector's tail.
    ScoredRemats.truncate(RematIdx);
  } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
           !ScoredRemats.empty());
  if (RescheduleRegions.none())
    return false;

  // Commit all pressure changes to the DAG and compute minimum achieved
  // occupancy in impacted regions.
  REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
  unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
  for (unsigned I : RescheduleRegions.set_bits()) {
    DAG.Pressure[I] = RPTargets[I].getCurrentRP();
    REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
                       << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
                       << " (" << RPTargets[I] << ")\n");
  }
  // The achieved occupancy is the minimum over all regions, not just the
  // rescheduled ones.
  AchievedOcc = MFI.getMaxWavesPerEU();
  for (const GCNRegPressure &RP : DAG.Pressure) {
    AchievedOcc =
        std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
  }

  REMAT_DEBUG({
    dbgs() << "Retrying function scheduling with new min. occupancy of "
           << AchievedOcc << " from rematerializing (original was "
           << DAG.MinOccupancy;
    if (TargetOcc)
      dbgs() << ", target was " << *TargetOcc;
    dbgs() << ")\n";
  });

  DAG.setTargetOccupancy(getStageTargetOccupancy());
  return true;
}
1651
  // NOTE(review): this is the body of GCNSchedStage::finalizeGCNSchedStage();
  // the signature line was lost in this capture. Close out the last block
  // processed by this stage.
  DAG.finishBlock();
  LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
}
1656
  // NOTE(review): this is the body of UnclusteredHighRPStage::
  // finalizeGCNSchedStage(); the signature, the LLVM_DEBUG(...) openers of the
  // two debug messages, and the trailing call to the base-class finalize were
  // lost in this capture.
  // Restore the mutations and RP biases that were swapped out/overridden when
  // this stage was initialized.
  SavedMutations.swap(DAG.Mutations);
  S.SGPRLimitBias = S.VGPRLimitBias = 0;
  if (DAG.MinOccupancy > InitialOccupancy) {
    assert(IsAnyRegionScheduled);
          << " stage successfully increased occupancy to "
          << DAG.MinOccupancy << '\n');
  } else if (!IsAnyRegionScheduled) {
    assert(DAG.MinOccupancy == InitialOccupancy);
          << ": No regions scheduled, min occupancy stays at "
          << DAG.MinOccupancy << ", MFI occupancy stays at "
          << MFI.getOccupancy() << ".\n");
  }

}
1675
  // NOTE(review): this is the body of GCNSchedStage::initGCNRegion(); the
  // signature and several continuation lines (the condition guarding the
  // IGLP-aware branch below, the second half of the IsInitialStage
  // initializer, and the second SchedulingPhase operand) were lost in this
  // capture.
  // Skip empty scheduling region.
  if (DAG.begin() == DAG.end())
    return false;

  // Check whether this new region is also a new block.
  if (DAG.RegionBegin->getParent() != CurrentMBB)
    setupNewBlock();

  unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
  DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);

  // Skip regions with 1 schedulable instruction.
  if (DAG.begin() == std::prev(DAG.end()))
    return false;

  LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
  LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
                    << " " << CurrentMBB->getName()
                    << "\n From: " << *DAG.begin() << " To: ";
             if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
             else dbgs() << "End";
             dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

  // Save original instruction order before scheduling for possible revert.
  Unsched.clear();
  Unsched.reserve(DAG.NumRegionInstrs);
    // While recording the original order, also flag regions that contain IGLP
    // mutation-only instructions.
    const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
    for (auto &I : DAG) {
      Unsched.push_back(&I);
      if (SII->isIGLPMutationOnly(I.getOpcode()))
        DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
    }
  } else {
    for (auto &I : DAG)
      Unsched.push_back(&I);
  }

  PressureBefore = DAG.Pressure[RegionIdx];

  LLVM_DEBUG(
      dbgs() << "Pressure before scheduling:\nRegion live-ins:"
             << print(DAG.LiveIns[RegionIdx], DAG.MRI)
             << "Region live-in pressure: "
             << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
             << "Region register pressure: " << print(PressureBefore));

  S.HasHighPressure = false;
  S.KnownExcessRP = isRegionWithExcessRP();

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
    SavedMutations.clear();
    SavedMutations.swap(DAG.Mutations);
    bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
    DAG.addMutation(createIGroupLPDAGMutation(
        IsInitialStage ? AMDGPU::SchedulingPhase::Initial
  }

  return true;
}
1741
  // NOTE(review): this is the body of UnclusteredHighRPStage::initGCNRegion();
  // the signature line was lost in this capture.
  // Only reschedule regions that have excess register pressure (i.e. spilling)
  // or had minimum occupancy at the beginning of the stage (as long as
  // rescheduling of previous regions did not make occupancy drop back down to
  // the initial minimum).
  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
  // If no region has been scheduled yet, the DAG has not yet been updated with
  // the occupancy target. So retrieve it from the temporary.
  unsigned CurrentTargetOccupancy =
      IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
  if (!DAG.RegionsWithExcessRP[RegionIdx] &&
      (CurrentTargetOccupancy <= InitialOccupancy ||
       DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
           InitialOccupancy))
    return false;

  bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
  // If this is the first region scheduled during this stage, make the target
  // occupancy changes in the DAG and MFI.
  if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
    IsAnyRegionScheduled = true;
    if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
      DAG.setTargetOccupancy(TempTargetOccupancy);
  }
  return IsSchedulingThisRegion;
}
1768
  // NOTE(review): this is the body of ClusteredLowOccStage::initGCNRegion();
  // the signature and the final return statement (presumably delegating to
  // GCNSchedStage::initGCNRegion()) were lost in this capture.
  // We may need to reschedule this region if it wasn't rescheduled in the last
  // stage, or if we found it was testing critical register pressure limits in
  // the unclustered reschedule stage. The latter is because we may not have
  // been able to raise the min occupancy in the previous stage so the region
  // may be overly constrained even if it was already rescheduled.
  if (!DAG.RegionsWithHighRP[RegionIdx])
    return false;

}
1780
  // NOTE(review): this is the body of PreRARematStage::initGCNRegion(); the
  // signature line was lost in this capture. Only reschedule regions impacted
  // by rematerialization.
  return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
}
1784
  // NOTE(review): this is the body of GCNSchedStage::setupNewBlock(); the
  // signature and the condition guarding computeBlockPressure() were lost in
  // this capture. Finish the previous block (if any) and start tracking the
  // new one.
  if (CurrentMBB)
    DAG.finishBlock();

  CurrentMBB = DAG.RegionBegin->getParent();
  DAG.startBlock(CurrentMBB);
  // Get real RP for the region if it hasn't been calculated before. After the
  // initial schedule stage real RP will be collected after scheduling.
    DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
1798
  // NOTE(review): this is the body of GCNSchedStage::finalizeGCNRegion(); the
  // signature, the call performing the revert check (per the comment below),
  // and the condition guarding the mutation swap were lost in this capture.
  // Record the possibly-updated region boundaries after scheduling.
  DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
  if (S.HasHighPressure)
    DAG.RegionsWithHighRP[RegionIdx] = true;

  // Revert scheduling if we have dropped occupancy or there is some other
  // reason that the original schedule is better.

  if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
    SavedMutations.swap(DAG.Mutations);
}
1812
  // NOTE(review): the enclosing signature (a PreRARematStage region-finalize
  // hook, by the members it touches) was lost in this capture.
  // When the goal is to increase occupancy, all regions must reach the target
  // occupancy for rematerializations to be possibly useful, otherwise we will
  // just hurt latency for no benefit. If minimum occupancy drops below the
  // target there is no point in trying to re-schedule further regions.
  if (!TargetOcc)
    return;
  // Record enough state to revert this region's schedule if the stage is
  // rolled back.
  RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
  if (DAG.MinOccupancy < *TargetOcc) {
    REMAT_DEBUG(dbgs() << "Region " << RegionIdx
                       << " cannot meet occupancy target, interrupting "
                          "re-scheduling in all regions\n");
    RescheduleRegions.reset();
  }
}
1829
  // NOTE(review): this is the body of GCNSchedStage::checkScheduling(); the
  // signature and the call made in the revert branch at the bottom were lost
  // in this capture.
  // Check the results of scheduling.
  PressureAfter = DAG.getRealRegPressure(RegionIdx);

  LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
  LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");

  unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();

  if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
      PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
    DAG.Pressure[RegionIdx] = PressureAfter;

    // Early out if we have achieved the occupancy target.
    LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
    return;
  }

  // Compare occupancies before/after, both clamped to the stage's target.
  unsigned TargetOccupancy = std::min(
      S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
  unsigned WavesAfter = std::min(
      TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
  unsigned WavesBefore = std::min(
      TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
  LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
                    << ", after " << WavesAfter << ".\n");

  // We may not be able to keep the current target occupancy because of the just
  // scheduled region. We might still be able to revert scheduling if the
  // occupancy before was higher, or if the current schedule has register
  // pressure higher than the excess limits which could lead to more spilling.
  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);

  // Allow memory bound functions to drop to 4 waves if not limited by an
  // attribute.
  if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
      WavesAfter >= MFI.getMinAllowedOccupancy()) {
    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
                      << MFI.getMinAllowedOccupancy() << " waves\n");
    NewOccupancy = WavesAfter;
  }

  if (NewOccupancy < DAG.MinOccupancy) {
    DAG.MinOccupancy = NewOccupancy;
    MFI.limitOccupancy(DAG.MinOccupancy);
    LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                      << DAG.MinOccupancy << ".\n");
  }
  // The maximum number of arch VGPR on non-unified register file, or the
  // maximum VGPR + AGPR in the unified register file case.
  unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
  // The maximum number of arch VGPR for both unified and non-unified register
  // file.
  unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
  unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);

  // Flag regions that exceed any hard register-file limit as high/excess RP.
  if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
      PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
      PressureAfter.getAGPRNum() > MaxArchVGPRs ||
      PressureAfter.getSGPRNum() > MaxSGPRs) {
    DAG.RegionsWithHighRP[RegionIdx] = true;
    DAG.RegionsWithExcessRP[RegionIdx] = true;
  }

  // Revert if this region's schedule would cause a drop in occupancy or
  // spilling.
  if (shouldRevertScheduling(WavesAfter)) {
    std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
  } else {
    DAG.Pressure[RegionIdx] = PressureAfter;
  }
}
1903
1904unsigned
1905GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1906 DenseMap<unsigned, unsigned> &ReadyCycles,
1907 const TargetSchedModel &SM) {
1908 unsigned ReadyCycle = CurrCycle;
1909 for (auto &D : SU.Preds) {
1910 if (D.isAssignedRegDep()) {
1911 MachineInstr *DefMI = D.getSUnit()->getInstr();
1912 unsigned Latency = SM.computeInstrLatency(DefMI);
1913 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1914 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1915 }
1916 }
1917 ReadyCycles[SU.NodeNum] = ReadyCycle;
1918 return ReadyCycle;
1919}
1920
1921#ifndef NDEBUG
1923 bool operator()(std::pair<MachineInstr *, unsigned> A,
1924 std::pair<MachineInstr *, unsigned> B) const {
1925 return A.second < B.second;
1926 }
1927};
1928
/// Debug helper: prints every instruction of a region together with the cycle
/// at which it becomes ready to issue, and flags any gap of more than one
/// cycle between consecutive ready cycles as a pipeline bubble.
static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
                               EarlierIssuingCycle> &ReadyCycles) {
  if (ReadyCycles.empty())
    return;
  // All entries belong to the same scheduling region, hence the same MBB.
  unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
  dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
         << " ##################\n# Cycle #\t\t\tInstruction "
            "             "
            "                            \n";
  unsigned IPrev = 1;
  for (auto &I : ReadyCycles) {
    // A jump of more than one cycle between consecutive ready cycles means
    // the machine had nothing to issue in between.
    if (I.second > IPrev + 1)
      dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
             << " CYCLES DETECTED ******************************\n\n";
    dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
    IPrev = I.second;
  }
}
1947#endif
1948
1949ScheduleMetrics
1950GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1951#ifndef NDEBUG
1952 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1953 ReadyCyclesSorted;
1954#endif
1955 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1956 unsigned SumBubbles = 0;
1957 DenseMap<unsigned, unsigned> ReadyCycles;
1958 unsigned CurrCycle = 0;
1959 for (auto &SU : InputSchedule) {
1960 unsigned ReadyCycle =
1961 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1962 SumBubbles += ReadyCycle - CurrCycle;
1963#ifndef NDEBUG
1964 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1965#endif
1966 CurrCycle = ++ReadyCycle;
1967 }
1968#ifndef NDEBUG
1969 LLVM_DEBUG(
1970 printScheduleModel(ReadyCyclesSorted);
1971 dbgs() << "\n\t"
1972 << "Metric: "
1973 << (SumBubbles
1974 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1975 : 1)
1976 << "\n\n");
1977#endif
1978
1979 return ScheduleMetrics(CurrCycle, SumBubbles);
1980}
1981
1984#ifndef NDEBUG
1985 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1986 ReadyCyclesSorted;
1987#endif
1988 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1989 unsigned SumBubbles = 0;
1990 DenseMap<unsigned, unsigned> ReadyCycles;
1991 unsigned CurrCycle = 0;
1992 for (auto &MI : DAG) {
1993 SUnit *SU = DAG.getSUnit(&MI);
1994 if (!SU)
1995 continue;
1996 unsigned ReadyCycle =
1997 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1998 SumBubbles += ReadyCycle - CurrCycle;
1999#ifndef NDEBUG
2000 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
2001#endif
2002 CurrCycle = ++ReadyCycle;
2003 }
2004#ifndef NDEBUG
2005 LLVM_DEBUG(
2006 printScheduleModel(ReadyCyclesSorted);
2007 dbgs() << "\n\t"
2008 << "Metric: "
2009 << (SumBubbles
2010 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2011 : 1)
2012 << "\n\n");
2013#endif
2014
2015 return ScheduleMetrics(CurrCycle, SumBubbles);
2016}
2017
2018bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2019 if (WavesAfter < DAG.MinOccupancy)
2020 return true;
2021
2022 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2023 if (DAG.MFI.isDynamicVGPREnabled()) {
2024 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2025 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2026 PressureBefore.getVGPRNum(false));
2027 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2028 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2029 PressureAfter.getVGPRNum(false));
2030 if (BlocksAfter > BlocksBefore)
2031 return true;
2032 }
2033
2034 return false;
2035}
2036
2039 return false;
2040
2042 return true;
2043
2044 if (mayCauseSpilling(WavesAfter))
2045 return true;
2046
2047 return false;
2048}
2049
2051 // If RP is not reduced in the unclustered reschedule stage, revert to the
2052 // old schedule.
2053 if ((WavesAfter <=
2054 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
2055 mayCauseSpilling(WavesAfter)) ||
2057 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2058 return true;
2059 }
2060
2061 // Do not attempt to relax schedule even more if we are already spilling.
2063 return false;
2064
2065 LLVM_DEBUG(
2066 dbgs()
2067 << "\n\t *** In shouldRevertScheduling ***\n"
2068 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2069 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
2070 LLVM_DEBUG(
2071 dbgs()
2072 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2074 unsigned OldMetric = MBefore.getMetric();
2075 unsigned NewMetric = MAfter.getMetric();
2076 unsigned WavesBefore = std::min(
2077 S.getTargetOccupancy(),
2078 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
2079 unsigned Profit =
2080 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2082 NewMetric) /
2084 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2085 << MAfter << "Profit: " << Profit << "\n");
2086 return Profit < ScheduleMetrics::ScaleFactor;
2087}
2088
2091 return false;
2092
2094 return true;
2095
2096 if (mayCauseSpilling(WavesAfter))
2097 return true;
2098
2099 return false;
2100}
2101
2103 // When trying to increase occupancy (TargetOcc == true) the stage manages
2104 // region reverts globally (all or none), so we always return false here.
2105 return !TargetOcc && mayCauseSpilling(WavesAfter);
2106}
2107
2109 if (mayCauseSpilling(WavesAfter))
2110 return true;
2111
2112 return false;
2113}
2114
2116 unsigned WavesAfter) {
2117 return mayCauseSpilling(WavesAfter);
2118}
2119
2120bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2121 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2123 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2124 return true;
2125 }
2126
2127 return false;
2128}
2129
2132 ArrayRef<MachineInstr *> MIOrder) {
2133 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2134 DAG.Regions[RegionIdx].second)) ==
2135 MIOrder.size() &&
2136 "instruction number mismatch");
2137 if (MIOrder.empty())
2138 return;
2139
2140 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2141
2142 // Reconstruct MI sequence by moving instructions in desired order before
2143 // the current region's start.
2144 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2145 for (MachineInstr *MI : MIOrder) {
2146 // Either move the next MI in order before the end of the region or move the
2147 // region end past the MI if it is at the correct position.
2148 MachineBasicBlock::iterator MII = MI->getIterator();
2149 if (MII != RegionEnd) {
2150 // Will subsequent splice move MI up past a non-debug instruction?
2151 bool NonDebugReordered =
2152 !MI->isDebugInstr() &&
2153 skipDebugInstructionsForward(RegionEnd, MII) != MII;
2154 MBB->splice(RegionEnd, MBB, MI);
2155 // Only update LiveIntervals information if non-debug instructions are
2156 // reordered. Otherwise debug instructions could cause code generation to
2157 // change.
2158 if (NonDebugReordered)
2159 DAG.LIS->handleMove(*MI, true);
2160 } else {
2161 ++RegionEnd;
2162 }
2163 if (MI->isDebugInstr()) {
2164 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2165 continue;
2166 }
2167
2168 // Reset read-undef flags and update them later.
2169 for (MachineOperand &Op : MI->all_defs())
2170 Op.setIsUndef(false);
2171 RegisterOperands RegOpers;
2172 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
2173 if (DAG.ShouldTrackLaneMasks) {
2174 // Adjust liveness and add missing dead+read-undef flags.
2175 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
2176 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
2177 } else {
2178 // Adjust for missing dead-def flags.
2179 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
2180 }
2181 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2182 }
2183
2184 // The region end doesn't change throughout scheduling since it itself is
2185 // outside the region (whether that is a MBB end or a terminator MI).
2186 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2187 DAG.Regions[RegionIdx].first = MIOrder.front();
2188}
2189
2190bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2191
2192 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2193 return false;
2194 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
2195}
2196
/// Scans the function for MFMA rewrite candidates and speculatively rewrites
/// them to their AGPR-destination form so register pressure can be recomputed
/// under the rewrite.
///
/// For each candidate it also records where copies would be needed to make the
/// rewrite legal:
///  - \p CopyForDef collects non-MFMA instructions that define a register the
///    rewritten MFMAs read/write (a copy must follow each such def);
///  - \p CopyForUse collects, per basic block, registers whose non-MFMA users
///    would need a copy (one copy per register per user block).
/// \p RewriteCands receives each rewritten instruction paired with its
/// original opcode so the rewrite can later be undone.
/// \returns true if at least one instruction was rewritten.
///
/// NOTE(review): this mutates instruction descriptors and register classes in
/// place; callers are expected to restore them (see getRewriteCost) if the
/// rewrite is not kept.
bool RewriteMFMAFormStage::initHeuristics(
    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  bool Changed = false;

  // Prepare for the heuristics
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isRewriteCandidate(&MI))
        continue;

      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
      assert(ReplacementOp != -1);

      // Remember the original opcode, then switch to the AGPR-dst form.
      RewriteCands.push_back({&MI, MI.getOpcode()});
      MI.setDesc(TII->get(ReplacementOp));

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2->isReg()) {
        SmallVector<SlotIndex, 8> Src2ReachingDefs;
        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);

        // For any definition of the src2 register which is non-MFMA, we
        // insert a copy.
        for (SlotIndex RDIdx : Src2ReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
          if (!TII->isMAI(*RD))
            CopyForDef.insert(RD);
        }
      }

      MachineOperand &Dst = MI.getOperand(0);
      SmallVector<MachineOperand *, 8> DstReachingUses;

      findReachingUses(&MI, DAG.LIS, DstReachingUses);

      for (MachineOperand *RUOp : DstReachingUses) {
        // MFMA users can consume the AGPR result directly; skip them.
        if (TII->isMAI(*RUOp->getParent()))
          continue;

        // For any user of the result of the MFMA which is not an MFMA, we
        // insert a copy. For a given register, we will only insert one copy
        // per user block.
        CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());

        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);

        for (SlotIndex RDIndex : DstUsesReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
          if (TII->isMAI(*RD))
            continue;

          // For any definition of the user of the MFMA which is not an MFMA,
          // we insert a copy. We do this to transform all the reaching defs
          // of this use to AGPR. By doing this, we can insert a copy from
          // AGPR to VGPR at the user rather than after the MFMA.
          CopyForDef.insert(RD);
        }
      }

      // Do the rewrite to allow for updated RP calculation.
      const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
      DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
      if (Src2->isReg()) {
        // Have to get src types separately since subregs may cause C and D
        // registers to be different types even though the actual operand is
        // the same size.
        const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
        const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
        DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
      }
      Changed = true;
    }
  }

  return Changed;
}
2277
/// Estimates the net cost of committing the speculative MFMA→AGPR rewrite
/// performed by initHeuristics.
///
/// The cost is the best (most negative) block-frequency-weighted reduction in
/// VGPR spill cost across regions that exceed the ArchVGPR budget, plus the
/// block-frequency-weighted cost of all copies that the rewrite would require
/// (\p CopyForDef and \p CopyForUse). A positive return value means the
/// rewrite increases spilling somewhere and should not be applied.
///
/// As a side effect, this restores the original opcodes and VGPR register
/// classes of all \p RewriteCands (the speculative rewrite is undone here
/// regardless of the decision; the real rewrite is redone later if kept).
int64_t RewriteMFMAFormStage::getRewriteCost(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;

  int64_t BestSpillCost = 0;
  int64_t Cost = 0;
  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();

  std::pair<unsigned, unsigned> MaxVectorRegs =
      ST.getMaxNumVectorRegs(MF.getFunction());
  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
  unsigned AGPRThreshold = MaxVectorRegs.second;
  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    // Only regions with excess ArchVGPR pressure can benefit.
    if (!RegionsWithExcessArchVGPR[Region])
      continue;

    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    // For the cases we care about (i.e. ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to manageable
    // level. If we find any such region, then the rewrite is potentially
    // beneficial.
    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    uint64_t BlockFreq =
        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
            .getFrequency();

    // Express the block frequency relative to the entry block; track on which
    // side of the ratio the larger value sits so we scale in the right
    // direction below.
    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
    uint64_t RelativeFreq = EntryFreq && BlockFreq
                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
                                                       : BlockFreq / EntryFreq)
                                : 1;

    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register,
    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;

    // Also account for the block frequency.
    if (RelativeFreqIsDenom)
      SpillCost /= (int64_t)RelativeFreq;
    else
      SpillCost *= (int64_t)RelativeFreq;

    // If we have increased spilling in any block, just bail.
    if (SpillCost > 0)
      return SpillCost;

    if (SpillCost < BestSpillCost)
      BestSpillCost = SpillCost;
  }

  // Set the cost to the largest decrease in spill cost in order to not double
  // count spill reductions.
  Cost = BestSpillCost;
  assert(Cost <= 0);

  unsigned CopyCost = 0;

  // For each CopyForDef, increase the cost by the register size while
  // accounting for block frequency.
  for (MachineInstr *DefMI : CopyForDef) {
    Register DefReg = DefMI->getOperand(0).getReg();
    uint64_t DefFreq =
        EntryFreq
            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
            : 1;

    const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
    CopyCost += RC->getCopyCost() * DefFreq;
  }

  // Account for CopyForUse copies in each block that the register is used.
  for (auto &[UseBlock, UseRegs] : CopyForUse) {
    uint64_t UseFreq =
        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;

    for (Register UseReg : UseRegs) {
      const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
      CopyCost += RC->getCopyCost() * UseFreq;
    }
  }

  // Reset the classes that were changed to AGPR for better RB analysis.
  // We must do rewriting after copy-insertion, as some defs of the register
  // may require VGPR. Additionally, if we bail out and don't perform the
  // rewrite then these need to be restored anyway.
  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    assert(TII->isMAI(*MI));
    const TargetRegisterClass *ADefRC =
        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
    const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
    MI->setDesc(TII->get(OriginalOpcode));

    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    assert(Src2);
    if (!Src2->isReg())
      continue;

    // Have to get src types separately since subregs may cause C and D
    // registers to be different types even though the actual operand is
    // the same size.
    const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
    const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
    DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
  }

  return Cost + CopyCost;
}
2396
2397bool RewriteMFMAFormStage::rewrite(
2398 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2399 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2400 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2401
2402 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2403 RegionBoundaries Entry = DAG.Regions[Region];
2404 if (Entry.first == Entry.second)
2405 continue;
2406
2407 FirstMIToRegion[&*Entry.first] = Region;
2408 if (Entry.second != Entry.first->getParent()->end())
2409 LastMIToRegion[&*Entry.second] = Region;
2410 }
2411
2412 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2413 // The general assumption of the algorithm (and the previous cost calculation)
2414 // is that it is better to insert the copies in the MBB of the def of the src2
2415 // operands, and in the MBB of the user of the dest operands. This is based on
2416 // the assumption that the MFMAs are likely to appear in loop bodies, while
2417 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2418 // design, the algorithm for finding copy insertion points is more
2419 // complicated.
2420 //
2421 // There are three main cases to handle: 1. the reaching defs of the src2
2422 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2423 // defs of the reaching uses of the dst operand.
2424 //
2425 // In the first case, we simply insert copies after each of the reaching
2426 // definitions. In the second case, we collect all the uses of a given dest
2427 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2428 // earliest use. Since the use may have multiple reaching defs, and since we
2429 // want to replace the register it is using with the result of the copy, we
2430 // must handle case 3. In the third case, we simply insert a copy after each
2431 // of the reaching defs to connect to the copy of the reaching uses of the dst
2432 // reg. This allows us to avoid inserting copies next to the MFMAs.
2433 //
2434 // While inserting the copies, we maintain a map of operands which will use
2435 // different regs (i.e. the result of the copies). For example, a case 1 src2
2436 // operand will use the register result of the copies after the reaching defs,
2437 // as opposed to the original register. Now that we have completed our copy
2438 // analysis and placement, we can bulk update the registers. We do this
2439 // separately as to avoid complicating the reachingDef and reachingUse
2440 // queries.
2441 //
2442 // While inserting the copies, we also maintain a list or registers which we
2443 // will want to reclassify as AGPR. After doing the copy insertion and the
2444 // register replacement, we can finally do the reclassification. This uses the
2445 // redef map, as the registers we are interested in reclassifying may be
2446 // replaced by the result of a copy. We must do this after the copy analysis
2447 // and placement as we must have an accurate redef map -- otherwise we may end
2448 // up creating illegal instructions.
2449
2450 // The original registers of the MFMA that need to be reclassified as AGPR.
2451 DenseSet<Register> RewriteRegs;
2452 // The map of an original register in the MFMA to a new register (result of a
2453 // copy) that it should be replaced with.
2454 DenseMap<Register, Register> RedefMap;
2455 // The map of the original MFMA registers to the relevant MFMA operands.
2456 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2457 // The map of reaching defs for a given register -- to avoid duplicate copies.
2458 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2459 // The map of reaching uses for a given register by basic block -- to avoid
2460 // duplicate copies and to calculate per MBB insert pts.
2461 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2462 ReachingUseTracker;
2463
2464 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2465 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2466 if (ReplacementOp == -1)
2467 continue;
2468 MI->setDesc(TII->get(ReplacementOp));
2469
2470 // Case 1: insert copies for the reaching defs of the Src2Reg.
2471 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2472 if (Src2->isReg()) {
2473 Register Src2Reg = Src2->getReg();
2474 if (!Src2Reg.isVirtual())
2475 return false;
2476
2477 Register MappedReg = Src2->getReg();
2478 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2479 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2480 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2481
2482 for (SlotIndex RDIndex : Src2ReachingDefs) {
2483 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2484 if (TII->isMAI(*RD))
2485 continue;
2486
2487 // If there is a non mai reaching def, then we need a copy.
2488 Src2DefsReplace.insert(RD);
2489 }
2490
2491 if (!Src2DefsReplace.empty()) {
2492 DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
2493 if (RI != RedefMap.end()) {
2494 MappedReg = RI->second;
2495 } else {
2496 assert(!ReachingDefCopyMap.contains(Src2Reg));
2497 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2498 const TargetRegisterClass *VGPRRC =
2499 SRI->getEquivalentVGPRClass(Src2RC);
2500
2501 // Track the mapping of the original register to the new register.
2502 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2503 RedefMap[Src2Reg] = MappedReg;
2504 }
2505
2506 // If none exists, create a copy from this reaching def.
2507 // We may have inserted a copy already in an earlier iteration.
2508 for (MachineInstr *RD : Src2DefsReplace) {
2509 // Do not create redundant copies.
2510 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2511 MachineInstrBuilder VGPRCopy =
2512 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2513 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2514 .addDef(MappedReg, {}, 0)
2515 .addUse(Src2Reg, {}, 0);
2516 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2517
2518 // If this reaching def was the last MI in the region, update the
2519 // region boundaries.
2520 if (LastMIToRegion.contains(RD)) {
2521 unsigned UpdateRegion = LastMIToRegion[RD];
2522 DAG.Regions[UpdateRegion].second = VGPRCopy;
2523 LastMIToRegion.erase(RD);
2524 }
2525 }
2526 }
2527 }
2528
2529 // Track the register for reclassification
2530 RewriteRegs.insert(Src2Reg);
2531
2532 // Always insert the operand for replacement. If this corresponds with a
2533 // chain of tied-def we may not see the VGPR requirement until later.
2534 ReplaceMap[Src2Reg].insert(Src2);
2535 }
2536
2537 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2538 // and after the reaching defs of the reaching uses of the dsts.
2539
2540 MachineOperand *Dst = &MI->getOperand(0);
2541 Register DstReg = Dst->getReg();
2542 if (!DstReg.isVirtual())
2543 return false;
2544
2545 Register MappedReg = DstReg;
2546 SmallVector<MachineOperand *, 8> DstReachingUses;
2547
2548 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2549 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2550
2551 findReachingUses(MI, DAG.LIS, DstReachingUses);
2552
2553 for (MachineOperand *RUOp : DstReachingUses) {
2554 if (TII->isMAI(*RUOp->getParent()))
2555 continue;
2556
2557 // If there is a non mai reaching use, then we need a copy.
2558 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2559 DstReachingUseCopies.push_back(RUOp);
2560 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2561 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2562
2563 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2564 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2565 if (TII->isMAI(*RD))
2566 continue;
2567
2568 // If there is a non mai reaching def of this reaching use, then we will
2569 // need a copy.
2570 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2571 DstUseDefsReplace.push_back(RD);
2572 }
2573 }
2574
2575 if (!DstUseDefsReplace.empty()) {
2576 DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
2577 if (RI != RedefMap.end()) {
2578 MappedReg = RI->second;
2579 } else {
2580 assert(!ReachingDefCopyMap.contains(DstReg));
2581 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2582 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2583
2584 // Track the mapping of the original register to the new register.
2585 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2586 RedefMap[DstReg] = MappedReg;
2587 }
2588
2589 // If none exists, create a copy from this reaching def.
2590 // We may have inserted a copy already in an earlier iteration.
2591 for (MachineInstr *RD : DstUseDefsReplace) {
        // Do not create redundant copies.
2593 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2594 MachineInstrBuilder VGPRCopy =
2595 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2596 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2597 .addDef(MappedReg, {}, 0)
2598 .addUse(DstReg, {}, 0);
2599 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2600
2601 // If this reaching def was the last MI in the region, update the
2602 // region boundaries.
2604 LastMIToRegion.find(RD);
2605 if (LMI != LastMIToRegion.end()) {
2606 unsigned UpdateRegion = LMI->second;
2607 DAG.Regions[UpdateRegion].second = VGPRCopy;
2608 LastMIToRegion.erase(RD);
2609 }
2610 }
2611 }
2612 }
2613
2614 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2615 for (MachineOperand *RU : DstReachingUseCopies) {
2616 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2617 // Just keep track of the reaching use of this register by block. After we
2618 // have scanned all the MFMAs we can find optimal insert pts.
2619 if (RUBlock != MI->getParent()) {
2620 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2621 continue;
2622 }
2623
2624 // Special case, the use is in the same block as the MFMA. Insert the copy
2625 // just before the use.
2626 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2627 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2628 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2629 MachineInstr *UseInst = RU->getParent();
2630 MachineInstrBuilder VGPRCopy =
2631 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2632 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2633 .addDef(NewUseReg, {}, 0)
2634 .addUse(DstReg, {}, 0);
2635 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2636 // Since we know this use has only one reaching def, we can replace the
2637 // use reg.
2638 RU->setReg(NewUseReg);
      // Track the copy source operand for replacement.
2640 DstRegSet.insert(&VGPRCopy->getOperand(1));
2641 }
2642
2643 // Track the register for reclassification
2644 RewriteRegs.insert(DstReg);
2645
2646 // Insert the dst operand for replacement. If this dst is in a chain of
2647 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2648 // all the correspond operands need to be replaced.
2649 DstRegSet.insert(Dst);
2650 }
2651
2652 // Handle the copies for dst uses.
2653 using RUBType =
2654 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2655 for (RUBType RUBlockEntry : ReachingUseTracker) {
2656 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2657 for (RUDType RUDst : RUBlockEntry.second) {
2658 MachineOperand *OpBegin = *RUDst.second.begin();
2659 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2660
2661 // Find the earliest use in this block.
2662 for (MachineOperand *User : RUDst.second) {
2663 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2664 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2665 InstPt = NewInstPt;
2666 }
2667
2668 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2669 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2670 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2671 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2672
2673 MachineInstrBuilder VGPRCopy =
2674 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2675 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2676 .addDef(NewUseReg, {}, 0)
2677 .addUse(RUDst.first, {}, 0);
2678 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2679
2680 // If this UseInst was the first MI in the region, update the region
2681 // boundaries.
2683 FirstMIToRegion.find(UseInst);
2684 if (FI != FirstMIToRegion.end()) {
2685 unsigned UpdateRegion = FI->second;
2686 DAG.Regions[UpdateRegion].first = VGPRCopy;
2687 FirstMIToRegion.erase(UseInst);
2688 }
2689
2690 // Replace the operand for all users.
2691 for (MachineOperand *User : RUDst.second) {
2692 User->setReg(NewUseReg);
2693 }
2694
2695 // Track the copy source operand for replacement.
2696 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2697 }
2698 }
2699
2700 // We may have needed to insert copies after the reaching defs of the MFMAs.
2701 // Replace the original register with the result of the copy for all relevant
2702 // operands.
2703 for (std::pair<Register, Register> NewDef : RedefMap) {
2704 Register OldReg = NewDef.first;
2705 Register NewReg = NewDef.second;
2706
2707 // Replace the register for any associated operand in the MFMA chain.
2708 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2709 ReplaceOp->setReg(NewReg);
2710 }
2711
2712 // Finally, do the reclassification of the MFMA registers.
2713 for (Register RewriteReg : RewriteRegs) {
2714 Register RegToRewrite = RewriteReg;
2715
2716 // Be sure to update the replacement register and not the original.
2717 DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
2718 if (RI != RedefMap.end())
2719 RegToRewrite = RI->second;
2720
2721 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2722 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2723
2724 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2725 }
2726
2727 // Bulk update the LIS.
2728 DAG.LIS->reanalyze(DAG.MF);
2729 // Liveins may have been modified for cross RC copies
2730 RegionPressureMap LiveInUpdater(&DAG, false);
2731 LiveInUpdater.buildLiveRegMap();
2732
2733 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2734 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2735
2736 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2737
2738 return true;
2739}
2740
2741unsigned PreRARematStage::getStageTargetOccupancy() const {
2742 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2743}
2744
2745bool PreRARematStage::setObjective() {
2746 const Function &F = MF.getFunction();
2747
2748 // Set up "spilling targets" for all regions.
2749 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2750 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2751 bool HasVectorRegisterExcess = false;
2752 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2753 const GCNRegPressure &RP = DAG.Pressure[I];
2754 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2755 if (!Target.satisfied())
2756 TargetRegions.set(I);
2757 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2758 }
2759
2760 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2761 // In addition to register usage being above addressable limits, occupancy
2762 // below the minimum is considered like "spilling" as well.
2763 TargetOcc = std::nullopt;
2764 } else {
2765 // There is no spilling and room to improve occupancy; set up "increased
2766 // occupancy targets" for all regions.
2767 TargetOcc = DAG.MinOccupancy + 1;
2768 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2769 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2770 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2771 for (auto [I, Target] : enumerate(RPTargets)) {
2772 Target.setTarget(MaxSGPRs, MaxVGPRs);
2773 if (!Target.satisfied())
2774 TargetRegions.set(I);
2775 }
2776 }
2777
2778 return TargetRegions.any();
2779}
2780
// Scans all scheduling regions for rematerialization candidates and records
// them in RematRegs. MIRegion maps each instruction to the index of its
// containing region. Returns true when at least one candidate was found.
bool PreRARematStage::collectRematRegs(
    const DenseMap<MachineInstr *, unsigned> &MIRegion) {
  // We need up-to-date live-out info. to query live-out register masks in
  // regions containing rematerializable instructions.
  DAG.RegionLiveOuts.buildLiveRegMap();

  // Set of registers already marked for potential rematerialization; used to
  // avoid rematerialization chains.
  SmallSet<Register, 4> MarkedRegs;
  auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
    return MO.isReg() && MarkedRegs.contains(MO.getReg());
  };

  // Identify rematerializable instructions in the function.
  for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
    RegionBoundaries Bounds = DAG.Regions[I];
    for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
      // The instruction must be rematerializable.
      MachineInstr &DefMI = *MI;
      if (!isReMaterializable(DefMI))
        continue;

      // We only support rematerializing virtual registers with one
      // definition.
      Register Reg = DefMI.getOperand(0).getReg();
      if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
        continue;

      // We only care to rematerialize the instruction if it has a single
      // non-debug user in a different region.
      // FIXME: Allow rematerializations with multiple uses. This should be
      // relatively easy to support using the current cost model.
      MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
      if (!UseMI)
        continue;
      auto UseRegion = MIRegion.find(UseMI);
      if (UseRegion == MIRegion.end() || UseRegion->second == I)
        continue;

      // Do not rematerialize an instruction if it uses or is used by an
      // instruction that we have designated for rematerialization.
      // FIXME: Allow for rematerialization chains: this requires 1. updating
      // remat points to account for uses that are rematerialized, and 2.
      // either rematerializing the candidates in careful ordering, or
      // deferring the MBB RP walk until the entire chain has been
      // rematerialized.
      const MachineOperand &UseMO = UseMI->getOperand(0);
      if (IsMarkedForRemat(UseMO) ||
          llvm::any_of(DefMI.operands(), IsMarkedForRemat))
        continue;

      // Do not rematerialize an instruction if it uses registers that aren't
      // available at its use. This ensures that we are not extending any live
      // range while rematerializing.
      SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
      if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
                                              *DAG.TII))
        continue;

      // Add the instruction to the rematerializable list.
      MarkedRegs.insert(Reg);
      RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
    }
  }

  return !RematRegs.empty();
}
2848
2849PreRARematStage::RematReg::RematReg(
2850 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2851 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2852 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2853 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2854 DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
2855
2856 // Mark regions in which the rematerializable register is live.
2857 Register Reg = getReg();
2858 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2859 auto LiveInIt = DAG.LiveIns[I].find(Reg);
2860 if (LiveInIt != DAG.LiveIns[I].end())
2861 LiveIn.set(I);
2862 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I);
2863 if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end())
2864 LiveOut.set(I);
2865 }
2866 Live |= LiveIn;
2867 Live |= LiveOut;
2868 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg);
2869}
2870
2871bool PreRARematStage::RematReg::maybeBeneficial(
2872 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2873 Register Reg = getReg();
2874 for (unsigned I : TargetRegions.set_bits()) {
2875 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2876 return true;
2877 }
2878 return false;
2879}
2880
2881void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2882 MachineInstr *RematMI,
2883 GCNScheduleDAGMILive &DAG) const {
2884 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
2885 if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
2886 Bounds.first = RematMI;
2887 DAG.LIS->InsertMachineInstrInMaps(*RematMI);
2888 DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
2889}
2890
2893 assert(DAG.MLI && "MLI not defined in DAG");
2895 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2896
2897 const unsigned NumRegions = DAG.Regions.size();
2899 MaxFreq = 0;
2900 Regions.reserve(NumRegions);
2901 for (unsigned I = 0; I < NumRegions; ++I) {
2902 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2903 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2904 Regions.push_back(BlockFreq);
2905 if (BlockFreq && BlockFreq < MinFreq)
2906 MinFreq = BlockFreq;
2907 else if (BlockFreq > MaxFreq)
2908 MaxFreq = BlockFreq;
2909 }
2910 if (!MinFreq)
2911 return;
2912
2913 // Scale everything down if frequencies are high.
2914 if (MinFreq >= ScaleFactor * ScaleFactor) {
2915 for (uint64_t &Freq : Regions)
2916 Freq /= ScaleFactor;
2917 MinFreq /= ScaleFactor;
2918 MaxFreq /= ScaleFactor;
2919 }
2920}
2921
2922PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
2924 : Remat(Remat), FreqDiff(getFreqDiff(Freq)) {
2925 RPSave.inc(Remat->getReg(), LaneBitmask::getNone(), Remat->Mask, DAG.MRI);
2926}
2927
2928int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2929 // Get frequencies of defining and using regions. A rematerialization from the
2930 // least frequent region to the most frequent region will yield the greatest
2931 // latency penalty and therefore should get minimum score. Reciprocally, a
2932 // rematerialization in the other direction should get maximum score. Default
2933 // to values that will yield the worst possible score given known frequencies
2934 // in order to penalize rematerializations from or into regions whose
2935 // frequency is unknown.
2936 int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
2937 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2938 if (!UseOrMax)
2939 UseOrMax = Freq.MaxFreq;
2940 return DefOrMin - UseOrMax;
2941}
2942
2943void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2944 ArrayRef<GCNRPTarget> RPTargets,
2945 const FreqInfo &FreqInfo,
2946 bool ReduceSpill) {
2947 MaxFreq = 0;
2948 RegionImpact = 0;
2949 for (unsigned I : TargetRegions.set_bits()) {
2950 if (!Remat->Live[I])
2951 continue;
2952
2953 // The rematerialization must contribute positively in at least one
2954 // register class with usage above the RP target for this region to
2955 // contribute to the score.
2956 const GCNRPTarget &RegionTarget = RPTargets[I];
2957 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
2958 if (!NumRegsBenefit)
2959 continue;
2960
2961 bool UnusedLT = Remat->isUnusedLiveThrough(I);
2962
2963 // Regions in which RP is guaranteed to decrease have more weight.
2964 RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit;
2965
2966 if (ReduceSpill) {
2967 uint64_t Freq = FreqInfo.Regions[I];
2968 if (!UnusedLT) {
2969 // Apply a frequency penalty in regions in which we are not sure that RP
2970 // will decrease.
2971 Freq /= 2;
2972 }
2973 MaxFreq = std::max(MaxFreq, Freq);
2974 }
2975 }
2976}
2977
2978MachineInstr *PreRARematStage::ScoredRemat::rematerialize(
2979 BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets,
2980 GCNScheduleDAGMILive &DAG) const {
2981 const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2982 MachineInstr &DefMI = *Remat->DefMI;
2983 Register Reg = DefMI.getOperand(0).getReg();
2985
2986 // Rematerialize the register in the region where it is used.
2987 MachineBasicBlock::iterator InsertPos = Remat->UseMI;
2988 TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
2989 MachineInstr *RematMI = &*std::prev(InsertPos);
2990 Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
2991 Remat->insertMI(Remat->UseRegion, RematMI, DAG);
2992
2993#ifdef EXPENSIVE_CHECKS
2994 // All uses are known to be available / live at the remat point. Thus,
2995 // the uses should already be live in to the using region.
2996 for (MachineOperand &MO : DefMI.operands()) {
2997 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2998 continue;
2999
3000 Register UseReg = MO.getReg();
3001 if (!UseReg.isVirtual())
3002 continue;
3003
3004 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
3005 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
3006 if (LI.hasSubRanges() && MO.getSubReg())
3007 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
3008
3009 LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
3010 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
3011 // If this register has lanes not covered by the LiveIns, be sure they
3012 // do not map to any subrange. ref:
3013 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
3014 if (UncoveredLanes.any()) {
3015 assert(LI.hasSubRanges());
3016 for (LiveInterval::SubRange &SR : LI.subranges())
3017 assert((SR.LaneMask & UncoveredLanes).none());
3018 }
3019 }
3020#endif
3021
3022 // Remove the register from all regions where it is a live-in or live-out
3023 // and adjust RP targets. The save is guaranteed in regions in which the
3024 // register is live-through and unused but optimistic in all other regions
3025 // where the register is live.
3026 for (unsigned I : Remat->Live.set_bits()) {
3027 RPTargets[I].saveRP(RPSave);
3028 DAG.LiveIns[I].erase(Reg);
3029 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3030 if (!Remat->isUnusedLiveThrough(I))
3031 RecomputeRP.set(I);
3032 }
3033
3034 return RematMI;
3035}
3036
3037void PreRARematStage::commitRematerializations() const {
3038 REMAT_DEBUG(dbgs() << "Commiting all rematerializations\n");
3039 for (const RollbackInfo &Rollback : Rollbacks)
3040 DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
3041}
3042
3043void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) {
3044 for (unsigned I : Regions.set_bits()) {
3045 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3046 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3047 TargetRegions.reset(I);
3048 }
3049 }
3050}
3051
3052bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3053 bool TooOptimistic = false;
3054 for (unsigned I : Regions.set_bits()) {
3055 GCNRPTarget &Target = RPTargets[I];
3056 Target.setRP(DAG.getRealRegPressure(I));
3057
3058 // Since we were optimistic in assessing RP decreases in these regions, we
3059 // may need to remark the target as a target region if RP didn't decrease
3060 // as expected.
3061 if (!TargetRegions[I] && !Target.satisfied()) {
3062 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3063 TooOptimistic = true;
3064 TargetRegions.set(I);
3065 }
3066 }
3067 return TooOptimistic;
3068}
3069
3070// Copied from MachineLICM
3071bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
3073 return false;
3074
3075 for (const MachineOperand &MO : MI.all_uses()) {
3076 // We can't remat physreg uses, unless it is a constant or an ignorable
3077 // use (e.g. implicit exec use on VALU instructions)
3078 if (MO.getReg().isPhysical()) {
3079 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3080 continue;
3081 return false;
3082 }
3083 }
3084
3085 return true;
3086}
3087
3089 // We consider that reducing spilling is always beneficial so we never
3090 // rollback rematerializations or revert scheduling in such cases.
3091 if (!TargetOcc)
3092 return;
3093
3094 // When increasing occupancy, it is possible that re-scheduling is not able to
3095 // achieve the target occupancy in all regions, in which case re-scheduling in
3096 // all regions should be reverted.
3097 if (DAG.MinOccupancy >= *TargetOcc) {
3098 commitRematerializations();
3099 return;
3100 }
3101
3102 // It is possible that re-scheduling lowers occupancy over the one achieved
3103 // just through rematerializations, in which case we revert re-scheduling in
3104 // all regions but do not roll back rematerializations.
3105 const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
3106
3107 // When we both need to revert re-scheduling and rollback rematerializations,
3108 // restore rematerialized MIs' original state before reverting so that they
3109 // are treated as non-debug instructions by the revert logic.
3110 if (ShouldRollbackRemats) {
3111 for (const RollbackInfo &Rollback : Rollbacks) {
3112 const auto &[Remat, RematMI, RegMap] = Rollback;
3113 Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
3114 for (const auto &[MOIdx, Reg] : RegMap)
3115 Remat->DefMI->getOperand(MOIdx).setReg(Reg);
3116 }
3117 }
3118
3119 // Revert re-scheduling in all affected regions.
3120 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3121 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3122 << '\n');
3123 DAG.Pressure[RegionIdx] = MaxPressure;
3124 modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
3125 }
3126
3127 if (!ShouldRollbackRemats) {
3128 commitRematerializations();
3129 DAG.setTargetOccupancy(AchievedOcc);
3130 return;
3131 }
3132
3133 // Reset the target occupancy to what it was pre-rematerialization.
3134 DAG.setTargetOccupancy(*TargetOcc - 1);
3135
3136 // Finish rolling back rematerializations, then recompute pressure in all
3137 // affected regions.
3138 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3139 BitVector RecomputeRP(DAG.Regions.size());
3140 DenseSet<Register> RecomputeLI;
3141 for (const RollbackInfo &Rollback : Rollbacks) {
3142 const auto &[Remat, RematMI, RegMap] = Rollback;
3143
3144 // Switch back to using the original register and delete the
3145 // rematerialization.
3146 Register Reg = RematMI->getOperand(0).getReg();
3147 Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
3148 Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
3149 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3150 << "] Deleting rematerialization " << *RematMI);
3151 DAG.deleteMI(Remat->UseRegion, RematMI);
3152
3153 // Re-add the defined register as a live-in/live-out in all regions it used
3154 // to be one in.
3155 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3156 for (unsigned I : Remat->LiveIn.set_bits())
3157 DAG.LiveIns[I].insert(LiveReg);
3158 for (unsigned I : Remat->LiveOut.set_bits())
3159 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3160
3161 RecomputeRP |= Rollback.Remat->Live;
3162 // Regenerate intervals for all register operands of rematerialized MIs as
3163 // slot indices may have changed slightly from before re-scheduling.
3164 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3165 if (MO.isReg() && MO.getReg().isVirtual())
3166 RecomputeLI.insert(MO.getReg());
3167 }
3168 }
3169 for (Register Reg : RecomputeLI) {
3172 }
3173#ifdef EXPENSIVE_CHECKS
3174 // In particular, we want to check for coherent MI/slot order in regions in
3175 // which reverts and/or rollbacks may have happened.
3176 MF.verify();
3177#endif
3178 for (unsigned I : RecomputeRP.set_bits())
3179 DAG.Pressure[I] = DAG.getRealRegPressure(I);
3180
3182}
3183
3184void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3185 // It's not possible for the deleted instruction to be upper region boundary
3186 // since we don't delete region terminators.
3187 if (Regions[RegionIdx].first == MI)
3188 Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
3189 LIS->removeInterval(MI->getOperand(0).getReg());
3191 MI->eraseFromParent();
3192}
3193
3194void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3195 MinOccupancy = TargetOccupancy;
3196 if (MFI.getOccupancy() < TargetOccupancy)
3197 MFI.increaseOccupancy(MF, MinOccupancy);
3198 else
3199 MFI.limitOccupancy(MinOccupancy);
3200}
3201
3203 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3204 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
3205 return SII->isIGLPMutationOnly(MI->getOpcode());
3206 });
3207}
3208
3213
3215 HasIGLPInstrs = hasIGLPInstrs(this);
3216 if (HasIGLPInstrs) {
3217 SavedMutations.clear();
3218 SavedMutations.swap(Mutations);
3220 }
3221
3223}
3224
3226 if (HasIGLPInstrs)
3227 SavedMutations.swap(Mutations);
3228
3230}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(true))
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
BitVector & reset()
Definition BitVector.h:411
BitVector & set()
Definition BitVector.h:370
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
size - Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx in block MBB to MIOrder.
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::optional< bool > GCNTrackersOverride
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition MCInstrInfo.h:90
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
bool verify(Pass *p=nullptr, const char *Banner=nullptr, raw_ostream *OS=nullptr, bool AbortOnError=true) const
Run the current MachineFunction through the machine code verifier, useful for debugger use.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
GCNRPTracker::LiveRegSet & getLiveRegsForRegionIdx(unsigned RegionIdx)
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void increaseOccupancy(const MachineFunction &MF, unsigned Limit)
void limitOccupancy(const MachineFunction &MF)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
const TargetRegisterInfo * TRI
Target processor register info.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool isReMaterializable(const MachineInstr &MI) const
Return true if the instruction would be materializable at a point in the containing function where al...
virtual bool isIgnorableUse(const MachineOperand &MO) const
Given MO is a PhysReg use return if it can be ignored for the purpose of instruction rematerializatio...
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)