LLVM 23.0.0git
GCNSchedStrategy.cpp
Go to the documentation of this file.
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/STLExtras.h"
40#include "llvm/MC/LaneBitmask.h"
42#include "llvm/MC/MCSchedule.h"
45
46#define DEBUG_TYPE "machine-scheduler"
47
48using namespace llvm;
49
51 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
52 cl::desc("Disable unclustered high register pressure "
53 "reduction scheduling stage."),
54 cl::init(false));
55
57 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
58 cl::desc("Disable clustered low occupancy "
59 "rescheduling for ILP scheduling stage."),
60 cl::init(false));
61
63 "amdgpu-schedule-metric-bias", cl::Hidden,
65 "Sets the bias which adds weight to occupancy vs latency. Set it to "
66 "100 to chase the occupancy only."),
67 cl::init(10));
68
69static cl::opt<bool>
70 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
71 cl::desc("Relax occupancy targets for kernels which are memory "
72 "bound (amdgpu-membound-threshold), or "
73 "Wave Limited (amdgpu-limit-wave-threshold)."),
74 cl::init(false));
75
77 "amdgpu-use-amdgpu-trackers", cl::Hidden,
78 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
79 cl::init(false));
80
82 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
84 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
85 cl::init(256));
86
87#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
88#define DUMP_MAX_REG_PRESSURE
90 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
91 cl::desc("Print a list of live registers along with their def/uses at the "
92 "point of maximum register pressure before scheduling."),
93 cl::init(false));
94
96 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
97 cl::desc("Print a list of live registers along with their def/uses at the "
98 "point of maximum register pressure after scheduling."),
99 cl::init(false));
100#endif
101
103 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
104 cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(false));
105
// Fixed-point scale applied when combining metrics into a single score (the
// ScheduleMetricBias option above is expressed relative to this: a bias of
// 100 weighs occupancy exclusively).
 106const unsigned ScheduleMetrics::ScaleFactor = 100;
107
112
115
116 MF = &DAG->MF;
117
118 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
119
121 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
123 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
124
 126 // Set the initial TargetOccupancy to the maximum occupancy that we can
127 // achieve for this function. This effectively sets a lower bound on the
128 // 'Critical' register limits in the scheduler.
129 // Allow for lower occupancy targets if kernel is wave limited or memory
130 // bound, and using the relaxed occupancy feature.
134 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
135
136 if (!KnownExcessRP) {
137 VGPRCriticalLimit = std::min(
138 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
140 } else {
141 // This is similar to ST.getMaxNumVGPRs(TargetOccupancy) result except
142 // returns a reasonably small number for targets with lots of VGPRs, such
143 // as GFX10 and GFX11.
144 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
145 "VGPRCriticalLimit calculation method.\n");
146 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
147 unsigned Granule =
148 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
149 unsigned Addressable =
150 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
151 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
152 VGPRBudget = std::max(VGPRBudget, Granule);
153 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
154 }
155
156 // Subtract error margin and bias from register limits and avoid overflow.
161 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
162 << ", VGPRExcessLimit = " << VGPRExcessLimit
163 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
164 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
165}
166
167/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
168/// current register pressure.
169///
170/// This works for the common case, but it has a few exceptions that have been
171/// observed through trial and error:
172/// - Explicit physical register operands
173/// - Subregister definitions
174///
175/// In both of those cases, PressureDiff doesn't represent the actual pressure,
176/// and querying LiveIntervals through the RegPressureTracker is needed to get
177/// an accurate value.
178///
179/// We should eventually only use PressureDiff for maximum performance, but this
180/// already allows 80% of SUs to take the fast path without changing scheduling
181/// at all. Further changes would either change scheduling, or require a lot
182/// more logic to recover an accurate pressure estimate from the PressureDiffs.
183static bool canUsePressureDiffs(const SUnit &SU) {
184 if (!SU.isInstr())
185 return false;
186
187 // Cannot use pressure diffs for subregister defs or with physregs, it's
188 // imprecise in both cases.
189 for (const auto &Op : SU.getInstr()->operands()) {
190 if (!Op.isReg() || Op.isImplicit())
191 continue;
192 if (Op.getReg().isPhysical() ||
193 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
194 return false;
195 }
196 return true;
197}
198
200 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
201 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
202 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
203 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
204 // getDownwardPressure() and getUpwardPressure() make temporary changes to
205 // the tracker, so we need to pass those function a non-const copy.
206 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
207 if (!GCNTrackers) {
208 AtTop
209 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
210 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
211
212 return;
213 }
214
215 // GCNTrackers
216 Pressure.resize(4, 0);
217 MachineInstr *MI = SU->getInstr();
218 GCNRegPressure NewPressure;
219 if (AtTop) {
220 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
221 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
222 } else {
223 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
224 TempUpwardTracker.recede(*MI);
225 NewPressure = TempUpwardTracker.getPressure();
226 }
227 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
228 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
229 NewPressure.getArchVGPRNum();
230 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
231}
232
234 bool AtTop,
235 const RegPressureTracker &RPTracker,
236 const SIRegisterInfo *SRI,
237 unsigned SGPRPressure,
238 unsigned VGPRPressure, bool IsBottomUp) {
239 Cand.SU = SU;
240 Cand.AtTop = AtTop;
241
242 if (!DAG->isTrackingPressure())
243 return;
244
245 Pressure.clear();
246 MaxPressure.clear();
247
248 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
249 // possible over querying the RegPressureTracker.
250 //
251 // RegPressureTracker will make a lot of LIS queries which are very
252 // expensive, it is considered a slow function in this context.
253 //
254 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
255 // trivial lookup into an array. It is pretty much free.
256 //
257 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
258 // PressureDiffs.
259 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
260 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
262 } else {
263 // Reserve 4 slots.
264 Pressure.resize(4, 0);
265 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
266 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
267
268 for (const auto &Diff : DAG->getPressureDiff(SU)) {
269 if (!Diff.isValid())
270 continue;
271 // PressureDiffs is always bottom-up so if we're working top-down we need
272 // to invert its sign.
273 Pressure[Diff.getPSet()] +=
274 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
275 }
276
277#ifdef EXPENSIVE_CHECKS
278 std::vector<unsigned> CheckPressure, CheckMaxPressure;
279 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
281 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
282 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
283 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
284 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
285 errs() << "Register Pressure is inaccurate when calculated through "
286 "PressureDiff\n"
287 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
288 << ", expected "
289 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
290 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
291 << ", expected "
292 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
293 report_fatal_error("inaccurate register pressure calculation");
294 }
295#endif
296 }
297
298 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
299 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
300
301 // If two instructions increase the pressure of different register sets
302 // by the same amount, the generic scheduler will prefer to schedule the
303 // instruction that increases the set with the least amount of registers,
304 // which in our case would be SGPRs. This is rarely what we want, so
305 // when we report excess/critical register pressure, we do it either
306 // only for VGPRs or only for SGPRs.
307
308 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
309 const unsigned MaxVGPRPressureInc = 16;
310 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
311 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
312
313 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
314 // to increase the likelihood we don't go over the limits. We should improve
315 // the analysis to look through dependencies to find the path with the least
316 // register pressure.
317
318 // We only need to update the RPDelta for instructions that increase register
319 // pressure. Instructions that decrease or keep reg pressure the same will be
320 // marked as RegExcess in tryCandidate() when they are compared with
321 // instructions that increase the register pressure.
322 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
323 HasHighPressure = true;
324 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
325 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
326 }
327
328 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
329 HasHighPressure = true;
330 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
331 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
332 }
333
334 // Register pressure is considered 'CRITICAL' if it is approaching a value
335 // that would reduce the wave occupancy for the execution unit. When
336 // register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
337 // has the same cost, so we don't need to prefer one over the other.
338
339 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
340 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
341
342 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
343 HasHighPressure = true;
344 if (SGPRDelta > VGPRDelta) {
345 Cand.RPDelta.CriticalMax =
346 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
347 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
348 } else {
349 Cand.RPDelta.CriticalMax =
350 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
351 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
352 }
353 }
354}
355
357 const TargetSchedModel *SchedModel) {
358 bool HasBufferedModel =
359 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
360 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
361 return Combined <= PendingQueueLimit && HasBufferedModel;
362}
363
365 const TargetSchedModel *SchedModel) {
366 // pickOnlyChoice() releases pending instructions and checks for new hazards.
367 SUnit *OnlyChoice = Zone.pickOnlyChoice();
368 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
369 return OnlyChoice;
370
371 return nullptr;
372}
373
375 const SchedCandidate &Preferred) {
376 LLVM_DEBUG({
377 dbgs() << "Prefer:\t\t";
378 DAG->dumpNode(*Preferred.SU);
379
380 if (Current.SU) {
381 dbgs() << "Not:\t";
382 DAG->dumpNode(*Current.SU);
383 }
384
385 dbgs() << "Reason:\t\t";
386 traceCandidate(Preferred);
387 });
388}
389
390// This function is mostly cut and pasted from
391// GenericScheduler::pickNodeFromQueue()
393 const CandPolicy &ZonePolicy,
394 const RegPressureTracker &RPTracker,
395 SchedCandidate &Cand, bool &IsPending,
396 bool IsBottomUp) {
397 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
399 unsigned SGPRPressure = 0;
400 unsigned VGPRPressure = 0;
401 IsPending = false;
402 if (DAG->isTrackingPressure()) {
403 if (!GCNTrackers) {
404 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
405 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
406 } else {
407 GCNRPTracker *T = IsBottomUp
408 ? static_cast<GCNRPTracker *>(&UpwardTracker)
409 : static_cast<GCNRPTracker *>(&DownwardTracker);
410 SGPRPressure = T->getPressure().getSGPRNum();
411 VGPRPressure = T->getPressure().getArchVGPRNum();
412 }
413 }
414 LLVM_DEBUG(dbgs() << "Available Q:\n");
415 ReadyQueue &AQ = Zone.Available;
416 for (SUnit *SU : AQ) {
417
418 SchedCandidate TryCand(ZonePolicy);
419 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
420 VGPRPressure, IsBottomUp);
421 // Pass SchedBoundary only when comparing nodes from the same boundary.
422 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
423 tryCandidate(Cand, TryCand, ZoneArg);
424 if (TryCand.Reason != NoCand) {
425 // Initialize resource delta if needed in case future heuristics query it.
426 if (TryCand.ResDelta == SchedResourceDelta())
427 TryCand.initResourceDelta(Zone.DAG, SchedModel);
428 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
429 Cand.setBest(TryCand);
430 } else {
431 printCandidateDecision(TryCand, Cand);
432 }
433 }
434
435 if (!shouldCheckPending(Zone, SchedModel))
436 return;
437
438 LLVM_DEBUG(dbgs() << "Pending Q:\n");
439 ReadyQueue &PQ = Zone.Pending;
440 for (SUnit *SU : PQ) {
441
442 SchedCandidate TryCand(ZonePolicy);
443 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
444 VGPRPressure, IsBottomUp);
445 // Pass SchedBoundary only when comparing nodes from the same boundary.
446 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
447 tryPendingCandidate(Cand, TryCand, ZoneArg);
448 if (TryCand.Reason != NoCand) {
449 // Initialize resource delta if needed in case future heuristics query it.
450 if (TryCand.ResDelta == SchedResourceDelta())
451 TryCand.initResourceDelta(Zone.DAG, SchedModel);
452 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
453 IsPending = true;
454 Cand.setBest(TryCand);
455 } else {
456 printCandidateDecision(TryCand, Cand);
457 }
458 }
459}
460
461// This function is mostly cut and pasted from
462// GenericScheduler::pickNodeBidirectional()
464 bool &PickedPending) {
465 // Schedule as far as possible in the direction of no choice. This is most
466 // efficient, but also provides the best heuristics for CriticalPSets.
467 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
468 IsTopNode = false;
469 return SU;
470 }
471 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
472 IsTopNode = true;
473 return SU;
474 }
475 // Set the bottom-up policy based on the state of the current bottom zone
476 // and the instructions outside the zone, including the top zone.
477 CandPolicy BotPolicy;
478 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
479 // Set the top-down policy based on the state of the current top zone and
480 // the instructions outside the zone, including the bottom zone.
481 CandPolicy TopPolicy;
482 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
483
484 bool BotPending = false;
485 // See if BotCand is still valid (because we previously scheduled from Top).
486 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
487 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
488 BotCand.Policy != BotPolicy) {
489 BotCand.reset(CandPolicy());
490 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
491 BotPending,
492 /*IsBottomUp=*/true);
493 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
494 } else {
496#ifndef NDEBUG
497 if (VerifyScheduling) {
498 SchedCandidate TCand;
499 TCand.reset(CandPolicy());
500 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
501 BotPending,
502 /*IsBottomUp=*/true);
503 assert(TCand.SU == BotCand.SU &&
504 "Last pick result should correspond to re-picking right now");
505 }
506#endif
507 }
508
509 bool TopPending = false;
510 // Check if the top Q has a better candidate.
511 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
512 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
513 TopCand.Policy != TopPolicy) {
514 TopCand.reset(CandPolicy());
515 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
516 TopPending,
517 /*IsBottomUp=*/false);
518 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
519 } else {
521#ifndef NDEBUG
522 if (VerifyScheduling) {
523 SchedCandidate TCand;
524 TCand.reset(CandPolicy());
525 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
526 TopPending,
527 /*IsBottomUp=*/false);
528 assert(TCand.SU == TopCand.SU &&
529 "Last pick result should correspond to re-picking right now");
530 }
531#endif
532 }
533
534 // Pick best from BotCand and TopCand.
535 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
536 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
537 SchedCandidate Cand = BotPending ? TopCand : BotCand;
538 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
539 PickedPending = BotPending && TopPending;
540
541 TryCand.Reason = NoCand;
542 if (BotPending || TopPending) {
543 PickedPending |= tryPendingCandidate(Cand, TopCand, nullptr);
544 } else {
545 tryCandidate(Cand, TryCand, nullptr);
546 }
547
548 if (TryCand.Reason != NoCand) {
549 Cand.setBest(TryCand);
550 }
551
552 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
553
554 IsTopNode = Cand.AtTop;
555 return Cand.SU;
556}
557
558// This function is mostly cut and pasted from
559// GenericScheduler::pickNode()
561 if (DAG->top() == DAG->bottom()) {
562 assert(Top.Available.empty() && Top.Pending.empty() &&
563 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
564 return nullptr;
565 }
566 bool PickedPending;
567 SUnit *SU;
568 do {
569 PickedPending = false;
570 if (RegionPolicy.OnlyTopDown) {
572 if (!SU) {
573 CandPolicy NoPolicy;
574 TopCand.reset(NoPolicy);
575 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
576 PickedPending,
577 /*IsBottomUp=*/false);
578 assert(TopCand.Reason != NoCand && "failed to find a candidate");
579 SU = TopCand.SU;
580 }
581 IsTopNode = true;
582 } else if (RegionPolicy.OnlyBottomUp) {
584 if (!SU) {
585 CandPolicy NoPolicy;
586 BotCand.reset(NoPolicy);
587 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
588 PickedPending,
589 /*IsBottomUp=*/true);
590 assert(BotCand.Reason != NoCand && "failed to find a candidate");
591 SU = BotCand.SU;
592 }
593 IsTopNode = false;
594 } else {
595 SU = pickNodeBidirectional(IsTopNode, PickedPending);
596 }
597 } while (SU->isScheduled);
598
599 if (PickedPending) {
600 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
601 SchedBoundary &Zone = IsTopNode ? Top : Bot;
602 unsigned CurrentCycle = Zone.getCurrCycle();
603 if (ReadyCycle > CurrentCycle)
604 Zone.bumpCycle(ReadyCycle);
605
606 // FIXME: checkHazard() doesn't give information about which cycle the
607 // hazard will resolve so just keep bumping the cycle by 1. This could be
608 // made more efficient if checkHazard() returned more details.
609 while (Zone.checkHazard(SU))
610 Zone.bumpCycle(Zone.getCurrCycle() + 1);
611
612 Zone.releasePending();
613 }
614
615 if (SU->isTopReady())
616 Top.removeReady(SU);
617 if (SU->isBottomReady())
618 Bot.removeReady(SU);
619
620 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
621 << *SU->getInstr());
622 return SU;
623}
624
625void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
626 if (GCNTrackers) {
627 MachineInstr *MI = SU->getInstr();
628 IsTopNode ? (void)DownwardTracker.advance(MI, false)
629 : UpwardTracker.recede(*MI);
630 }
631
632 return GenericScheduler::schedNode(SU, IsTopNode);
633}
634
639
642 if (!CurrentStage)
643 CurrentStage = SchedStages.begin();
644 else
645 CurrentStage++;
646
647 return CurrentStage != SchedStages.end();
648}
649
652 return std::next(CurrentStage) != SchedStages.end();
653}
654
656 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
657 return *std::next(CurrentStage);
658}
659
661 SchedCandidate &TryCand,
662 SchedBoundary *Zone) const {
663 // Initialize the candidate if needed.
664 if (!Cand.isValid()) {
665 TryCand.Reason = NodeOrder;
666 return true;
667 }
668
669 // Bias PhysReg Defs and copies to their uses and defined respectively.
670 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
671 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
672 return TryCand.Reason != NoCand;
673
674 // Avoid exceeding the target's limit.
675 if (DAG->isTrackingPressure() &&
676 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
677 RegExcess, TRI, DAG->MF))
678 return TryCand.Reason != NoCand;
679
680 // Avoid increasing the max critical pressure in the scheduled region.
681 if (DAG->isTrackingPressure() &&
683 TryCand, Cand, RegCritical, TRI, DAG->MF))
684 return TryCand.Reason != NoCand;
685
686 bool SameBoundary = Zone != nullptr;
687 if (SameBoundary) {
690 TryCand, Cand, ResourceReduce))
691 return TryCand.Reason != NoCand;
693 Cand.ResDelta.DemandedResources, TryCand, Cand,
695 return TryCand.Reason != NoCand;
696 }
697
698 return false;
699}
700
712
717
719 SchedCandidate &TryCand,
720 SchedBoundary *Zone) const {
721 // Initialize the candidate if needed.
722 if (!Cand.isValid()) {
723 TryCand.Reason = NodeOrder;
724 return true;
725 }
726
727 // Avoid spilling by exceeding the register limit.
728 if (DAG->isTrackingPressure() &&
729 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
730 RegExcess, TRI, DAG->MF))
731 return TryCand.Reason != NoCand;
732
733 // Bias PhysReg Defs and copies to their uses and defined respectively.
734 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
735 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
736 return TryCand.Reason != NoCand;
737
738 bool SameBoundary = Zone != nullptr;
739 if (SameBoundary) {
740 // Prioritize instructions that read unbuffered resources by stall cycles.
741 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
742 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
743 return TryCand.Reason != NoCand;
744
745 // Avoid critical resource consumption and balance the schedule.
748 TryCand, Cand, ResourceReduce))
749 return TryCand.Reason != NoCand;
751 Cand.ResDelta.DemandedResources, TryCand, Cand,
753 return TryCand.Reason != NoCand;
754
755 // Unconditionally try to reduce latency.
756 if (tryLatency(TryCand, Cand, *Zone))
757 return TryCand.Reason != NoCand;
758
759 // Weak edges are for clustering and other constraints.
760 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
761 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
762 return TryCand.Reason != NoCand;
763 }
764
765 // Keep clustered nodes together to encourage downstream peephole
766 // optimizations which may reduce resource requirements.
767 //
768 // This is a best effort to set things up for a post-RA pass. Optimizations
769 // like generating loads of multiple registers should ideally be done within
770 // the scheduler pass by combining the loads during DAG postprocessing.
771 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
772 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
773 bool CandIsClusterSucc =
774 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
775 bool TryCandIsClusterSucc =
776 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
777 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
778 Cluster))
779 return TryCand.Reason != NoCand;
780
781 // Avoid increasing the max critical pressure in the scheduled region.
782 if (DAG->isTrackingPressure() &&
784 TryCand, Cand, RegCritical, TRI, DAG->MF))
785 return TryCand.Reason != NoCand;
786
787 // Avoid increasing the max pressure of the entire region.
788 if (DAG->isTrackingPressure() &&
789 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
790 Cand, RegMax, TRI, DAG->MF))
791 return TryCand.Reason != NoCand;
792
793 if (SameBoundary) {
794 // Fall through to original instruction order.
795 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
796 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
797 TryCand.Reason = NodeOrder;
798 return true;
799 }
800 }
801 return false;
802}
803
809
810/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
811/// much as possible. This is achieved by:
812// 1. Prioritize clustered operations before stall latency heuristic.
813// 2. Prioritize long-latency-load before stall latency heuristic.
814///
815/// \param Cand provides the policy and current best candidate.
816/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
817/// \param Zone describes the scheduled zone that we are extending, or nullptr
818/// if Cand is from a different zone than TryCand.
819/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
821 SchedCandidate &TryCand,
822 SchedBoundary *Zone) const {
823 // Initialize the candidate if needed.
824 if (!Cand.isValid()) {
825 TryCand.Reason = NodeOrder;
826 return true;
827 }
828
829 // Bias PhysReg Defs and copies to their uses and defined respectively.
830 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
831 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
832 return TryCand.Reason != NoCand;
833
834 if (DAG->isTrackingPressure()) {
835 // Avoid exceeding the target's limit.
836 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
837 RegExcess, TRI, DAG->MF))
838 return TryCand.Reason != NoCand;
839
840 // Avoid increasing the max critical pressure in the scheduled region.
842 TryCand, Cand, RegCritical, TRI, DAG->MF))
843 return TryCand.Reason != NoCand;
844 }
845
846 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
847 // get more benefit from clausing these memory instructions.
848 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
849 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
850 bool CandIsClusterSucc =
851 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
852 bool TryCandIsClusterSucc =
853 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
854 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
855 Cluster))
856 return TryCand.Reason != NoCand;
857
858 // We only compare a subset of features when comparing nodes between
859 // Top and Bottom boundary. Some properties are simply incomparable, in many
860 // other instances we should only override the other boundary if something
861 // is a clear good pick on one boundary. Skip heuristics that are more
862 // "tie-breaking" in nature.
863 bool SameBoundary = Zone != nullptr;
864 if (SameBoundary) {
865 // For loops that are acyclic path limited, aggressively schedule for
866 // latency. Within an single cycle, whenever CurrMOps > 0, allow normal
867 // heuristics to take precedence.
868 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
869 tryLatency(TryCand, Cand, *Zone))
870 return TryCand.Reason != NoCand;
871
872 // MaxMemoryClause-specific: Prioritize long latency memory load
873 // instructions in top-bottom order to hide more latency. The mayLoad check
 874// is used to exclude store-like instructions, which we do not want to
 875// schedule too early.
876 bool TryMayLoad =
877 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
878 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
879
880 if (TryMayLoad || CandMayLoad) {
881 bool TryLongLatency =
882 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
883 bool CandLongLatency =
884 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
885
886 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
887 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
888 Cand, Stall))
889 return TryCand.Reason != NoCand;
890 }
891 // Prioritize instructions that read unbuffered resources by stall cycles.
892 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
893 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
894 return TryCand.Reason != NoCand;
895 }
896
897 if (SameBoundary) {
898 // Weak edges are for clustering and other constraints.
899 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
900 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
901 return TryCand.Reason != NoCand;
902 }
903
904 // Avoid increasing the max pressure of the entire region.
905 if (DAG->isTrackingPressure() &&
906 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
907 Cand, RegMax, TRI, DAG->MF))
908 return TryCand.Reason != NoCand;
909
910 if (SameBoundary) {
911 // Avoid critical resource consumption and balance the schedule.
914 TryCand, Cand, ResourceReduce))
915 return TryCand.Reason != NoCand;
917 Cand.ResDelta.DemandedResources, TryCand, Cand,
919 return TryCand.Reason != NoCand;
920
921 // Avoid serializing long latency dependence chains.
922 // For acyclic path limited loops, latency was already checked above.
923 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
924 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
925 return TryCand.Reason != NoCand;
926
927 // Fall through to original instruction order.
928 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
929 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
930 TryCand.Reason = NodeOrder;
931 return true;
932 }
933 }
934
935 return false;
936}
937
939 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
940 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
941 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
942 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
943 RegionLiveOuts(this, /*IsLiveOut=*/true) {
944
945 // We want regions with a single MI to be scheduled so that we can reason
946 // about them correctly during scheduling stages that move MIs between regions
947 // (e.g., rematerialization).
949 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
950 if (RelaxedOcc) {
951 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
952 if (MinOccupancy != StartingOccupancy)
953 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
954 << ".\n");
955 }
956}
957
958std::unique_ptr<GCNSchedStage>
959GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
960 switch (SchedStageID) {
962 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
964 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
966 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
968 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
970 return std::make_unique<PreRARematStage>(SchedStageID, *this);
972 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
974 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
975 *this);
976 }
977
978 llvm_unreachable("Unknown SchedStageID.");
979}
980
982 // Collect all scheduling regions. The actual scheduling is performed in
983 // GCNScheduleDAGMILive::finalizeSchedule.
984 Regions.push_back(std::pair(RegionBegin, RegionEnd));
985}
986
988GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
989 if (Regions[RegionIdx].first == Regions[RegionIdx].second)
990 return llvm::getRegPressure(MRI, LiveIns[RegionIdx]);
992 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
993 &LiveIns[RegionIdx]);
994 return RPTracker.moveMaxPressure();
995}
996
998 MachineBasicBlock::iterator RegionEnd) {
999 assert(RegionBegin != RegionEnd && "Region must not be empty");
1000 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
1001}
1002
1003void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
1004 const MachineBasicBlock *MBB) {
1005 GCNDownwardRPTracker RPTracker(*LIS);
1006
1007 // If the block has the only successor then live-ins of that successor are
1008 // live-outs of the current block. We can reuse calculated live set if the
1009 // successor will be sent to scheduling past current block.
1010
1011 // However, due to the bug in LiveInterval analysis it may happen that two
1012 // predecessors of the same successor block have different lane bitmasks for
1013 // a live-out register. Workaround that by sticking to one-to-one relationship
1014 // i.e. one predecessor with one successor block.
1015 const MachineBasicBlock *OnlySucc = nullptr;
1016 if (MBB->succ_size() == 1) {
1017 auto *Candidate = *MBB->succ_begin();
1018 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1019 SlotIndexes *Ind = LIS->getSlotIndexes();
1020 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1021 OnlySucc = Candidate;
1022 }
1023 }
1024
1025 // Scheduler sends regions from the end of the block upwards.
1026 size_t CurRegion = RegionIdx;
1027 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1028 if (Regions[CurRegion].first->getParent() != MBB)
1029 break;
1030 --CurRegion;
1031
1032 auto I = MBB->begin();
1033 auto LiveInIt = MBBLiveIns.find(MBB);
1034 auto &Rgn = Regions[CurRegion];
1035 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1036 if (LiveInIt != MBBLiveIns.end()) {
1037 auto LiveIn = std::move(LiveInIt->second);
1038 RPTracker.reset(*MBB->begin(), &LiveIn);
1039 MBBLiveIns.erase(LiveInIt);
1040 } else {
1041 I = Rgn.first;
1042 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1043#ifdef EXPENSIVE_CHECKS
1044 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1045#endif
1046 RPTracker.reset(*I, &LRS);
1047 }
1048
1049 for (;;) {
1050 I = RPTracker.getNext();
1051
1052 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1053 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1054 RPTracker.clearMaxPressure();
1055 }
1056
1057 if (Regions[CurRegion].second == I) {
1058 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1059 if (CurRegion-- == RegionIdx)
1060 break;
1061 auto &Rgn = Regions[CurRegion];
1062 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1063 }
1064 RPTracker.advanceBeforeNext();
1065 RPTracker.advanceToNext();
1066 }
1067
1068 if (OnlySucc) {
1069 if (I != MBB->end()) {
1070 RPTracker.advanceBeforeNext();
1071 RPTracker.advanceToNext();
1072 RPTracker.advance(MBB->end());
1073 }
1074 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1075 }
1076}
1077
1079GCNScheduleDAGMILive::getRegionLiveInMap() const {
1080 assert(!Regions.empty());
1081 std::vector<MachineInstr *> RegionFirstMIs;
1082 RegionFirstMIs.reserve(Regions.size());
1083 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1084 RegionFirstMIs.push_back(
1086
1087 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1088}
1089
1091GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1092 assert(!Regions.empty());
1093 std::vector<MachineInstr *> RegionLastMIs;
1094 RegionLastMIs.reserve(Regions.size());
1095 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1096 // Skip empty regions.
1097 if (RegionBegin == RegionEnd)
1098 continue;
1099 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1100 }
1101 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1102}
1103
1105 IdxToInstruction.clear();
1106
1107 RegionLiveRegMap =
1108 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1109 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1110 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1111 // Skip empty regions.
1112 if (RegionBegin == RegionEnd)
1113 continue;
1114 MachineInstr *RegionKey =
1115 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1116 IdxToInstruction[I] = RegionKey;
1117 }
1118}
1119
1121 // Start actual scheduling here. This function is called by the base
1122 // MachineScheduler after all regions have been recorded by
1123 // GCNScheduleDAGMILive::schedule().
1124 LiveIns.resize(Regions.size());
1125 Pressure.resize(Regions.size());
1126 RegionsWithHighRP.resize(Regions.size());
1127 RegionsWithExcessRP.resize(Regions.size());
1128 RegionsWithIGLPInstrs.resize(Regions.size());
1129 RegionsWithHighRP.reset();
1130 RegionsWithExcessRP.reset();
1131 RegionsWithIGLPInstrs.reset();
1132
1133 runSchedStages();
1134}
1135
1136void GCNScheduleDAGMILive::runSchedStages() {
1137 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1138
1139 if (!Regions.empty()) {
1140 BBLiveInMap = getRegionLiveInMap();
1141 if (GCNTrackers)
1142 RegionLiveOuts.buildLiveRegMap();
1143 }
1144
1145#ifdef DUMP_MAX_REG_PRESSURE
1149 LIS->dump();
1150 }
1151#endif
1152
1153 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1154 while (S.advanceStage()) {
1155 auto Stage = createSchedStage(S.getCurrentStage());
1156 if (!Stage->initGCNSchedStage())
1157 continue;
1158
1159 for (auto Region : Regions) {
1160 RegionBegin = Region.first;
1161 RegionEnd = Region.second;
1162 // Setup for scheduling the region and check whether it should be skipped.
1163 if (!Stage->initGCNRegion()) {
1164 Stage->advanceRegion();
1165 exitRegion();
1166 continue;
1167 }
1168
1169 if (GCNTrackers) {
1170 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1171 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1172 GCNRPTracker::LiveRegSet *RegionLiveIns =
1173 &LiveIns[Stage->getRegionIdx()];
1174
1175 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1176 ->reset(MRI, *RegionLiveIns);
1177 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1178 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1179 Stage->getRegionIdx()));
1180 }
1181
1183 Stage->finalizeGCNRegion();
1184 Stage->advanceRegion();
1185 exitRegion();
1186 }
1187
1188 Stage->finalizeGCNSchedStage();
1189 }
1190
1191#ifdef DUMP_MAX_REG_PRESSURE
1195 LIS->dump();
1196 }
1197#endif
1198}
1199
1200#ifndef NDEBUG
1202 switch (StageID) {
1204 OS << "Max Occupancy Initial Schedule";
1205 break;
1207 OS << "Instruction Rewriting Reschedule";
1208 break;
1210 OS << "Unclustered High Register Pressure Reschedule";
1211 break;
1213 OS << "Clustered Low Occupancy Reschedule";
1214 break;
1216 OS << "Pre-RA Rematerialize";
1217 break;
1219 OS << "Max ILP Initial Schedule";
1220 break;
1222 OS << "Max memory clause Initial Schedule";
1223 break;
1224 }
1225
1226 return OS;
1227}
1228#endif
1229
1233
1235 if (!DAG.LIS)
1236 return false;
1237
1238 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1239 return true;
1240}
1241
1242void RewriteMFMAFormStage::findReachingDefs(
1243 MachineOperand &UseMO, LiveIntervals *LIS,
1244 SmallVectorImpl<SlotIndex> &DefIdxs) {
1245 MachineInstr *UseMI = UseMO.getParent();
1246 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1247 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1248
1249 // If the def is not a PHI, then it must be the only reaching def.
1250 if (!VNI->isPHIDef()) {
1251 DefIdxs.push_back(VNI->def);
1252 return;
1253 }
1254
1255 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1256 SmallVector<MachineBasicBlock *, 8> Worklist;
1257
1258 // Mark the predecessor blocks for traversal
1259 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1260 Worklist.push_back(PredMBB);
1261 Visited.insert(PredMBB);
1262 }
1263
1264 while (!Worklist.empty()) {
1265 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1266
1267 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1268 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1269
1270 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1271
1272 // If there is a def in this block, then add it to the list. This is the
1273 // reaching def of this path.
1274 if (!VNI->isPHIDef()) {
1275 DefIdxs.push_back(VNI->def);
1276 continue;
1277 }
1278
1279 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1280 if (Visited.insert(PredMBB).second)
1281 Worklist.push_back(PredMBB);
1282 }
1283 }
1284}
1285
1286void RewriteMFMAFormStage::findReachingUses(
1288 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1289 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1290 for (MachineOperand &UseMO :
1291 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1292 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1293 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1294
1295 // If we find a use that contains this DefMI in its reachingDefs, then it is
1296 // a reaching use.
1297 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1298 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1299 }))
1300 ReachingUses.push_back(&UseMO);
1301 }
1302}
1303
1305 // We only need to run this pass if the architecture supports AGPRs.
1306 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1307 // is no need for this pass in that case, either.
1308 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1309 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1310 return false;
1311
1312 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1313 RegionsWithExcessArchVGPR.reset();
1314 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1316 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1317 RegionsWithExcessArchVGPR[Region] = true;
1318 }
1319
1320 if (RegionsWithExcessArchVGPR.none())
1321 return false;
1322
1323 TII = ST.getInstrInfo();
1324 SRI = ST.getRegisterInfo();
1325
1326 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1329
1330 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1331 return false;
1332
1333 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1334
1335 // If we haven't found the beneficial conditions, prefer the VGPR form which
1336 // may result in less cross RC copies.
1337 if (Cost > 0)
1338 return false;
1339
1340 return rewrite(RewriteCands);
1341}
1342
1345 return false;
1346
1348 return false;
1349
1350 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1351 return false;
1352
1353 SavedMutations.swap(DAG.Mutations);
1354 DAG.addMutation(
1356
1357 InitialOccupancy = DAG.MinOccupancy;
1358 // Aggressively try to reduce register pressure in the unclustered high RP
1359 // stage. Temporarily increase occupancy target in the region.
1360 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1361 ? InitialOccupancy + 1
1362 : InitialOccupancy;
1363 IsAnyRegionScheduled = false;
1364 S.SGPRLimitBias = S.HighRPSGPRBias;
1365 S.VGPRLimitBias = S.HighRPVGPRBias;
1366
1367 LLVM_DEBUG(
1368 dbgs()
1369 << "Retrying function scheduling without clustering. "
1370 "Aggressively try to reduce register pressure to achieve occupancy "
1371 << TempTargetOccupancy << ".\n");
1372
1373 return true;
1374}
1375
1378 return false;
1379
1381 return false;
1382
1383 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1384 // been dropped. All regions will have already been scheduled with the ideal
1385 // occupancy targets.
1386 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1387 return false;
1388
1389 LLVM_DEBUG(
1390 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1391 << DAG.MinOccupancy << ".\n");
1392 return true;
1393}
1394
1395/// Allows to easily filter for this stage's debug output.
1396#define REMAT_PREFIX "[PreRARemat] "
1397#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1398
1399#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1400Printable PreRARematStage::ScoredRemat::print() const {
1401 return Printable([&](raw_ostream &OS) {
1402 OS << '(' << MaxFreq << ", " << FreqDiff << ", " << RegionImpact << ')';
1403 });
1404}
1405#endif
1406
1408 // FIXME: This pass will invalidate cached BBLiveInMap and MBBLiveIns for
1409 // regions inbetween the defs and region we sinked the def to. Will need to be
1410 // fixed if there is another pass after this pass.
1411 assert(!S.hasNextStage());
1412
1413 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() <= 1)
1414 return false;
1415
1416 // Maps all MIs (except lone terminators, which are not part of any region) to
1417 // their parent region. Non-lone terminators are considered part of the region
1418 // they delimitate.
1419 DenseMap<MachineInstr *, unsigned> MIRegion(MF.getInstructionCount());
1420
1421 // Before performing any IR modification record the parent region of each MI
1422 // and the parent MBB of each region.
1423 const unsigned NumRegions = DAG.Regions.size();
1424 for (unsigned I = 0; I < NumRegions; ++I) {
1425 RegionBoundaries Region = DAG.Regions[I];
1426 for (auto MI = Region.first; MI != Region.second; ++MI)
1427 MIRegion.insert({&*MI, I});
1428 MachineBasicBlock *ParentMBB = Region.first->getParent();
1429 if (Region.second != ParentMBB->end())
1430 MIRegion.insert({&*Region.second, I});
1431 RegionBB.push_back(ParentMBB);
1432 }
1433
1434#ifndef NDEBUG
1435 auto PrintTargetRegions = [&]() -> void {
1436 if (TargetRegions.none()) {
1437 dbgs() << REMAT_PREFIX << "No target regions\n";
1438 return;
1439 }
1440 dbgs() << REMAT_PREFIX << "Target regions:\n";
1441 for (unsigned I : TargetRegions.set_bits())
1442 dbgs() << REMAT_PREFIX << " [" << I << "] " << RPTargets[I] << '\n';
1443 };
1444 auto PrintRematReg = [&](const RematReg &Remat) -> Printable {
1445 return Printable([&, Remat](raw_ostream &OS) {
1446 // Concatenate all region numbers in which the register is unused and
1447 // live-through.
1448 bool HasLiveThroughRegion = false;
1449 OS << '[' << Remat.DefRegion << " -";
1450 for (unsigned I = 0; I < NumRegions; ++I) {
1451 if (Remat.isUnusedLiveThrough(I)) {
1452 if (HasLiveThroughRegion) {
1453 OS << ',';
1454 } else {
1455 OS << "- ";
1456 HasLiveThroughRegion = true;
1457 }
1458 OS << I;
1459 }
1460 }
1461 if (HasLiveThroughRegion)
1462 OS << " -";
1463 OS << "-> " << Remat.UseRegion << "] ";
1464 Remat.DefMI->print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
1465 /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
1466 });
1467 };
1468#endif
1469
1470 // Set an objective for the stage based on current RP in each region.
1471 REMAT_DEBUG({
1472 dbgs() << "Analyzing ";
1473 MF.getFunction().printAsOperand(dbgs(), false);
1474 dbgs() << ": ";
1475 });
1476 if (!setObjective()) {
1477 LLVM_DEBUG(dbgs() << "no objective to achieve, occupancy is maximal at "
1478 << MFI.getMaxWavesPerEU() << '\n');
1479 return false;
1480 }
1481 LLVM_DEBUG({
1482 if (TargetOcc) {
1483 dbgs() << "increase occupancy from " << *TargetOcc - 1 << '\n';
1484 } else {
1485 dbgs() << "reduce spilling (minimum target occupancy is "
1486 << MFI.getMinWavesPerEU() << ")\n";
1487 }
1488 PrintTargetRegions();
1489 });
1490
1491 if (!collectRematRegs(MIRegion)) {
1492 REMAT_DEBUG(dbgs() << "No rematerializable registers\n");
1493 return false;
1494 }
1495 const ScoredRemat::FreqInfo FreqInfo(MF, DAG);
1496 REMAT_DEBUG({
1497 dbgs() << "Rematerializable registers:\n";
1498 for (const RematReg &Remat : RematRegs)
1499 dbgs() << REMAT_PREFIX << " " << PrintRematReg(Remat) << '\n';
1500 dbgs() << REMAT_PREFIX << "Region frequencies\n";
1501 for (auto [I, Freq] : enumerate(FreqInfo.Regions)) {
1502 dbgs() << REMAT_PREFIX << " [" << I << "] ";
1503 if (Freq)
1504 dbgs() << Freq;
1505 else
1506 dbgs() << "unknown ";
1507 dbgs() << " | " << *DAG.Regions[I].first;
1508 }
1509 });
1510
1511 SmallVector<ScoredRemat> ScoredRemats;
1512 for (RematReg &Remat : RematRegs)
1513 ScoredRemats.emplace_back(&Remat, FreqInfo, DAG);
1514
1515// Rematerialize registers in successive rounds until all RP targets are
1516// satisifed or until we run out of rematerialization candidates.
1517#ifndef NDEBUG
1518 unsigned RoundNum = 0;
1519#endif
1520 BitVector RecomputeRP(NumRegions);
1521 do {
1522 assert(!ScoredRemats.empty() && "no more remat candidates");
1523
1524 // (Re-)Score and (re-)sort all remats in increasing score order.
1525 for (ScoredRemat &Remat : ScoredRemats)
1526 Remat.update(TargetRegions, RPTargets, FreqInfo, !TargetOcc);
1527 sort(ScoredRemats);
1528
1529 REMAT_DEBUG({
1530 dbgs() << "==== ROUND " << RoundNum++ << " ====\n"
1531 << REMAT_PREFIX
1532 << "Candidates with non-null score, in rematerialization order:\n";
1533 for (const ScoredRemat &RematDecision : reverse(ScoredRemats)) {
1534 if (RematDecision.hasNullScore())
1535 break;
1536 dbgs() << REMAT_PREFIX << " " << RematDecision.print() << " | "
1537 << *RematDecision.Remat->DefMI;
1538 }
1539 PrintTargetRegions();
1540 });
1541
1542 RecomputeRP.reset();
1543 unsigned RematIdx = ScoredRemats.size();
1544
1545 // Rematerialize registers in decreasing score order until we estimate
1546 // that all RP targets are satisfied or until rematerialization candidates
1547 // are no longer useful to decrease RP.
1548 for (; RematIdx && TargetRegions.any(); --RematIdx) {
1549 const ScoredRemat &Candidate = ScoredRemats[RematIdx - 1];
1550 // Stop rematerializing on encountering a null score. Since scores
1551 // monotonically decrease as we rematerialize, we know there is nothing
1552 // useful left to do in such cases, even if we were to re-score.
1553 if (Candidate.hasNullScore()) {
1554 RematIdx = 0;
1555 break;
1556 }
1557
1558 RematReg &Remat = *Candidate.Remat;
1559 // When previous rematerializations in this round have already satisfied
1560 // RP targets in all regions this rematerialization can impact, we have a
1561 // good indication that our scores have diverged significantly from
1562 // reality, in which case we interrupt this round and re-score. This also
1563 // ensures that every rematerialization we perform is possibly impactful
1564 // in at least one target region.
1565 if (!Remat.maybeBeneficial(TargetRegions, RPTargets))
1566 break;
1567
1568 REMAT_DEBUG(dbgs() << "** REMAT " << PrintRematReg(Remat) << '\n';);
1569 MachineInstr *RematMI =
1570 Candidate.rematerialize(RecomputeRP, RPTargets, DAG);
1571 RescheduleRegions |= Remat.Live;
1572
1573 // Every rematerialization we do here is likely to move the instruction
1574 // into a higher frequency region, increasing the total sum latency of the
1575 // instruction itself. This is acceptable if we are eliminating a spill in
1576 // the process, but when the goal is increasing occupancy we get nothing
1577 // out of rematerialization if occupancy is not increased in the end; in
1578 // such cases we want to roll back the rematerialization.
1579 if (TargetOcc) {
1580 RollbackInfo &Rollback = Rollbacks.emplace_back(&Remat);
1581 Rollback.RematMI = RematMI;
1582 // Make the original MI a debug value so that it does not influence
1583 // scheduling and replace all read registers with a sentinel register to
1584 // prevent operands to appear in use-lists of other MIs during LIS
1585 // updates. Store mappings between operand indices and original
1586 // registers for potential rollback.
1587 Remat.DefMI->setDesc(DAG.TII->get(TargetOpcode::DBG_VALUE));
1588 for (auto [Idx, MO] : enumerate(Remat.DefMI->operands())) {
1589 if (MO.isReg() && MO.readsReg()) {
1590 Rollback.RegMap.insert({Idx, MO.getReg()});
1591 MO.setReg(Register());
1592 }
1593 }
1594 } else {
1595 // Just delete the original instruction if it cannot be rolled back.
1596 DAG.deleteMI(Remat.DefRegion, Remat.DefMI);
1597 }
1598
1599 unsetSatisfiedRPTargets(Remat.Live);
1600 }
1601
1602 REMAT_DEBUG({
1603 if (!TargetRegions.any()) {
1604 dbgs() << "** Interrupt round on all targets achieved\n";
1605 } else if (RematIdx) {
1606 dbgs() << "** Interrupt round on stale score for "
1607 << *ScoredRemats[RematIdx - 1].Remat->DefMI;
1608 } else {
1609 dbgs() << "** Stop on exhausted rematerialization candidates\n";
1610 }
1611 });
1612
1613 // Peel off registers we already rematerialized from the vector's tail.
1614 ScoredRemats.truncate(RematIdx);
1615 } while ((updateAndVerifyRPTargets(RecomputeRP) || TargetRegions.any()) &&
1616 !ScoredRemats.empty());
1617 if (RescheduleRegions.none())
1618 return false;
1619
1620 // Commit all pressure changes to the DAG and compute minimum achieved
1621 // occupancy in impacted regions.
1622 REMAT_DEBUG(dbgs() << "==== REMAT RESULTS ====\n");
1623 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
1624 for (unsigned I : RescheduleRegions.set_bits()) {
1625 DAG.Pressure[I] = RPTargets[I].getCurrentRP();
1626 REMAT_DEBUG(dbgs() << '[' << I << "] Achieved occupancy "
1627 << DAG.Pressure[I].getOccupancy(ST, DynamicVGPRBlockSize)
1628 << " (" << RPTargets[I] << ")\n");
1629 }
1630 AchievedOcc = MFI.getMaxWavesPerEU();
1631 for (const GCNRegPressure &RP : DAG.Pressure) {
1632 AchievedOcc =
1633 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
1634 }
1635
1636 REMAT_DEBUG({
1637 dbgs() << "Retrying function scheduling with new min. occupancy of "
1638 << AchievedOcc << " from rematerializing (original was "
1639 << DAG.MinOccupancy;
1640 if (TargetOcc)
1641 dbgs() << ", target was " << *TargetOcc;
1642 dbgs() << ")\n";
1643 });
1644
1645 DAG.setTargetOccupancy(getStageTargetOccupancy());
1646 return true;
1647}
1648
1650 DAG.finishBlock();
1651 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1652}
1653
1655 SavedMutations.swap(DAG.Mutations);
1656 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1657 if (DAG.MinOccupancy > InitialOccupancy) {
1658 assert(IsAnyRegionScheduled);
1660 << " stage successfully increased occupancy to "
1661 << DAG.MinOccupancy << '\n');
1662 } else if (!IsAnyRegionScheduled) {
1663 assert(DAG.MinOccupancy == InitialOccupancy);
1665 << ": No regions scheduled, min occupancy stays at "
1666 << DAG.MinOccupancy << ", MFI occupancy stays at "
1667 << MFI.getOccupancy() << ".\n");
1668 }
1669
1671}
1672
1674 // Skip empty scheduling region.
1675 if (DAG.begin() == DAG.end())
1676 return false;
1677
1678 // Check whether this new region is also a new block.
1679 if (DAG.RegionBegin->getParent() != CurrentMBB)
1680 setupNewBlock();
1681
1682 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1683 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1684
1685 // Skip regions with 1 schedulable instruction.
1686 if (DAG.begin() == std::prev(DAG.end()))
1687 return false;
1688
1689 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1690 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1691 << " " << CurrentMBB->getName()
1692 << "\n From: " << *DAG.begin() << " To: ";
1693 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1694 else dbgs() << "End";
1695 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1696
1697 // Save original instruction order before scheduling for possible revert.
1698 Unsched.clear();
1699 Unsched.reserve(DAG.NumRegionInstrs);
1702 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1703 for (auto &I : DAG) {
1704 Unsched.push_back(&I);
1705 if (SII->isIGLPMutationOnly(I.getOpcode()))
1706 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1707 }
1708 } else {
1709 for (auto &I : DAG)
1710 Unsched.push_back(&I);
1711 }
1712
1713 PressureBefore = DAG.Pressure[RegionIdx];
1714
1715 LLVM_DEBUG(
1716 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1717 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1718 << "Region live-in pressure: "
1719 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1720 << "Region register pressure: " << print(PressureBefore));
1721
1722 S.HasHighPressure = false;
1723 S.KnownExcessRP = isRegionWithExcessRP();
1724
1725 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1727 SavedMutations.clear();
1728 SavedMutations.swap(DAG.Mutations);
1729 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1731 DAG.addMutation(createIGroupLPDAGMutation(
1732 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1734 }
1735
1736 return true;
1737}
1738
1740 // Only reschedule regions that have excess register pressure (i.e. spilling)
1741 // or had minimum occupancy at the beginning of the stage (as long as
1742 // rescheduling of previous regions did not make occupancy drop back down to
1743 // the initial minimum).
1744 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1745 // If no region has been scheduled yet, the DAG has not yet been updated with
1746 // the occupancy target. So retrieve it from the temporary.
1747 unsigned CurrentTargetOccupancy =
1748 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1749 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1750 (CurrentTargetOccupancy <= InitialOccupancy ||
1751 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1752 InitialOccupancy))
1753 return false;
1754
1755 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1756 // If this is the first region scheduled during this stage, make the target
1757 // occupancy changes in the DAG and MFI.
1758 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1759 IsAnyRegionScheduled = true;
1760 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
1761 DAG.setTargetOccupancy(TempTargetOccupancy);
1762 }
1763 return IsSchedulingThisRegion;
1764}
1765
1767 // We may need to reschedule this region if it wasn't rescheduled in the last
1768 // stage, or if we found it was testing critical register pressure limits in
1769 // the unclustered reschedule stage. The later is because we may not have been
1770 // able to raise the min occupancy in the previous stage so the region may be
1771 // overly constrained even if it was already rescheduled.
1772 if (!DAG.RegionsWithHighRP[RegionIdx])
1773 return false;
1774
1776}
1777
1779 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1780}
1781
1783 if (CurrentMBB)
1784 DAG.finishBlock();
1785
1786 CurrentMBB = DAG.RegionBegin->getParent();
1787 DAG.startBlock(CurrentMBB);
1788 // Get real RP for the region if it hasn't be calculated before. After the
1789 // initial schedule stage real RP will be collected after scheduling.
1793 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1794}
1795
1797 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1798 if (S.HasHighPressure)
1799 DAG.RegionsWithHighRP[RegionIdx] = true;
1800
1801 // Revert scheduling if we have dropped occupancy or there is some other
1802 // reason that the original schedule is better.
1804
1805 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1807 SavedMutations.swap(DAG.Mutations);
1808}
1809
1812 // When the goal is to increase occupancy, all regions must reach the target
1813 // occupancy for rematerializations to be possibly useful, otherwise we will
1814 // just hurt latency for no benefit. If minimum occupancy drops below the
1815 // target there is no point in trying to re-schedule further regions.
1816 if (!TargetOcc)
1817 return;
1818 RegionReverts.emplace_back(RegionIdx, Unsched, PressureBefore);
1819 if (DAG.MinOccupancy < *TargetOcc) {
1820 REMAT_DEBUG(dbgs() << "Region " << RegionIdx
1821 << " cannot meet occupancy target, interrupting "
1822 "re-scheduling in all regions\n");
1823 RescheduleRegions.reset();
1824 }
1825}
1826
1828 // Check the results of scheduling.
1829 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1830
1831 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1832 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1833
1834 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1835
1836 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1837 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1838 DAG.Pressure[RegionIdx] = PressureAfter;
1839
1840 // Early out if we have achieved the occupancy target.
1841 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1842 return;
1843 }
1844
1845 unsigned TargetOccupancy = std::min(
1846 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1847 unsigned WavesAfter = std::min(
1848 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1849 unsigned WavesBefore = std::min(
1850 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1851 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1852 << ", after " << WavesAfter << ".\n");
1853
1854 // We may not be able to keep the current target occupancy because of the just
1855 // scheduled region. We might still be able to revert scheduling if the
1856 // occupancy before was higher, or if the current schedule has register
1857 // pressure higher than the excess limits which could lead to more spilling.
1858 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1859
1860 // Allow memory bound functions to drop to 4 waves if not limited by an
1861 // attribute.
1862 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1863 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1864 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1865 << MFI.getMinAllowedOccupancy() << " waves\n");
1866 NewOccupancy = WavesAfter;
1867 }
1868
1869 if (NewOccupancy < DAG.MinOccupancy) {
1870 DAG.MinOccupancy = NewOccupancy;
1871 MFI.limitOccupancy(DAG.MinOccupancy);
1872 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1873 << DAG.MinOccupancy << ".\n");
1874 }
1875 // The maximum number of arch VGPR on non-unified register file, or the
1876 // maximum VGPR + AGPR in the unified register file case.
1877 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1878 // The maximum number of arch VGPR for both unified and non-unified register
1879 // file.
1880 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1881 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1882
1883 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1884 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1885 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1886 PressureAfter.getSGPRNum() > MaxSGPRs) {
1887 DAG.RegionsWithHighRP[RegionIdx] = true;
1888 DAG.RegionsWithExcessRP[RegionIdx] = true;
1889 }
1890
1891 // Revert if this region's schedule would cause a drop in occupancy or
1892 // spilling.
1893 if (shouldRevertScheduling(WavesAfter)) {
1895 std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
1896 } else {
1897 DAG.Pressure[RegionIdx] = PressureAfter;
1898 }
1899}
1900
1901unsigned
1902GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1903 DenseMap<unsigned, unsigned> &ReadyCycles,
1904 const TargetSchedModel &SM) {
1905 unsigned ReadyCycle = CurrCycle;
1906 for (auto &D : SU.Preds) {
1907 if (D.isAssignedRegDep()) {
1908 MachineInstr *DefMI = D.getSUnit()->getInstr();
1909 unsigned Latency = SM.computeInstrLatency(DefMI);
1910 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1911 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1912 }
1913 }
1914 ReadyCycles[SU.NodeNum] = ReadyCycle;
1915 return ReadyCycle;
1916}
1917
1918#ifndef NDEBUG
1920 bool operator()(std::pair<MachineInstr *, unsigned> A,
1921 std::pair<MachineInstr *, unsigned> B) const {
1922 return A.second < B.second;
1923 }
1924};
1925
1926static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1927 EarlierIssuingCycle> &ReadyCycles) {
1928 if (ReadyCycles.empty())
1929 return;
1930 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1931 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1932 << " ##################\n# Cycle #\t\t\tInstruction "
1933 " "
1934 " \n";
1935 unsigned IPrev = 1;
1936 for (auto &I : ReadyCycles) {
1937 if (I.second > IPrev + 1)
1938 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1939 << " CYCLES DETECTED ******************************\n\n";
1940 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1941 IPrev = I.second;
1942 }
1943}
1944#endif
1945
1946ScheduleMetrics
1947GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1948#ifndef NDEBUG
1949 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1950 ReadyCyclesSorted;
1951#endif
1952 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1953 unsigned SumBubbles = 0;
1954 DenseMap<unsigned, unsigned> ReadyCycles;
1955 unsigned CurrCycle = 0;
1956 for (auto &SU : InputSchedule) {
1957 unsigned ReadyCycle =
1958 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1959 SumBubbles += ReadyCycle - CurrCycle;
1960#ifndef NDEBUG
1961 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1962#endif
1963 CurrCycle = ++ReadyCycle;
1964 }
1965#ifndef NDEBUG
1966 LLVM_DEBUG(
1967 printScheduleModel(ReadyCyclesSorted);
1968 dbgs() << "\n\t"
1969 << "Metric: "
1970 << (SumBubbles
1971 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1972 : 1)
1973 << "\n\n");
1974#endif
1975
1976 return ScheduleMetrics(CurrCycle, SumBubbles);
1977}
1978
1981#ifndef NDEBUG
1982 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1983 ReadyCyclesSorted;
1984#endif
1985 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1986 unsigned SumBubbles = 0;
1987 DenseMap<unsigned, unsigned> ReadyCycles;
1988 unsigned CurrCycle = 0;
1989 for (auto &MI : DAG) {
1990 SUnit *SU = DAG.getSUnit(&MI);
1991 if (!SU)
1992 continue;
1993 unsigned ReadyCycle =
1994 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1995 SumBubbles += ReadyCycle - CurrCycle;
1996#ifndef NDEBUG
1997 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1998#endif
1999 CurrCycle = ++ReadyCycle;
2000 }
2001#ifndef NDEBUG
2002 LLVM_DEBUG(
2003 printScheduleModel(ReadyCyclesSorted);
2004 dbgs() << "\n\t"
2005 << "Metric: "
2006 << (SumBubbles
2007 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
2008 : 1)
2009 << "\n\n");
2010#endif
2011
2012 return ScheduleMetrics(CurrCycle, SumBubbles);
2013}
2014
2015bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
2016 if (WavesAfter < DAG.MinOccupancy)
2017 return true;
2018
2019 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
2020 if (DAG.MFI.isDynamicVGPREnabled()) {
2021 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2022 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2023 PressureBefore.getVGPRNum(false));
2024 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
2025 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
2026 PressureAfter.getVGPRNum(false));
2027 if (BlocksAfter > BlocksBefore)
2028 return true;
2029 }
2030
2031 return false;
2032}
2033
2036 return false;
2037
2039 return true;
2040
2041 if (mayCauseSpilling(WavesAfter))
2042 return true;
2043
2044 return false;
2045}
2046
2048 // If RP is not reduced in the unclustered reschedule stage, revert to the
2049 // old schedule.
2050 if ((WavesAfter <=
2051 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
2052 mayCauseSpilling(WavesAfter)) ||
2054 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
2055 return true;
2056 }
2057
2058 // Do not attempt to relax schedule even more if we are already spilling.
2060 return false;
2061
2062 LLVM_DEBUG(
2063 dbgs()
2064 << "\n\t *** In shouldRevertScheduling ***\n"
2065 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
2066 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
2067 LLVM_DEBUG(
2068 dbgs()
2069 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
2071 unsigned OldMetric = MBefore.getMetric();
2072 unsigned NewMetric = MAfter.getMetric();
2073 unsigned WavesBefore = std::min(
2074 S.getTargetOccupancy(),
2075 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
2076 unsigned Profit =
2077 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
2079 NewMetric) /
2081 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
2082 << MAfter << "Profit: " << Profit << "\n");
2083 return Profit < ScheduleMetrics::ScaleFactor;
2084}
2085
2088 return false;
2089
2091 return true;
2092
2093 if (mayCauseSpilling(WavesAfter))
2094 return true;
2095
2096 return false;
2097}
2098
2100 // When trying to increase occupancy (TargetOcc == true) the stage manages
2101 // region reverts globally (all or none), so we always return false here.
2102 return !TargetOcc && mayCauseSpilling(WavesAfter);
2103}
2104
2106 if (mayCauseSpilling(WavesAfter))
2107 return true;
2108
2109 return false;
2110}
2111
2113 unsigned WavesAfter) {
2114 return mayCauseSpilling(WavesAfter);
2115}
2116
2117bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
2118 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
2120 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
2121 return true;
2122 }
2123
2124 return false;
2125}
2126
2129 ArrayRef<MachineInstr *> MIOrder) {
2130 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
2131 DAG.Regions[RegionIdx].second)) ==
2132 MIOrder.size() &&
2133 "instruction number mismatch");
2134 if (MIOrder.empty())
2135 return;
2136
2137 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
2138
2139 // Reconstruct MI sequence by moving instructions in desired order before
2140 // the current region's start.
2141 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
2142 for (MachineInstr *MI : MIOrder) {
2143 // Either move the next MI in order before the end of the region or move the
2144 // region end past the MI if it is at the correct position.
2145 MachineBasicBlock::iterator MII = MI->getIterator();
2146 if (MII != RegionEnd) {
2147 // Will subsequent splice move MI up past a non-debug instruction?
2148 bool NonDebugReordered =
2149 !MI->isDebugInstr() &&
2150 skipDebugInstructionsForward(RegionEnd, MII) != MII;
2151 MBB->splice(RegionEnd, MBB, MI);
2152 // Only update LiveIntervals information if non-debug instructions are
2153 // reordered. Otherwise debug instructions could cause code generation to
2154 // change.
2155 if (NonDebugReordered)
2156 DAG.LIS->handleMove(*MI, true);
2157 } else {
2158 ++RegionEnd;
2159 }
2160 if (MI->isDebugInstr()) {
2161 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2162 continue;
2163 }
2164
2165 // Reset read-undef flags and update them later.
2166 for (MachineOperand &Op : MI->all_defs())
2167 Op.setIsUndef(false);
2168 RegisterOperands RegOpers;
2169 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
2170 if (DAG.ShouldTrackLaneMasks) {
2171 // Adjust liveness and add missing dead+read-undef flags.
2172 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
2173 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
2174 } else {
2175 // Adjust for missing dead-def flags.
2176 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
2177 }
2178 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
2179 }
2180
2181 // The region end doesn't change throughout scheduling since it itself is
2182 // outside the region (whether that is a MBB end or a terminator MI).
2183 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
2184 DAG.Regions[RegionIdx].first = MIOrder.front();
2185}
2186
2187bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
2188
2189 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
2190 return false;
2191 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
2192}
2193
/// Pre-pass for the MFMA AGPR-rewrite heuristics.
///
/// Provisionally switches every rewrite-candidate MFMA in the function to its
/// AGPR-dst form and reclassifies its dst register (and reg src2) to the
/// equivalent AGPR class, so register pressure can later be recomputed as if
/// the rewrite had happened. While scanning, it records what the cost model
/// needs:
///   - \p RewriteCands: each rewritten MFMA paired with its original opcode,
///     so the change can be undone.
///   - \p CopyForDef: non-MFMA instructions that define a value reaching an
///     MFMA src2, or reaching a non-MFMA user of an MFMA result; each would
///     need a copy inserted after it.
///   - \p CopyForUse: per-block register sets for non-MFMA uses of MFMA
///     results; one copy per (block, register) would be needed.
///
/// \returns true if at least one MFMA was provisionally rewritten.
bool RewriteMFMAFormStage::initHeuristics(
    std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  bool Changed = false;

  // Prepare for the heuristics
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (!isRewriteCandidate(&MI))
        continue;

      // isRewriteCandidate() guarantees an AGPR-dst replacement exists.
      int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
      assert(ReplacementOp != -1);

      // Remember the original opcode, then provisionally switch the MFMA to
      // its AGPR-dst form.
      RewriteCands.push_back({&MI, MI.getOpcode()});
      MI.setDesc(TII->get(ReplacementOp));

      MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
      if (Src2->isReg()) {
        SmallVector<SlotIndex, 8> Src2ReachingDefs;
        findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);

        // For any definition of the src2 register which is non-MFMA, we
        // insert a copy.
        for (SlotIndex RDIdx : Src2ReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
          if (!TII->isMAI(*RD))
            CopyForDef.insert(RD);
        }
      }

      MachineOperand &Dst = MI.getOperand(0);
      SmallVector<MachineOperand *, 8> DstReachingUses;

      findReachingUses(&MI, DAG.LIS, DstReachingUses);

      for (MachineOperand *RUOp : DstReachingUses) {
        // MFMA-to-MFMA data flow stays in AGPRs; no copy needed.
        if (TII->isMAI(*RUOp->getParent()))
          continue;

        // For any user of the result of the MFMA which is not an MFMA, we
        // insert a copy. For a given register, we will only insert one copy
        // per user block.
        CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());

        SmallVector<SlotIndex, 8> DstUsesReachingDefs;
        findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);

        for (SlotIndex RDIndex : DstUsesReachingDefs) {
          MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
          if (TII->isMAI(*RD))
            continue;

          // For any definition of the user of the MFMA which is not an MFMA,
          // we insert a copy. We do this to transform all the reaching defs
          // of this use to AGPR. By doing this, we can insert a copy from
          // AGPR to VGPR at the user rather than after the MFMA.
          CopyForDef.insert(RD);
        }
      }

      // Do the rewrite to allow for updated RP calculation.
      const TargetRegisterClass *VDefRC = DAG.MRI.getRegClass(Dst.getReg());
      const TargetRegisterClass *ADefRC = SRI->getEquivalentAGPRClass(VDefRC);
      DAG.MRI.setRegClass(Dst.getReg(), ADefRC);
      if (Src2->isReg()) {
        // Have to get src types separately since subregs may cause C and D
        // registers to be different types even though the actual operand is
        // the same size.
        const TargetRegisterClass *VUseRC = DAG.MRI.getRegClass(Src2->getReg());
        const TargetRegisterClass *AUseRC = SRI->getEquivalentAGPRClass(VUseRC);
        DAG.MRI.setRegClass(Src2->getReg(), AUseRC);
      }
      Changed = true;
    }
  }

  return Changed;
}
2274
/// Estimate the net cost of rewriting the MFMAs in \p RewriteCands to their
/// AGPR-dst forms. Negative means the rewrite looks profitable.
///
/// Expects the provisional rewrite done by initHeuristics() to still be in
/// place; before returning, it restores the original opcodes and VGPR
/// register classes (rewrite() re-applies them if the rewrite is committed).
///
/// The result combines the best per-region reduction in estimated VGPR spill
/// cost with the cost of the cross-regbank copies implied by \p CopyForDef
/// and \p CopyForUse, both weighted by block frequency relative to the entry
/// block. If any region's spilling would increase, that positive cost is
/// returned immediately.
int64_t RewriteMFMAFormStage::getRewriteCost(
    const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
    const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
    const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
  MachineBlockFrequencyInfo *MBFI = DAG.MBFI;

  int64_t BestSpillCost = 0;
  int64_t Cost = 0;
  uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();

  // Vector-register budgets used to translate pressure into spill counts.
  std::pair<unsigned, unsigned> MaxVectorRegs =
      ST.getMaxNumVectorRegs(MF.getFunction());
  unsigned ArchVGPRThreshold = MaxVectorRegs.first;
  unsigned AGPRThreshold = MaxVectorRegs.second;
  unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);

  for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
    // Only regions with ArchVGPR excess can benefit from the rewrite.
    if (!RegionsWithExcessArchVGPR[Region])
      continue;

    // Spill estimate with the cached (pre-rewrite) pressure.
    GCNRegPressure &PressureBefore = DAG.Pressure[Region];
    unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    // For the cases we care about (i.e. ArchVGPR usage is greater than the
    // addressable limit), rewriting alone should bring pressure to manageable
    // level. If we find any such region, then the rewrite is potentially
    // beneficial.
    GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
    unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
        MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);

    uint64_t BlockFreq =
        MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
            .getFrequency();

    // Express this region's frequency relative to the entry block, keeping
    // the ratio >= 1 and remembering which way it divides.
    bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
    uint64_t RelativeFreq = EntryFreq && BlockFreq
                                ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
                                                       : BlockFreq / EntryFreq)
                                : 1;

    // This assumes perfect spilling / splitting -- using one spill / copy
    // instruction and one restoreFrom / copy for each excess register,
    int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;

    // Also account for the block frequency.
    if (RelativeFreqIsDenom)
      SpillCost /= (int64_t)RelativeFreq;
    else
      SpillCost *= (int64_t)RelativeFreq;

    // If we have increased spilling in any block, just bail.
    if (SpillCost > 0)
      return SpillCost;

    if (SpillCost < BestSpillCost)
      BestSpillCost = SpillCost;
  }

  // Set the cost to the largest decrease in spill cost in order to not double
  // count spill reductions.
  Cost = BestSpillCost;
  assert(Cost <= 0);

  unsigned CopyCost = 0;

  // For each CopyForDef, increase the cost by the register size while
  // accounting for block frequency.
  for (MachineInstr *DefMI : CopyForDef) {
    Register DefReg = DefMI->getOperand(0).getReg();
    uint64_t DefFreq =
        EntryFreq
            ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
            : 1;

    const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
    CopyCost += RC->getCopyCost() * DefFreq;
  }

  // Account for CopyForUse copies in each block that the register is used.
  for (auto &[UseBlock, UseRegs] : CopyForUse) {
    uint64_t UseFreq =
        EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;

    for (Register UseReg : UseRegs) {
      const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
      CopyCost += RC->getCopyCost() * UseFreq;
    }
  }

  // Reset the classes that were changed to AGPR for better RB analysis.
  // We must do rewriting after copy-insertion, as some defs of the register
  // may require VGPR. Additionally, if we bail out and don't perform the
  // rewrite then these need to be restored anyway.
  for (auto &[MI, OriginalOpcode] : RewriteCands) {
    assert(TII->isMAI(*MI));
    const TargetRegisterClass *ADefRC =
        DAG.MRI.getRegClass(MI->getOperand(0).getReg());
    const TargetRegisterClass *VDefRC = SRI->getEquivalentVGPRClass(ADefRC);
    DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VDefRC);
    MI->setDesc(TII->get(OriginalOpcode));

    MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
    assert(Src2);
    if (!Src2->isReg())
      continue;

    // Have to get src types separately since subregs may cause C and D
    // registers to be different types even though the actual operand is
    // the same size.
    const TargetRegisterClass *AUseRC = DAG.MRI.getRegClass(Src2->getReg());
    const TargetRegisterClass *VUseRC = SRI->getEquivalentVGPRClass(AUseRC);
    DAG.MRI.setRegClass(Src2->getReg(), VUseRC);
  }

  return Cost + CopyCost;
}
2393
2394bool RewriteMFMAFormStage::rewrite(
2395 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2396 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2397 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2398
2399 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2400 RegionBoundaries Entry = DAG.Regions[Region];
2401 if (Entry.first == Entry.second)
2402 continue;
2403
2404 FirstMIToRegion[&*Entry.first] = Region;
2405 if (Entry.second != Entry.first->getParent()->end())
2406 LastMIToRegion[&*Entry.second] = Region;
2407 }
2408
2409 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2410 // The general assumption of the algorithm (and the previous cost calculation)
2411 // is that it is better to insert the copies in the MBB of the def of the src2
2412 // operands, and in the MBB of the user of the dest operands. This is based on
2413 // the assumption that the MFMAs are likely to appear in loop bodies, while
2414 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2415 // design, the algorithm for finding copy insertion points is more
2416 // complicated.
2417 //
2418 // There are three main cases to handle: 1. the reaching defs of the src2
2419 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2420 // defs of the reaching uses of the dst operand.
2421 //
2422 // In the first case, we simply insert copies after each of the reaching
2423 // definitions. In the second case, we collect all the uses of a given dest
2424 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2425 // earliest use. Since the use may have multiple reaching defs, and since we
2426 // want to replace the register it is using with the result of the copy, we
2427 // must handle case 3. In the third case, we simply insert a copy after each
2428 // of the reaching defs to connect to the copy of the reaching uses of the dst
2429 // reg. This allows us to avoid inserting copies next to the MFMAs.
2430 //
2431 // While inserting the copies, we maintain a map of operands which will use
2432 // different regs (i.e. the result of the copies). For example, a case 1 src2
2433 // operand will use the register result of the copies after the reaching defs,
2434 // as opposed to the original register. Now that we have completed our copy
2435 // analysis and placement, we can bulk update the registers. We do this
2436 // separately as to avoid complicating the reachingDef and reachingUse
2437 // queries.
2438 //
2439 // While inserting the copies, we also maintain a list or registers which we
2440 // will want to reclassify as AGPR. After doing the copy insertion and the
2441 // register replacement, we can finally do the reclassification. This uses the
2442 // redef map, as the registers we are interested in reclassifying may be
2443 // replaced by the result of a copy. We must do this after the copy analysis
2444 // and placement as we must have an accurate redef map -- otherwise we may end
2445 // up creating illegal instructions.
2446
2447 // The original registers of the MFMA that need to be reclassified as AGPR.
2448 DenseSet<Register> RewriteRegs;
2449 // The map of an original register in the MFMA to a new register (result of a
2450 // copy) that it should be replaced with.
2451 DenseMap<Register, Register> RedefMap;
2452 // The map of the original MFMA registers to the relevant MFMA operands.
2453 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2454 // The map of reaching defs for a given register -- to avoid duplicate copies.
2455 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2456 // The map of reaching uses for a given register by basic block -- to avoid
2457 // duplicate copies and to calculate per MBB insert pts.
2458 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2459 ReachingUseTracker;
2460
2461 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2462 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2463 if (ReplacementOp == -1)
2464 continue;
2465 MI->setDesc(TII->get(ReplacementOp));
2466
2467 // Case 1: insert copies for the reaching defs of the Src2Reg.
2468 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2469 if (Src2->isReg()) {
2470 Register Src2Reg = Src2->getReg();
2471 if (!Src2Reg.isVirtual())
2472 return false;
2473
2474 Register MappedReg = Src2->getReg();
2475 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2476 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2477 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2478
2479 for (SlotIndex RDIndex : Src2ReachingDefs) {
2480 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2481 if (TII->isMAI(*RD))
2482 continue;
2483
2484 // If there is a non mai reaching def, then we need a copy.
2485 Src2DefsReplace.insert(RD);
2486 }
2487
2488 if (!Src2DefsReplace.empty()) {
2489 DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
2490 if (RI != RedefMap.end()) {
2491 MappedReg = RI->second;
2492 } else {
2493 assert(!ReachingDefCopyMap.contains(Src2Reg));
2494 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2495 const TargetRegisterClass *VGPRRC =
2496 SRI->getEquivalentVGPRClass(Src2RC);
2497
2498 // Track the mapping of the original register to the new register.
2499 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2500 RedefMap[Src2Reg] = MappedReg;
2501 }
2502
2503 // If none exists, create a copy from this reaching def.
2504 // We may have inserted a copy already in an earlier iteration.
2505 for (MachineInstr *RD : Src2DefsReplace) {
2506 // Do not create redundant copies.
2507 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2508 MachineInstrBuilder VGPRCopy =
2509 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2510 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2511 .addDef(MappedReg, {}, 0)
2512 .addUse(Src2Reg, {}, 0);
2513 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2514
2515 // If this reaching def was the last MI in the region, update the
2516 // region boundaries.
2517 if (LastMIToRegion.contains(RD)) {
2518 unsigned UpdateRegion = LastMIToRegion[RD];
2519 DAG.Regions[UpdateRegion].second = VGPRCopy;
2520 LastMIToRegion.erase(RD);
2521 }
2522 }
2523 }
2524 }
2525
2526 // Track the register for reclassification
2527 RewriteRegs.insert(Src2Reg);
2528
2529 // Always insert the operand for replacement. If this corresponds with a
2530 // chain of tied-def we may not see the VGPR requirement until later.
2531 ReplaceMap[Src2Reg].insert(Src2);
2532 }
2533
2534 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2535 // and after the reaching defs of the reaching uses of the dsts.
2536
2537 MachineOperand *Dst = &MI->getOperand(0);
2538 Register DstReg = Dst->getReg();
2539 if (!DstReg.isVirtual())
2540 return false;
2541
2542 Register MappedReg = DstReg;
2543 SmallVector<MachineOperand *, 8> DstReachingUses;
2544
2545 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2546 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2547
2548 findReachingUses(MI, DAG.LIS, DstReachingUses);
2549
2550 for (MachineOperand *RUOp : DstReachingUses) {
2551 if (TII->isMAI(*RUOp->getParent()))
2552 continue;
2553
2554 // If there is a non mai reaching use, then we need a copy.
2555 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2556 DstReachingUseCopies.push_back(RUOp);
2557 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2558 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2559
2560 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2561 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2562 if (TII->isMAI(*RD))
2563 continue;
2564
2565 // If there is a non mai reaching def of this reaching use, then we will
2566 // need a copy.
2567 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2568 DstUseDefsReplace.push_back(RD);
2569 }
2570 }
2571
2572 if (!DstUseDefsReplace.empty()) {
2573 DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
2574 if (RI != RedefMap.end()) {
2575 MappedReg = RI->second;
2576 } else {
2577 assert(!ReachingDefCopyMap.contains(DstReg));
2578 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2579 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2580
2581 // Track the mapping of the original register to the new register.
2582 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2583 RedefMap[DstReg] = MappedReg;
2584 }
2585
2586 // If none exists, create a copy from this reaching def.
2587 // We may have inserted a copy already in an earlier iteration.
2588 for (MachineInstr *RD : DstUseDefsReplace) {
        // Do not create redundant copies.
2590 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2591 MachineInstrBuilder VGPRCopy =
2592 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2593 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2594 .addDef(MappedReg, {}, 0)
2595 .addUse(DstReg, {}, 0);
2596 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2597
2598 // If this reaching def was the last MI in the region, update the
2599 // region boundaries.
2601 LastMIToRegion.find(RD);
2602 if (LMI != LastMIToRegion.end()) {
2603 unsigned UpdateRegion = LMI->second;
2604 DAG.Regions[UpdateRegion].second = VGPRCopy;
2605 LastMIToRegion.erase(RD);
2606 }
2607 }
2608 }
2609 }
2610
2611 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2612 for (MachineOperand *RU : DstReachingUseCopies) {
2613 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2614 // Just keep track of the reaching use of this register by block. After we
2615 // have scanned all the MFMAs we can find optimal insert pts.
2616 if (RUBlock != MI->getParent()) {
2617 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2618 continue;
2619 }
2620
2621 // Special case, the use is in the same block as the MFMA. Insert the copy
2622 // just before the use.
2623 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2624 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2625 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2626 MachineInstr *UseInst = RU->getParent();
2627 MachineInstrBuilder VGPRCopy =
2628 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2629 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2630 .addDef(NewUseReg, {}, 0)
2631 .addUse(DstReg, {}, 0);
2632 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2633 // Since we know this use has only one reaching def, we can replace the
2634 // use reg.
2635 RU->setReg(NewUseReg);
      // Track the copy source operand for replacement.
2637 DstRegSet.insert(&VGPRCopy->getOperand(1));
2638 }
2639
2640 // Track the register for reclassification
2641 RewriteRegs.insert(DstReg);
2642
2643 // Insert the dst operand for replacement. If this dst is in a chain of
2644 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2645 // all the correspond operands need to be replaced.
2646 DstRegSet.insert(Dst);
2647 }
2648
2649 // Handle the copies for dst uses.
2650 using RUBType =
2651 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2652 for (RUBType RUBlockEntry : ReachingUseTracker) {
2653 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2654 for (RUDType RUDst : RUBlockEntry.second) {
2655 MachineOperand *OpBegin = *RUDst.second.begin();
2656 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2657
2658 // Find the earliest use in this block.
2659 for (MachineOperand *User : RUDst.second) {
2660 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2661 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2662 InstPt = NewInstPt;
2663 }
2664
2665 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2666 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2667 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2668 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2669
2670 MachineInstrBuilder VGPRCopy =
2671 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2672 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2673 .addDef(NewUseReg, {}, 0)
2674 .addUse(RUDst.first, {}, 0);
2675 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2676
2677 // If this UseInst was the first MI in the region, update the region
2678 // boundaries.
2680 FirstMIToRegion.find(UseInst);
2681 if (FI != FirstMIToRegion.end()) {
2682 unsigned UpdateRegion = FI->second;
2683 DAG.Regions[UpdateRegion].first = VGPRCopy;
2684 FirstMIToRegion.erase(UseInst);
2685 }
2686
2687 // Replace the operand for all users.
2688 for (MachineOperand *User : RUDst.second) {
2689 User->setReg(NewUseReg);
2690 }
2691
2692 // Track the copy source operand for replacement.
2693 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2694 }
2695 }
2696
2697 // We may have needed to insert copies after the reaching defs of the MFMAs.
2698 // Replace the original register with the result of the copy for all relevant
2699 // operands.
2700 for (std::pair<Register, Register> NewDef : RedefMap) {
2701 Register OldReg = NewDef.first;
2702 Register NewReg = NewDef.second;
2703
2704 // Replace the register for any associated operand in the MFMA chain.
2705 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2706 ReplaceOp->setReg(NewReg);
2707 }
2708
2709 // Finally, do the reclassification of the MFMA registers.
2710 for (Register RewriteReg : RewriteRegs) {
2711 Register RegToRewrite = RewriteReg;
2712
2713 // Be sure to update the replacement register and not the original.
2714 DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
2715 if (RI != RedefMap.end())
2716 RegToRewrite = RI->second;
2717
2718 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2719 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2720
2721 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2722 }
2723
2724 // Bulk update the LIS.
2725 DAG.LIS->reanalyze(DAG.MF);
2726 // Liveins may have been modified for cross RC copies
2727 RegionPressureMap LiveInUpdater(&DAG, false);
2728 LiveInUpdater.buildLiveRegMap();
2729
2730 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2731 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2732
2733 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2734
2735 return true;
2736}
2737
// Returns the occupancy this stage aims for: the explicit target when one was
// set by setObjective (occupancy-increase mode), otherwise the function's
// minimum waves-per-EU (TargetOcc is unset in spilling-reduction mode).
2738 unsigned PreRARematStage::getStageTargetOccupancy() const {
2739 return TargetOcc ? *TargetOcc : MFI.getMinWavesPerEU();
2740 }
2741
// Decides this stage's objective and seeds one RP target per region.
// Two modes:
//  - Spilling reduction: when some region exceeds addressable vector-register
//    limits or occupancy is already at/above the max waves-per-EU, TargetOcc
//    is left unset and targets are the function-wide SGPR/VGPR maximums.
//  - Occupancy increase: otherwise TargetOcc = MinOccupancy + 1 and targets
//    are tightened to the SGPR/VGPR limits of that occupancy.
// Returns true if at least one region fails its target (work to do).
2742 bool PreRARematStage::setObjective() {
2743 const Function &F = MF.getFunction();
2744
2745 // Set up "spilling targets" for all regions.
2746 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2747 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2748 bool HasVectorRegisterExcess = false;
2749 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2750 const GCNRegPressure &RP = DAG.Pressure[I];
2751 GCNRPTarget &Target = RPTargets.emplace_back(MaxSGPRs, MaxVGPRs, MF, RP);
2752 if (!Target.satisfied())
2753 TargetRegions.set(I);
2754 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2755 }
2756
2757 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2758 // In addition to register usage being above addressable limits, occupancy
2759 // below the minimum is considered like "spilling" as well.
2760 TargetOcc = std::nullopt;
2761 } else {
2762 // There is no spilling and room to improve occupancy; set up "increased
2763 // occupancy targets" for all regions.
2764 TargetOcc = DAG.MinOccupancy + 1;
2765 const unsigned VGPRBlockSize = MFI.getDynamicVGPRBlockSize();
2766 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2767 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2768 for (auto [I, Target] : enumerate(RPTargets)) {
2769 Target.setTarget(MaxSGPRs, MaxVGPRs);
2770 if (!Target.satisfied())
2771 TargetRegions.set(I);
2772 }
2773 }
2774
2775 return TargetRegions.any();
2776 }
2777
// Scans every scheduling region for rematerialization candidates and records
// them in RematRegs. A candidate must: pass isReMaterializable, define a
// virtual register with a single definition, have exactly one non-debug user
// located in a *different* region, not chain with an already-marked remat
// register, and have all of its own register uses available at the use point
// (so rematerializing extends no live range).
// \p MIRegion maps each instruction to its region index.
// \return true if at least one candidate was collected.
2778 bool PreRARematStage::collectRematRegs(
2779 const DenseMap<MachineInstr *, unsigned> &MIRegion) {
2780 // We need up-to-date live-out info. to query live-out register masks in
2781 // regions containing rematerializable instructions.
2782 DAG.RegionLiveOuts.buildLiveRegMap();
2783
2784 // Set of registers already marked for potential rematerialization; used to
2785 // avoid rematerialization chains.
2786 SmallSet<Register, 4> MarkedRegs;
2787 auto IsMarkedForRemat = [&MarkedRegs](const MachineOperand &MO) -> bool {
2788 return MO.isReg() && MarkedRegs.contains(MO.getReg());
2789 };
2790
2791 // Identify rematerializable instructions in the function.
2792 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2793 RegionBoundaries Bounds = DAG.Regions[I];
2794 for (auto MI = Bounds.first; MI != Bounds.second; ++MI) {
2795 // The instruction must be rematerializable.
2796 MachineInstr &DefMI = *MI;
2797 if (!isReMaterializable(DefMI))
2798 continue;
2799
2800 // We only support rematerializing virtual registers with one
2801 // definition.
2802 Register Reg = DefMI.getOperand(0).getReg();
2803 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
2804 continue;
2805
2806 // We only care to rematerialize the instruction if it has a single
2807 // non-debug user in a different region.
2808 // FIXME: Allow rematerializations with multiple uses. This should be
2809 // relatively easy to support using the current cost model.
2810 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
2811 if (!UseMI)
2812 continue;
2813 auto UseRegion = MIRegion.find(UseMI);
2814 if (UseRegion == MIRegion.end() || UseRegion->second == I)
2815 continue;
2816
2817 // Do not rematerialize an instruction if it uses or is used by an
2818 // instruction that we have designated for rematerialization.
2819 // FIXME: Allow for rematerialization chains: this requires 1. updating
2820 // remat points to account for uses that are rematerialized, and 2.
2821 // either rematerializing the candidates in careful ordering, or
2822 // deferring the MBB RP walk until the entire chain has been
2823 // rematerialized.
2824 const MachineOperand &UseMO = UseMI->getOperand(0);
2825 if (IsMarkedForRemat(UseMO) ||
2826 llvm::any_of(DefMI.operands(), IsMarkedForRemat))
2827 continue;
2828
2829 // Do not rematerialize an instruction if it uses registers that aren't
2830 // available at its use. This ensures that we are not extending any live
2831 // range while rematerializing.
2832 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
2833 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
2834 *DAG.TII))
2835 continue;
2836
2837 // Add the instruction to the rematerializable list.
2838 MarkedRegs.insert(Reg);
2839 RematRegs.emplace_back(&DefMI, UseMI, DAG, MIRegion);
2840 }
2841 }
2842
2843 return !RematRegs.empty();
2844 }
2845
// Builds the per-candidate liveness summary: for each region, records whether
// the candidate's defined register is live-in and/or live-out (Live is the
// union of both), and caches the lane mask with which the register is live
// out of its defining region.
2846 PreRARematStage::RematReg::RematReg(
2847 MachineInstr *DefMI, MachineInstr *UseMI, GCNScheduleDAGMILive &DAG,
2848 const DenseMap<MachineInstr *, unsigned> &MIRegion)
2849 : DefMI(DefMI), UseMI(UseMI), LiveIn(DAG.Regions.size()),
2850 LiveOut(DAG.Regions.size()), Live(DAG.Regions.size()),
2851 DefRegion(MIRegion.at(DefMI)), UseRegion(MIRegion.at(UseMI)) {
2852
2853 // Mark regions in which the rematerializable register is live.
2854 Register Reg = getReg();
2855 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2856 auto LiveInIt = DAG.LiveIns[I].find(Reg);
2857 if (LiveInIt != DAG.LiveIns[I].end())
2858 LiveIn.set(I);
2859 const auto &LiveOuts = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I);
2860 if (auto LiveOutIt = LiveOuts.find(Reg); LiveOutIt != LiveOuts.end())
2861 LiveOut.set(I);
2862 }
2863 Live |= LiveIn;
2864 Live |= LiveOut;
2865 Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(DefRegion).at(Reg);
2866 }
2867
// Returns true if rematerializing this register could help at least one
// target region, i.e. a region where the register is live and where saving
// it is beneficial according to that region's RP target.
2868 bool PreRARematStage::RematReg::maybeBeneficial(
2869 const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets) const {
2870 Register Reg = getReg();
2871 for (unsigned I : TargetRegions.set_bits()) {
2872 if (Live[I] && RPTargets[I].isSaveBeneficial(Reg))
2873 return true;
2874 }
2875 return false;
2876 }
2877
// Registers a freshly inserted rematerialized instruction with the scheduler
// state: if it now precedes the region's first instruction, the region's
// lower boundary is moved onto it; the MI is added to the slot index maps and
// a live interval is computed for the register it defines.
2878 void PreRARematStage::RematReg::insertMI(unsigned RegionIdx,
2879 MachineInstr *RematMI,
2880 GCNScheduleDAGMILive &DAG) const {
2881 RegionBoundaries &Bounds = DAG.Regions[RegionIdx];
2882 if (Bounds.first == std::next(MachineBasicBlock::iterator(RematMI)))
2883 Bounds.first = RematMI;
2884 DAG.LIS->InsertMachineInstrInMaps(*RematMI);
2885 DAG.LIS->createAndComputeVirtRegInterval(RematMI->getOperand(0).getReg());
2886 }
2887
// NOTE(review): the listing elides the signature (orig. lines 2888-2889) and
// two initializations (orig. lines 2891 and 2895, presumably the MBPI
// definition and MinFreq's initialization to its maximum value) -- confirm
// against the full source. From the members used, this is presumably the
// PreRARematStage FreqInfo constructor: it caches each region's block
// frequency, tracks the min/max over nonzero frequencies, and scales
// everything down when frequencies get large.
2890 assert(DAG.MLI && "MLI not defined in DAG");
2892 MachineBlockFrequencyInfo MBFI(MF, MBPI, *DAG.MLI);
2893
2894 const unsigned NumRegions = DAG.Regions.size();
2896 MaxFreq = 0;
2897 Regions.reserve(NumRegions);
2898 for (unsigned I = 0; I < NumRegions; ++I) {
2899 MachineBasicBlock *MBB = DAG.Regions[I].first->getParent();
2900 uint64_t BlockFreq = MBFI.getBlockFreq(MBB).getFrequency();
2901 Regions.push_back(BlockFreq);
// NOTE(review): a frequency that is both below MinFreq and above MaxFreq only
// updates MinFreq on this iteration because of the else-if; this presumably
// relies on MinFreq starting at its maximum (init elided from this listing) --
// verify MaxFreq ends up correct when only one region has a nonzero frequency.
2902 if (BlockFreq && BlockFreq < MinFreq)
2903 MinFreq = BlockFreq;
2904 else if (BlockFreq > MaxFreq)
2905 MaxFreq = BlockFreq;
2906 }
// A MinFreq of zero means no region had a known (nonzero) frequency.
2907 if (!MinFreq)
2908 return;
2909
2910 // Scale everything down if frequencies are high.
2911 if (MinFreq >= ScaleFactor * ScaleFactor) {
2912 for (uint64_t &Freq : Regions)
2913 Freq /= ScaleFactor;
2914 MinFreq /= ScaleFactor;
2915 MaxFreq /= ScaleFactor;
2916 }
2917 }
2918
// NOTE(review): the listing elides part of the parameter list (orig. line
// 2920, presumably the DAG reference used below) -- confirm against the full
// source. Caches the frequency-difference score and the register-pressure
// saving obtained by removing the candidate register's live lane mask.
2919 PreRARematStage::ScoredRemat::ScoredRemat(RematReg *Remat, const FreqInfo &Freq,
2921 : Remat(Remat), FreqDiff(getFreqDiff(Freq)) {
2922 RPSave.inc(Remat->getReg(), LaneBitmask::getNone(), Remat->Mask, DAG.MRI);
2923 }
2924
// Latency-oriented score component: (defining-region frequency) minus
// (using-region frequency). Larger is better, since moving a computation from
// a hot region into a cold one is cheap; unknown (zero) frequencies are
// substituted so as to produce the worst score, as explained below.
2925 int64_t PreRARematStage::ScoredRemat::getFreqDiff(const FreqInfo &Freq) const {
2926 // Get frequencies of defining and using regions. A rematerialization from the
2927 // least frequent region to the most frequent region will yield the greatest
2928 // latency penalty and therefore should get minimum score. Reciprocally, a
2929 // rematerialization in the other direction should get maximum score. Default
2930 // to values that will yield the worst possible score given known frequencies
2931 // in order to penalize rematerializations from or into regions whose
2932 // frequency is unknown.
2933 int64_t DefOrMin = std::max(Freq.Regions[Remat->DefRegion], Freq.MinFreq);
2934 int64_t UseOrMax = Freq.Regions[Remat->UseRegion];
2935 if (!UseOrMax)
2936 UseOrMax = Freq.MaxFreq;
2937 return DefOrMin - UseOrMax;
2938 }
2939
// Recomputes this candidate's score components against the current target
// regions. RegionImpact accumulates the per-region register benefit, doubled
// in regions where the register is an unused live-through (where the saving
// is guaranteed). When reducing spilling, MaxFreq additionally tracks the
// highest (penalty-adjusted) frequency among regions the remat impacts.
2940 void PreRARematStage::ScoredRemat::update(const BitVector &TargetRegions,
2941 ArrayRef<GCNRPTarget> RPTargets,
2942 const FreqInfo &FreqInfo,
2943 bool ReduceSpill) {
2944 MaxFreq = 0;
2945 RegionImpact = 0;
2946 for (unsigned I : TargetRegions.set_bits()) {
2947 if (!Remat->Live[I])
2948 continue;
2949
2950 // The rematerialization must contribute positively in at least one
2951 // register class with usage above the RP target for this region to
2952 // contribute to the score.
2953 const GCNRPTarget &RegionTarget = RPTargets[I];
2954 const unsigned NumRegsBenefit = RegionTarget.getNumRegsBenefit(RPSave);
2955 if (!NumRegsBenefit)
2956 continue;
2957
2958 bool UnusedLT = Remat->isUnusedLiveThrough(I);
2959
2960 // Regions in which RP is guaranteed to decrease have more weight.
2961 RegionImpact += (UnusedLT ? 2 : 1) * NumRegsBenefit;
2962
2963 if (ReduceSpill) {
2964 uint64_t Freq = FreqInfo.Regions[I];
2965 if (!UnusedLT) {
2966 // Apply a frequency penalty in regions in which we are not sure that RP
2967 // will decrease.
2968 Freq /= 2;
2969 }
2970 MaxFreq = std::max(MaxFreq, Freq);
2971 }
2972 }
2973 }
2974
// Performs the rematerialization: re-creates the defining instruction just
// before its single user (into a new register), substitutes the new register
// into the user, registers the new MI via insertMI, then removes the old
// register from every region's live-in/live-out sets while optimistically
// adjusting RP targets; regions where the saving is not guaranteed are queued
// in RecomputeRP for exact RP recomputation. Returns the new instruction.
// NOTE(review): the listing elides orig. line 2981, presumably the definition
// of NewReg (a fresh clone of Reg) -- confirm against the full source.
2975 MachineInstr *PreRARematStage::ScoredRemat::rematerialize(
2976 BitVector &RecomputeRP, SmallVectorImpl<GCNRPTarget> &RPTargets,
2977 GCNScheduleDAGMILive &DAG) const {
2978 const SIInstrInfo *TII = DAG.MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2979 MachineInstr &DefMI = *Remat->DefMI;
2980 Register Reg = DefMI.getOperand(0).getReg();
2982
2983 // Rematerialize the register in the region where it is used.
2984 MachineBasicBlock::iterator InsertPos = Remat->UseMI;
2985 TII->reMaterialize(*InsertPos->getParent(), InsertPos, NewReg, 0, DefMI);
2986 MachineInstr *RematMI = &*std::prev(InsertPos);
2987 Remat->UseMI->substituteRegister(Reg, NewReg, 0, *DAG.TRI);
2988 Remat->insertMI(Remat->UseRegion, RematMI, DAG);
2989
2990#ifdef EXPENSIVE_CHECKS
2991 // All uses are known to be available / live at the remat point. Thus,
2992 // the uses should already be live in to the using region.
2993 for (MachineOperand &MO : DefMI.operands()) {
2994 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2995 continue;
2996
2997 Register UseReg = MO.getReg();
2998 if (!UseReg.isVirtual())
2999 continue;
3000
3001 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
3002 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
3003 if (LI.hasSubRanges() && MO.getSubReg())
3004 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
3005
3006 LaneBitmask LiveInMask = DAG.LiveIns[Remat->UseRegion].at(UseReg);
3007 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
3008 // If this register has lanes not covered by the LiveIns, be sure they
3009 // do not map to any subrange. ref:
3010 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
3011 if (UncoveredLanes.any()) {
3012 assert(LI.hasSubRanges());
3013 for (LiveInterval::SubRange &SR : LI.subranges())
3014 assert((SR.LaneMask & UncoveredLanes).none());
3015 }
3016 }
3017#endif
3018
3019 // Remove the register from all regions where it is a live-in or live-out
3020 // and adjust RP targets. The save is guaranteed in regions in which the
3021 // register is live-through and unused but optimistic in all other regions
3022 // where the register is live.
3023 for (unsigned I : Remat->Live.set_bits()) {
3024 RPTargets[I].saveRP(RPSave);
3025 DAG.LiveIns[I].erase(Reg);
3026 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).erase(Reg);
3027 if (!Remat->isUnusedLiveThrough(I))
3028 RecomputeRP.set(I);
3029 }
3030
3031 return RematMI;
3032 }
3033
// Makes all rematerializations performed by this stage permanent by deleting
// each rematerialized register's original defining instruction from its
// defining region. Fixes the misspelled debug message ("Commiting").
3034 void PreRARematStage::commitRematerializations() const {
3035 REMAT_DEBUG(dbgs() << "Committing all rematerializations\n");
3036 for (const RollbackInfo &Rollback : Rollbacks)
3037 DAG.deleteMI(Rollback.Remat->DefRegion, Rollback.Remat->DefMI);
3038 }
3039
// Clears the "target region" flag for any of the given regions whose RP
// target is now satisfied, so later scoring ignores them.
3040 void PreRARematStage::unsetSatisfiedRPTargets(const BitVector &Regions) {
3041 for (unsigned I : Regions.set_bits()) {
3042 if (TargetRegions[I] && RPTargets[I].satisfied()) {
3043 REMAT_DEBUG(dbgs() << " [" << I << "] Target reached!\n");
3044 TargetRegions.reset(I);
3045 }
3046 }
3047 }
3048
// Refreshes the RP targets of the given regions from freshly computed real
// register pressure. Regions that were optimistically considered satisfied
// but still exceed their target get re-marked as target regions.
// \return true if at least one region was re-marked (our earlier RP
// estimation was too optimistic).
3049 bool PreRARematStage::updateAndVerifyRPTargets(const BitVector &Regions) {
3050 bool TooOptimistic = false;
3051 for (unsigned I : Regions.set_bits()) {
3052 GCNRPTarget &Target = RPTargets[I];
3053 Target.setRP(DAG.getRealRegPressure(I));
3054
3055 // Since we were optimistic in assessing RP decreases in these regions, we
3056 // may need to remark the target as a target region if RP didn't decrease
3057 // as expected.
3058 if (!TargetRegions[I] && !Target.satisfied()) {
3059 REMAT_DEBUG(dbgs() << " [" << I << "] Incorrect RP estimation\n");
3060 TooOptimistic = true;
3061 TargetRegions.set(I);
3062 }
3063 }
3064 return TooOptimistic;
3065 }
3066
3067 // Copied from MachineLICM
// Returns true if MI may be rematerialized: beyond the elided gate below, all
// physical-register uses must be constant physregs or ignorable uses.
3068 bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
// NOTE(review): the listing elides orig. line 3069, presumably the standard
// rematerializability check on MI (the condition guarding this early return,
// as in MachineLICM) -- confirm against the full source.
3070 return false;
3071
3072 for (const MachineOperand &MO : MI.all_uses()) {
3073 // We can't remat physreg uses, unless it is a constant or an ignorable
3074 // use (e.g. implicit exec use on VALU instructions)
3075 if (MO.getReg().isPhysical()) {
3076 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
3077 continue;
3078 return false;
3079 }
3080 }
3081
3082 return true;
3083 }
3084
// NOTE(review): the listing elides this function's signature (orig. line
// 3085); from the members used (TargetOcc, AchievedOcc, Rollbacks,
// RegionReverts) this is presumably PreRARematStage's stage finalization.
// It decides, after re-scheduling, whether to commit rematerializations,
// revert re-scheduling, and/or roll the rematerializations back. Also elided:
// orig. lines 3167-3168 (the per-register interval recomputation inside the
// RecomputeLI loop) and 3177-3178. Confirm all against the full source.
3086 // We consider that reducing spilling is always beneficial so we never
3087 // rollback rematerializations or revert scheduling in such cases.
3088 if (!TargetOcc)
3089 return;
3090
3091 // When increasing occupancy, it is possible that re-scheduling is not able to
3092 // achieve the target occupancy in all regions, in which case re-scheduling in
3093 // all regions should be reverted.
3094 if (DAG.MinOccupancy >= *TargetOcc) {
3095 commitRematerializations();
3096 return;
3097 }
3098
3099 // It is possible that re-scheduling lowers occupancy over the one achieved
3100 // just through rematerializations, in which case we revert re-scheduling in
3101 // all regions but do not roll back rematerializations.
3102 const bool ShouldRollbackRemats = AchievedOcc < *TargetOcc;
3103
3104 // When we both need to revert re-scheduling and rollback rematerializations,
3105 // restore rematerialized MIs' original state before reverting so that they
3106 // are treated as non-debug instructions by the revert logic.
3107 if (ShouldRollbackRemats) {
3108 for (const RollbackInfo &Rollback : Rollbacks) {
3109 const auto &[Remat, RematMI, RegMap] = Rollback;
3110 Remat->DefMI->setDesc(DAG.TII->get(RematMI->getOpcode()));
3111 for (const auto &[MOIdx, Reg] : RegMap)
3112 Remat->DefMI->getOperand(MOIdx).setReg(Reg);
3113 }
3114 }
3115
3116 // Revert re-scheduling in all affected regions.
3117 for (const auto &[RegionIdx, OrigMIOrder, MaxPressure] : RegionReverts) {
3118 REMAT_DEBUG(dbgs() << "Reverting re-scheduling in region " << RegionIdx
3119 << '\n');
3120 DAG.Pressure[RegionIdx] = MaxPressure;
3121 modifyRegionSchedule(RegionIdx, RegionBB[RegionIdx], OrigMIOrder);
3122 }
3123
3124 if (!ShouldRollbackRemats) {
3125 commitRematerializations();
3126 DAG.setTargetOccupancy(AchievedOcc);
3127 return;
3128 }
3129
3130 // Reset the target occupancy to what it was pre-rematerialization.
3131 DAG.setTargetOccupancy(*TargetOcc - 1);
3132
3133 // Finish rolling back rematerializations, then recompute pressure in all
3134 // affected regions.
3135 REMAT_DEBUG(dbgs() << "==== ROLLBACK ====\n");
3136 BitVector RecomputeRP(DAG.Regions.size());
3137 DenseSet<Register> RecomputeLI;
3138 for (const RollbackInfo &Rollback : Rollbacks) {
3139 const auto &[Remat, RematMI, RegMap] = Rollback;
3140
3141 // Switch back to using the original register and delete the
3142 // rematerialization.
3143 Register Reg = RematMI->getOperand(0).getReg();
3144 Register OriginalReg = Remat->DefMI->getOperand(0).getReg();
3145 Remat->UseMI->substituteRegister(Reg, OriginalReg, 0, *DAG.TRI);
3146 REMAT_DEBUG(dbgs() << '[' << Remat->UseRegion
3147 << "] Deleting rematerialization " << *RematMI);
3148 DAG.deleteMI(Remat->UseRegion, RematMI);
3149
3150 // Re-add the defined register as a live-in/live-out in all regions it used
3151 // to be one in.
3152 std::pair<Register, LaneBitmask> LiveReg(OriginalReg, Remat->Mask);
3153 for (unsigned I : Remat->LiveIn.set_bits())
3154 DAG.LiveIns[I].insert(LiveReg);
3155 for (unsigned I : Remat->LiveOut.set_bits())
3156 DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I).insert(LiveReg);
3157
3158 RecomputeRP |= Rollback.Remat->Live;
3159 // Regenerate intervals for all register operands of rematerialized MIs as
3160 // slot indices may have changed slightly from before re-scheduling.
3161 for (MachineOperand &MO : Rollback.Remat->DefMI->operands()) {
3162 if (MO.isReg() && MO.getReg().isVirtual())
3163 RecomputeLI.insert(MO.getReg());
3164 }
3165 }
3166 for (Register Reg : RecomputeLI) {
3169 }
3170#ifdef EXPENSIVE_CHECKS
3171 // In particular, we want to check for coherent MI/slot order in regions in
3172 // which reverts and/or rollbacks may have happened.
3173 MF.verify();
3174#endif
3175 for (unsigned I : RecomputeRP.set_bits())
3176 DAG.Pressure[I] = DAG.getRealRegPressure(I);
3177
3179 }
3180
// Erases MI from the function, fixing up the containing region's lower
// boundary if MI was its first instruction and removing the live interval of
// the register MI defines.
// NOTE(review): the listing elides orig. line 3187, presumably the removal of
// MI from the LIS slot index maps -- confirm against the full source.
3181 void GCNScheduleDAGMILive::deleteMI(unsigned RegionIdx, MachineInstr *MI) {
3182 // It's not possible for the deleted instruction to be upper region boundary
3183 // since we don't delete region terminators.
3184 if (Regions[RegionIdx].first == MI)
3185 Regions[RegionIdx].first = std::next(MachineBasicBlock::iterator(MI));
3186 LIS->removeInterval(MI->getOperand(0).getReg());
3188 MI->eraseFromParent();
3189 }
3190
// Sets the DAG's minimum occupancy and propagates it to the machine function
// info, raising the recorded occupancy when it is below the new target and
// clamping it down otherwise.
3191 void GCNScheduleDAGMILive::setTargetOccupancy(unsigned TargetOccupancy) {
3192 MinOccupancy = TargetOccupancy;
3193 if (MFI.getOccupancy() < TargetOccupancy)
3194 MFI.increaseOccupancy(MF, MinOccupancy);
3195 else
3196 MFI.limitOccupancy(MinOccupancy);
3197 }
3198
// NOTE(review): the listing elides the signature (orig. line 3199); this
// file's declaration index lists it as
// `static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)`. Returns true if any
// instruction in the DAG has an IGLP-mutation-only opcode.
3200 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
3201 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
3202 return SII->isIGLPMutationOnly(MI->getOpcode());
3203 });
3204 }
3205
3210
// NOTE(review): heavily elided by the listing (orig. lines 3211, 3216 and
// 3219 missing, including the signature); per this file's declaration index
// this is presumably GCNPostScheduleDAGMILive::schedule(). When IGLP
// instructions are present, the current mutations are stashed in
// SavedMutations before scheduling (the elided lines presumably install the
// IGLP mutation and invoke the base scheduler) -- confirm against the full
// source.
3212 HasIGLPInstrs = hasIGLPInstrs(this);
3213 if (HasIGLPInstrs) {
3214 SavedMutations.clear();
3215 SavedMutations.swap(Mutations);
3217 }
3218
3220 }
3221
// NOTE(review): the listing elides the signature (orig. line 3222) and line
// 3226; per this file's declaration index this is presumably
// GCNPostScheduleDAGMILive::finalizeSchedule(), which restores the mutations
// stashed in SavedMutations when IGLP instructions were present (the elided
// line presumably calls the base finalizeSchedule) -- confirm against the
// full source.
3223 if (HasIGLPInstrs)
3224 SavedMutations.swap(Mutations);
3225
3227 }
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrite mfma rewrite scheduling stage"), cl::init(false))
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static SUnit * pickOnlyChoice(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static constexpr unsigned SM(unsigned Version)
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
BitVector & reset()
Definition BitVector.h:411
BitVector & set()
Definition BitVector.h:370
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
size_type size() const
size - Returns the number of bits in this bitvector.
Definition BitVector.h:178
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
Models a register pressure target, allowing to evaluate and track register savings against that targe...
unsigned getNumRegsBenefit(const GCNRegPressure &SaveRP) const
Returns the benefit towards achieving the RP target that saving SaveRP represents,...
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx in block MBB to MIOrder.
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
void removeInterval(Register Reg)
Interval removal.
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
const MCInstrDesc & get(unsigned Opcode) const
Return the machine instruction descriptor that corresponds to the specified instruction opcode.
Definition MCInstrInfo.h:90
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate machine basic b...
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getblockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
bool verify(Pass *p=nullptr, const char *Banner=nullptr, raw_ostream *OS=nullptr, bool AbortOnError=true) const
Run the current MachineFunction through the machine code verifier, useful for debugger use.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isConstantPhysReg(MCRegister PhysReg) const
Returns true if PhysReg is unallocatable and constant throughout the function.
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
void finalizeGCNRegion() override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Simple wrapper around std::function<void(raw_ostream&)>.
Definition Printable.h:38
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
GCNRPTracker::LiveRegSet & getLiveRegsForRegionIdx(unsigned RegionIdx)
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void increaseOccupancy(const MachineFunction &MF, unsigned Limit)
void limitOccupancy(const MachineFunction &MF)
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes in to the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
const TargetRegisterInfo * TRI
Target processor register info.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool isReMaterializable(const MachineInstr &MI) const
Return true if the instruction would be materializable at a point in the containing function where al...
virtual bool isIgnorableUse(const MachineOperand &MO) const
Given MO is a PhysReg use return if it can be ignored for the purpose of instruction rematerializatio...
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
LaneBitmask getSubRegIndexLaneMask(unsigned SubIdx) const
Return a bitmask representing the parts of a register that are covered by SubIdx.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int32_t getMFMASrcCVDstAGPROp(uint32_t Opcode)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries i.e.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1917
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
creates a map MachineInstr -> LiveRegSet R - range of iterators on instructions After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
Execution frequency information required by scoring heuristics.
SmallVector< uint64_t > Regions
Per-region execution frequencies. 0 when unknown.
uint64_t MinFreq
Minimum and maximum observed frequencies.
FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG)