GCNSchedStrategy.cpp
1//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This contains a MachineSchedStrategy implementation for maximizing wave
11/// occupancy on GCN hardware.
12///
13/// This pass will apply multiple scheduling stages to the same function.
14/// Regions are first recorded in GCNScheduleDAGMILive::schedule. The actual
15/// entry point for the scheduling of those regions is
16/// GCNScheduleDAGMILive::runSchedStages.
17
18/// Generally, the reason for having multiple scheduling stages is to account
19/// for the kernel-wide effect of register usage on occupancy. Usually, only a
20/// few scheduling regions will have register pressure high enough to limit
21/// occupancy for the kernel, so constraints can be relaxed to improve ILP in
22/// other regions.
23///
24//===----------------------------------------------------------------------===//
25
26#include "GCNSchedStrategy.h"
27#include "AMDGPUIGroupLP.h"
28#include "GCNRegPressure.h"
31#include "llvm/ADT/STLExtras.h"
35#include "llvm/MC/LaneBitmask.h"
37
38#define DEBUG_TYPE "machine-scheduler"
39
40using namespace llvm;
41
43 "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
44 cl::desc("Disable unclustered high register pressure "
45 "reduction scheduling stage."),
46 cl::init(false));
47
49 "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
50 cl::desc("Disable clustered low occupancy "
51 "rescheduling for ILP scheduling stage."),
52 cl::init(false));
53
55 "amdgpu-schedule-metric-bias", cl::Hidden,
57 "Sets the bias that weights occupancy against latency. Set it to "
58 "100 to optimize for occupancy only."),
59 cl::init(10));
60
61static cl::opt<bool>
62 RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
63 cl::desc("Relax occupancy targets for kernels which are memory "
64 "bound (amdgpu-membound-threshold), or "
65 "Wave Limited (amdgpu-limit-wave-threshold)."),
66 cl::init(false));
67
69 "amdgpu-use-amdgpu-trackers", cl::Hidden,
70 cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
71 cl::init(false));
72
74 "amdgpu-scheduler-pending-queue-limit", cl::Hidden,
76 "Max (Available+Pending) size to inspect pending queue (0 disables)"),
77 cl::init(256));
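// Illustrative usage, not part of the pass: the knobs above are ordinary
// cl::opt flags, so they can be toggled on the llc command line when
// experimenting with the scheduler, e.g.
//   llc -mtriple=amdgcn -mcpu=gfx90a in.ll \
//       -amdgpu-schedule-metric-bias=50 \
//       -amdgpu-use-amdgpu-trackers \
//       -amdgpu-scheduler-pending-queue-limit=128
// The flag spellings come from the declarations above; the target, input file
// and values are placeholder assumptions.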
78
79#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
80#define DUMP_MAX_REG_PRESSURE
82 "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
83 cl::desc("Print a list of live registers along with their def/uses at the "
84 "point of maximum register pressure before scheduling."),
85 cl::init(false));
86
88 "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
89 cl::desc("Print a list of live registers along with their def/uses at the "
90 "point of maximum register pressure after scheduling."),
91 cl::init(false));
92#endif
93
95 "amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden,
96 cl::desc("Disable the MFMA form rewrite scheduling stage"), cl::init(true));
97
98const unsigned ScheduleMetrics::ScaleFactor = 100;
99
104
107
108 MF = &DAG->MF;
109
110 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
111
113 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
115 Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
116
118 // Set the initial TargetOccupancy to the maximum occupancy that we can
119 // achieve for this function. This effectively sets a lower bound on the
120 // 'Critical' register limits in the scheduler.
121 // Allow for lower occupancy targets if the kernel is wave limited or memory
122 // bound, and the relaxed occupancy feature is in use.
126 std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
127
128 if (!KnownExcessRP) {
129 VGPRCriticalLimit = std::min(
130 ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
132 } else {
133 // This is similar to the ST.getMaxNumVGPRs(TargetOccupancy) result, except
134 // that it returns a reasonably small number for targets with lots of VGPRs,
135 // such as GFX10 and GFX11.
136 LLVM_DEBUG(dbgs() << "Region is known to spill, use alternative "
137 "VGPRCriticalLimit calculation method.\n");
138 unsigned DynamicVGPRBlockSize = MFI.getDynamicVGPRBlockSize();
139 unsigned Granule =
140 AMDGPU::IsaInfo::getVGPRAllocGranule(&ST, DynamicVGPRBlockSize);
141 unsigned Addressable =
142 AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST, DynamicVGPRBlockSize);
143 unsigned VGPRBudget = alignDown(Addressable / TargetOccupancy, Granule);
144 VGPRBudget = std::max(VGPRBudget, Granule);
145 VGPRCriticalLimit = std::min(VGPRBudget, VGPRExcessLimit);
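    // Worked example for the alternative limit above (illustrative numbers
    // only, not tied to a specific subtarget): with 512 addressable VGPRs, an
    // allocation granule of 8 and TargetOccupancy = 6, the budget is
    //   alignDown(512 / 6, 8) = alignDown(85, 8) = 80,
    // which std::max keeps at no less than the granule and std::min caps at
    // VGPRExcessLimit.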
146 }
147
148 // Subtract error margin and bias from register limits and avoid overflow.
153 LLVM_DEBUG(dbgs() << "VGPRCriticalLimit = " << VGPRCriticalLimit
154 << ", VGPRExcessLimit = " << VGPRExcessLimit
155 << ", SGPRCriticalLimit = " << SGPRCriticalLimit
156 << ", SGPRExcessLimit = " << SGPRExcessLimit << "\n\n");
157}
158
159/// Checks whether \p SU can use the cached DAG pressure diffs to compute the
160/// current register pressure.
161///
162/// This works for the common case, but it has a few exceptions that have been
163/// observed through trial and error:
164/// - Explicit physical register operands
165/// - Subregister definitions
166///
167/// In both of those cases, PressureDiff doesn't represent the actual pressure,
168/// and querying LiveIntervals through the RegPressureTracker is needed to get
169/// an accurate value.
170///
171/// We should eventually only use PressureDiff for maximum performance, but this
172/// already allows 80% of SUs to take the fast path without changing scheduling
173/// at all. Further changes would either change scheduling, or require a lot
174/// more logic to recover an accurate pressure estimate from the PressureDiffs.
175static bool canUsePressureDiffs(const SUnit &SU) {
176 if (!SU.isInstr())
177 return false;
178
179 // Cannot use pressure diffs for subregister defs or with physregs, it's
180 // imprecise in both cases.
181 for (const auto &Op : SU.getInstr()->operands()) {
182 if (!Op.isReg() || Op.isImplicit())
183 continue;
184 if (Op.getReg().isPhysical() ||
185 (Op.isDef() && Op.getSubReg() != AMDGPU::NoSubRegister))
186 return false;
187 }
188 return true;
189}
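// Illustrative case for the subregister exception above (hypothetical MIR):
//   %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
// The cached PressureDiff cannot tell whether the other lanes of %0 are
// already live here, so the real pressure change may differ from the recorded
// one; such SUs fall back to the RegPressureTracker path instead.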
190
192 bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU,
193 std::vector<unsigned> &Pressure, std::vector<unsigned> &MaxPressure,
194 GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker,
195 ScheduleDAGMI *DAG, const SIRegisterInfo *SRI) {
196 // getDownwardPressure() and getUpwardPressure() make temporary changes to
197 // the tracker, so we need to pass those functions a non-const reference to it.
198 RegPressureTracker &TempTracker = const_cast<RegPressureTracker &>(RPTracker);
199 if (!GCNTrackers) {
200 AtTop
201 ? TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure)
202 : TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
203
204 return;
205 }
206
207 // GCNTrackers
208 Pressure.resize(4, 0);
209 MachineInstr *MI = SU->getInstr();
210 GCNRegPressure NewPressure;
211 if (AtTop) {
212 GCNDownwardRPTracker TempDownwardTracker(DownwardTracker);
213 NewPressure = TempDownwardTracker.bumpDownwardPressure(MI, SRI);
214 } else {
215 GCNUpwardRPTracker TempUpwardTracker(UpwardTracker);
216 TempUpwardTracker.recede(*MI);
217 NewPressure = TempUpwardTracker.getPressure();
218 }
219 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = NewPressure.getSGPRNum();
220 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] =
221 NewPressure.getArchVGPRNum();
222 Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = NewPressure.getAGPRNum();
223}
224
226 bool AtTop,
227 const RegPressureTracker &RPTracker,
228 const SIRegisterInfo *SRI,
229 unsigned SGPRPressure,
230 unsigned VGPRPressure, bool IsBottomUp) {
231 Cand.SU = SU;
232 Cand.AtTop = AtTop;
233
234 if (!DAG->isTrackingPressure())
235 return;
236
237 Pressure.clear();
238 MaxPressure.clear();
239
240 // We try to use the cached PressureDiffs in the ScheduleDAG whenever
241 // possible over querying the RegPressureTracker.
242 //
243 // RegPressureTracker will make a lot of LIS queries which are very
244 // expensive, it is considered a slow function in this context.
245 //
246 // PressureDiffs are precomputed and cached, and getPressureDiff is just a
247 // trivial lookup into an array. It is pretty much free.
248 //
249 // In EXPENSIVE_CHECKS, we always query RPTracker to verify the results of
250 // PressureDiffs.
251 if (AtTop || !canUsePressureDiffs(*SU) || GCNTrackers) {
252 getRegisterPressures(AtTop, RPTracker, SU, Pressure, MaxPressure,
254 } else {
255 // Reserve 4 slots.
256 Pressure.resize(4, 0);
257 Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
258 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
259
260 for (const auto &Diff : DAG->getPressureDiff(SU)) {
261 if (!Diff.isValid())
262 continue;
263 // PressureDiffs are always computed bottom-up, so if we're working top-down
264 // we need to invert the sign.
265 Pressure[Diff.getPSet()] +=
266 (IsBottomUp ? Diff.getUnitInc() : -Diff.getUnitInc());
267 }
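    // For instance, a diff recorded as +1 for VGPR_32 contributes +1 when
    // scheduling bottom-up but -1 when scheduling top-down, which is what the
    // conditional above implements.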
268
269#ifdef EXPENSIVE_CHECKS
270 std::vector<unsigned> CheckPressure, CheckMaxPressure;
271 getRegisterPressures(AtTop, RPTracker, SU, CheckPressure, CheckMaxPressure,
273 if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
274 CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
275 Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
276 CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
277 errs() << "Register Pressure is inaccurate when calculated through "
278 "PressureDiff\n"
279 << "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
280 << ", expected "
281 << CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
282 << "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
283 << ", expected "
284 << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
285 report_fatal_error("inaccurate register pressure calculation");
286 }
287#endif
288 }
289
290 unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
291 unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
292
293 // If two instructions increase the pressure of different register sets
294 // by the same amount, the generic scheduler will prefer to schedule the
295 // instruction that increases the set with the least amount of registers,
296 // which in our case would be SGPRs. This is rarely what we want, so
297 // when we report excess/critical register pressure, we do it either
298 // only for VGPRs or only for SGPRs.
299
300 // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
301 const unsigned MaxVGPRPressureInc = 16;
302 bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
303 bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
304
305 // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
306 // to increase the likelihood we don't go over the limits. We should improve
307 // the analysis to look through dependencies to find the path with the least
308 // register pressure.
309
310 // We only need to update the RPDelta for instructions that increase register
311 // pressure. Instructions that decrease or keep reg pressure the same will be
312 // marked as RegExcess in tryCandidate() when they are compared with
313 // instructions that increase the register pressure.
314 if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
315 HasHighPressure = true;
316 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
317 Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
318 }
319
320 if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
321 HasHighPressure = true;
322 Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
323 Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
324 }
325
326 // Register pressure is considered 'CRITICAL' if it is approaching a value
327 // that would reduce the wave occupancy for the execution unit. When
328 // register pressure is 'CRITICAL', increases in SGPR and VGPR pressure
329 // have the same cost, so we don't need to prefer one over the other.
330
331 int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
332 int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
333
334 if (SGPRDelta >= 0 || VGPRDelta >= 0) {
335 HasHighPressure = true;
336 if (SGPRDelta > VGPRDelta) {
337 Cand.RPDelta.CriticalMax =
338 PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
339 Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
340 } else {
341 Cand.RPDelta.CriticalMax =
342 PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
343 Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
344 }
345 }
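  // For instance (hypothetical values): with SGPRCriticalLimit = 80 and
  // VGPRCriticalLimit = 120, a candidate raising SGPR pressure to 84 while
  // VGPR pressure stays at 110 gives SGPRDelta = 4 and VGPRDelta = -10, so
  // CriticalMax is charged to SReg_32 with a unit increment of 4.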
346}
347
349 const TargetSchedModel *SchedModel) {
350 bool HasBufferedModel =
351 SchedModel->hasInstrSchedModel() && SchedModel->getMicroOpBufferSize();
352 unsigned Combined = Zone.Available.size() + Zone.Pending.size();
353 return Combined <= PendingQueueLimit && HasBufferedModel;
354}
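// Illustrative numbers: with the default -amdgpu-scheduler-pending-queue-limit
// of 256, a zone holding 200 available and 80 pending SUnits (280 combined)
// skips the pending queue scan, while 150 + 80 = 230 would allow it, provided
// the subtarget's scheduling model is buffered (nonzero micro-op buffer size).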
355
357 const TargetSchedModel *SchedModel) {
358 // pickOnlyChoice() releases pending instructions and checks for new hazards.
359 SUnit *OnlyChoice = Zone.pickOnlyChoice();
360 if (!shouldCheckPending(Zone, SchedModel) || Zone.Pending.empty())
361 return OnlyChoice;
362
363 return nullptr;
364}
365
367 const SchedCandidate &Preferred) {
368 LLVM_DEBUG({
369 dbgs() << "Prefer:\t\t";
370 DAG->dumpNode(*Preferred.SU);
371
372 if (Current.SU) {
373 dbgs() << "Not:\t";
374 DAG->dumpNode(*Current.SU);
375 }
376
377 dbgs() << "Reason:\t\t";
378 traceCandidate(Preferred);
379 });
380}
381
382// This function is mostly cut and pasted from
383// GenericScheduler::pickNodeFromQueue()
385 const CandPolicy &ZonePolicy,
386 const RegPressureTracker &RPTracker,
387 SchedCandidate &Cand, bool &IsPending,
388 bool IsBottomUp) {
389 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
391 unsigned SGPRPressure = 0;
392 unsigned VGPRPressure = 0;
393 IsPending = false;
394 if (DAG->isTrackingPressure()) {
395 if (!GCNTrackers) {
396 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
397 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
398 } else {
399 GCNRPTracker *T = IsBottomUp
400 ? static_cast<GCNRPTracker *>(&UpwardTracker)
401 : static_cast<GCNRPTracker *>(&DownwardTracker);
402 SGPRPressure = T->getPressure().getSGPRNum();
403 VGPRPressure = T->getPressure().getArchVGPRNum();
404 }
405 }
406 LLVM_DEBUG(dbgs() << "Available Q:\n");
407 ReadyQueue &AQ = Zone.Available;
408 for (SUnit *SU : AQ) {
409
410 SchedCandidate TryCand(ZonePolicy);
411 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
412 VGPRPressure, IsBottomUp);
413 // Pass SchedBoundary only when comparing nodes from the same boundary.
414 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
415 tryCandidate(Cand, TryCand, ZoneArg);
416 if (TryCand.Reason != NoCand) {
417 // Initialize resource delta if needed in case future heuristics query it.
418 if (TryCand.ResDelta == SchedResourceDelta())
419 TryCand.initResourceDelta(Zone.DAG, SchedModel);
420 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
421 Cand.setBest(TryCand);
422 } else {
423 printCandidateDecision(TryCand, Cand);
424 }
425 }
426
427 if (!shouldCheckPending(Zone, SchedModel))
428 return;
429
430 LLVM_DEBUG(dbgs() << "Pending Q:\n");
431 ReadyQueue &PQ = Zone.Pending;
432 for (SUnit *SU : PQ) {
433
434 SchedCandidate TryCand(ZonePolicy);
435 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
436 VGPRPressure, IsBottomUp);
437 // Pass SchedBoundary only when comparing nodes from the same boundary.
438 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
439 tryPendingCandidate(Cand, TryCand, ZoneArg);
440 if (TryCand.Reason != NoCand) {
441 // Initialize resource delta if needed in case future heuristics query it.
442 if (TryCand.ResDelta == SchedResourceDelta())
443 TryCand.initResourceDelta(Zone.DAG, SchedModel);
444 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
445 IsPending = true;
446 Cand.setBest(TryCand);
447 } else {
448 printCandidateDecision(TryCand, Cand);
449 }
450 }
451}
452
453// This function is mostly cut and pasted from
454// GenericScheduler::pickNodeBidirectional()
456 bool &PickedPending) {
457 // Schedule as far as possible in the direction of no choice. This is most
458 // efficient, but also provides the best heuristics for CriticalPSets.
459 if (SUnit *SU = pickOnlyChoice(Bot, SchedModel)) {
460 IsTopNode = false;
461 return SU;
462 }
463 if (SUnit *SU = pickOnlyChoice(Top, SchedModel)) {
464 IsTopNode = true;
465 return SU;
466 }
467 // Set the bottom-up policy based on the state of the current bottom zone
468 // and the instructions outside the zone, including the top zone.
469 CandPolicy BotPolicy;
470 setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
471 // Set the top-down policy based on the state of the current top zone and
472 // the instructions outside the zone, including the bottom zone.
473 CandPolicy TopPolicy;
474 setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
475
476 bool BotPending = false;
477 // See if BotCand is still valid (because we previously scheduled from Top).
478 LLVM_DEBUG(dbgs() << "Picking from Bot:\n");
479 if (!BotCand.isValid() || BotCand.SU->isScheduled ||
480 BotCand.Policy != BotPolicy) {
481 BotCand.reset(CandPolicy());
482 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand,
483 BotPending,
484 /*IsBottomUp=*/true);
485 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
486 } else {
488#ifndef NDEBUG
489 if (VerifyScheduling) {
490 SchedCandidate TCand;
491 TCand.reset(CandPolicy());
492 pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand,
493 BotPending,
494 /*IsBottomUp=*/true);
495 assert(TCand.SU == BotCand.SU &&
496 "Last pick result should correspond to re-picking right now");
497 }
498#endif
499 }
500
501 bool TopPending = false;
502 // Check if the top Q has a better candidate.
503 LLVM_DEBUG(dbgs() << "Picking from Top:\n");
504 if (!TopCand.isValid() || TopCand.SU->isScheduled ||
505 TopCand.Policy != TopPolicy) {
506 TopCand.reset(CandPolicy());
507 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand,
508 TopPending,
509 /*IsBottomUp=*/false);
510 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
511 } else {
513#ifndef NDEBUG
514 if (VerifyScheduling) {
515 SchedCandidate TCand;
516 TCand.reset(CandPolicy());
517 pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand,
518 TopPending,
519 /*IsBottomUp=*/false);
520 assert(TCand.SU == TopCand.SU &&
521 "Last pick result should correspond to re-picking right now");
522 }
523#endif
524 }
525
526 // Pick best from BotCand and TopCand.
527 LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
528 dbgs() << "Bot Cand: "; traceCandidate(BotCand););
529 SchedCandidate Cand = BotPending ? TopCand : BotCand;
530 SchedCandidate TryCand = BotPending ? BotCand : TopCand;
531 PickedPending = BotPending && TopPending;
532
533 TryCand.Reason = NoCand;
534 if (BotPending || TopPending) {
535 PickedPending |= tryPendingCandidate(Cand, TryCand, nullptr);
536 } else {
537 tryCandidate(Cand, TryCand, nullptr);
538 }
539
540 if (TryCand.Reason != NoCand) {
541 Cand.setBest(TryCand);
542 }
543
544 LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
545
546 IsTopNode = Cand.AtTop;
547 return Cand.SU;
548}
549
550// This function is mostly cut and pasted from
551// GenericScheduler::pickNode()
553 if (DAG->top() == DAG->bottom()) {
554 assert(Top.Available.empty() && Top.Pending.empty() &&
555 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
556 return nullptr;
557 }
558 bool PickedPending;
559 SUnit *SU;
560 do {
561 PickedPending = false;
562 if (RegionPolicy.OnlyTopDown) {
564 if (!SU) {
565 CandPolicy NoPolicy;
566 TopCand.reset(NoPolicy);
567 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
568 PickedPending,
569 /*IsBottomUp=*/false);
570 assert(TopCand.Reason != NoCand && "failed to find a candidate");
571 SU = TopCand.SU;
572 }
573 IsTopNode = true;
574 } else if (RegionPolicy.OnlyBottomUp) {
576 if (!SU) {
577 CandPolicy NoPolicy;
578 BotCand.reset(NoPolicy);
579 pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand,
580 PickedPending,
581 /*IsBottomUp=*/true);
582 assert(BotCand.Reason != NoCand && "failed to find a candidate");
583 SU = BotCand.SU;
584 }
585 IsTopNode = false;
586 } else {
587 SU = pickNodeBidirectional(IsTopNode, PickedPending);
588 }
589 } while (SU->isScheduled);
590
591 if (PickedPending) {
592 unsigned ReadyCycle = IsTopNode ? SU->TopReadyCycle : SU->BotReadyCycle;
593 SchedBoundary &Zone = IsTopNode ? Top : Bot;
594 unsigned CurrentCycle = Zone.getCurrCycle();
595 if (ReadyCycle > CurrentCycle)
596 Zone.bumpCycle(ReadyCycle);
597
598 // FIXME: checkHazard() doesn't give information about which cycle the
599 // hazard will resolve so just keep bumping the cycle by 1. This could be
600 // made more efficient if checkHazard() returned more details.
601 while (Zone.checkHazard(SU))
602 Zone.bumpCycle(Zone.getCurrCycle() + 1);
603
604 Zone.releasePending();
605 }
606
607 if (SU->isTopReady())
608 Top.removeReady(SU);
609 if (SU->isBottomReady())
610 Bot.removeReady(SU);
611
612 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
613 << *SU->getInstr());
614 return SU;
615}
616
617void GCNSchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
618 if (GCNTrackers) {
619 MachineInstr *MI = SU->getInstr();
620 IsTopNode ? (void)DownwardTracker.advance(MI, false)
621 : UpwardTracker.recede(*MI);
622 }
623
624 return GenericScheduler::schedNode(SU, IsTopNode);
625}
626
631
634 if (!CurrentStage)
635 CurrentStage = SchedStages.begin();
636 else
637 CurrentStage++;
638
639 return CurrentStage != SchedStages.end();
640}
641
644 return std::next(CurrentStage) != SchedStages.end();
645}
646
648 assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
649 return *std::next(CurrentStage);
650}
651
653 SchedCandidate &TryCand,
654 SchedBoundary *Zone) const {
655 // Initialize the candidate if needed.
656 if (!Cand.isValid()) {
657 TryCand.Reason = NodeOrder;
658 return true;
659 }
660
661 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
662 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
663 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
664 return TryCand.Reason != NoCand;
665
666 // Avoid exceeding the target's limit.
667 if (DAG->isTrackingPressure() &&
668 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
669 RegExcess, TRI, DAG->MF))
670 return TryCand.Reason != NoCand;
671
672 // Avoid increasing the max critical pressure in the scheduled region.
673 if (DAG->isTrackingPressure() &&
675 TryCand, Cand, RegCritical, TRI, DAG->MF))
676 return TryCand.Reason != NoCand;
677
678 bool SameBoundary = Zone != nullptr;
679 if (SameBoundary) {
682 TryCand, Cand, ResourceReduce))
683 return TryCand.Reason != NoCand;
685 Cand.ResDelta.DemandedResources, TryCand, Cand,
687 return TryCand.Reason != NoCand;
688 }
689
690 return false;
691}
692
704
709
711 SchedCandidate &TryCand,
712 SchedBoundary *Zone) const {
713 // Initialize the candidate if needed.
714 if (!Cand.isValid()) {
715 TryCand.Reason = NodeOrder;
716 return true;
717 }
718
719 // Avoid spilling by exceeding the register limit.
720 if (DAG->isTrackingPressure() &&
721 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
722 RegExcess, TRI, DAG->MF))
723 return TryCand.Reason != NoCand;
724
725 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
726 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
727 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
728 return TryCand.Reason != NoCand;
729
730 bool SameBoundary = Zone != nullptr;
731 if (SameBoundary) {
732 // Prioritize instructions that read unbuffered resources by stall cycles.
733 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
734 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
735 return TryCand.Reason != NoCand;
736
737 // Avoid critical resource consumption and balance the schedule.
740 TryCand, Cand, ResourceReduce))
741 return TryCand.Reason != NoCand;
743 Cand.ResDelta.DemandedResources, TryCand, Cand,
745 return TryCand.Reason != NoCand;
746
747 // Unconditionally try to reduce latency.
748 if (tryLatency(TryCand, Cand, *Zone))
749 return TryCand.Reason != NoCand;
750
751 // Weak edges are for clustering and other constraints.
752 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
753 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
754 return TryCand.Reason != NoCand;
755 }
756
757 // Keep clustered nodes together to encourage downstream peephole
758 // optimizations which may reduce resource requirements.
759 //
760 // This is a best effort to set things up for a post-RA pass. Optimizations
761 // like generating loads of multiple registers should ideally be done within
762 // the scheduler pass by combining the loads during DAG postprocessing.
763 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
764 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
765 bool CandIsClusterSucc =
766 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
767 bool TryCandIsClusterSucc =
768 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
769 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
770 Cluster))
771 return TryCand.Reason != NoCand;
772
773 // Avoid increasing the max critical pressure in the scheduled region.
774 if (DAG->isTrackingPressure() &&
776 TryCand, Cand, RegCritical, TRI, DAG->MF))
777 return TryCand.Reason != NoCand;
778
779 // Avoid increasing the max pressure of the entire region.
780 if (DAG->isTrackingPressure() &&
781 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
782 Cand, RegMax, TRI, DAG->MF))
783 return TryCand.Reason != NoCand;
784
785 if (SameBoundary) {
786 // Fall through to original instruction order.
787 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
788 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
789 TryCand.Reason = NodeOrder;
790 return true;
791 }
792 }
793 return false;
794}
795
801
802/// GCNMaxMemoryClauseSchedStrategy tries its best to clause memory instructions
803/// as much as possible. This is achieved by:
804/// 1. Prioritizing clustered operations before the stall latency heuristic.
805/// 2. Prioritizing long-latency loads before the stall latency heuristic.
806///
807/// \param Cand provides the policy and current best candidate.
808/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
809/// \param Zone describes the scheduled zone that we are extending, or nullptr
810/// if Cand is from a different zone than TryCand.
811/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
813 SchedCandidate &TryCand,
814 SchedBoundary *Zone) const {
815 // Initialize the candidate if needed.
816 if (!Cand.isValid()) {
817 TryCand.Reason = NodeOrder;
818 return true;
819 }
820
821 // Bias PhysReg defs and copies toward their uses and definitions, respectively.
822 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
823 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
824 return TryCand.Reason != NoCand;
825
826 if (DAG->isTrackingPressure()) {
827 // Avoid exceeding the target's limit.
828 if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
829 RegExcess, TRI, DAG->MF))
830 return TryCand.Reason != NoCand;
831
832 // Avoid increasing the max critical pressure in the scheduled region.
834 TryCand, Cand, RegCritical, TRI, DAG->MF))
835 return TryCand.Reason != NoCand;
836 }
837
838 // MaxMemoryClause-specific: We prioritize clustered instructions as we would
839 // get more benefit from clausing these memory instructions.
840 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
841 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
842 bool CandIsClusterSucc =
843 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
844 bool TryCandIsClusterSucc =
845 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
846 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
847 Cluster))
848 return TryCand.Reason != NoCand;
849
850 // We only compare a subset of features when comparing nodes between the
851 // Top and Bottom boundaries. Some properties are simply incomparable; in many
852 // other instances we should only override the other boundary if something
853 // is a clearly good pick on one boundary. Skip heuristics that are more
854 // "tie-breaking" in nature.
855 bool SameBoundary = Zone != nullptr;
856 if (SameBoundary) {
857 // For loops that are acyclic path limited, aggressively schedule for
858 // latency. Within a single cycle, whenever CurrMOps > 0, allow normal
859 // heuristics to take precedence.
860 if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
861 tryLatency(TryCand, Cand, *Zone))
862 return TryCand.Reason != NoCand;
863
864 // MaxMemoryClause-specific: Prioritize long latency memory load
865 // instructions in top-bottom order to hide more latency. The mayLoad check
866 // is used to exclude store-like instructions, which we do not want to
867 // schedule too early.
868 bool TryMayLoad =
869 TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
870 bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
871
872 if (TryMayLoad || CandMayLoad) {
873 bool TryLongLatency =
874 TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
875 bool CandLongLatency =
876 10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
877
878 if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
879 Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
880 Cand, Stall))
881 return TryCand.Reason != NoCand;
882 }
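    // Example with hypothetical latencies: at the top boundary, a load with
    // SU->Latency = 120 competing against a candidate of latency 8 satisfies
    // 120 > 10 * 8, so it counts as long-latency and wins the Stall
    // comparison above.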
883 // Prioritize instructions that read unbuffered resources by stall cycles.
884 if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
885 Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
886 return TryCand.Reason != NoCand;
887 }
888
889 if (SameBoundary) {
890 // Weak edges are for clustering and other constraints.
891 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
892 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
893 return TryCand.Reason != NoCand;
894 }
895
896 // Avoid increasing the max pressure of the entire region.
897 if (DAG->isTrackingPressure() &&
898 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
899 Cand, RegMax, TRI, DAG->MF))
900 return TryCand.Reason != NoCand;
901
902 if (SameBoundary) {
903 // Avoid critical resource consumption and balance the schedule.
906 TryCand, Cand, ResourceReduce))
907 return TryCand.Reason != NoCand;
909 Cand.ResDelta.DemandedResources, TryCand, Cand,
911 return TryCand.Reason != NoCand;
912
913 // Avoid serializing long latency dependence chains.
914 // For acyclic path limited loops, latency was already checked above.
915 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
916 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
917 return TryCand.Reason != NoCand;
918
919 // Fall through to original instruction order.
920 if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
921 assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
922 TryCand.Reason = NodeOrder;
923 return true;
924 }
925 }
926
927 return false;
928}
929
931 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
932 : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
933 MFI(*MF.getInfo<SIMachineFunctionInfo>()),
934 StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy),
935 RegionLiveOuts(this, /*IsLiveOut=*/true) {
936
937 // We want regions with a single MI to be scheduled so that we can reason
938 // about them correctly during scheduling stages that move MIs between regions
939 // (e.g., rematerialization).
941 LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
942 if (RelaxedOcc) {
943 MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
944 if (MinOccupancy != StartingOccupancy)
945 LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
946 << ".\n");
947 }
948}
949
950std::unique_ptr<GCNSchedStage>
951GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
952 switch (SchedStageID) {
954 return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
956 return std::make_unique<RewriteMFMAFormStage>(SchedStageID, *this);
958 return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
960 return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
962 return std::make_unique<PreRARematStage>(SchedStageID, *this);
964 return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
966 return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
967 *this);
968 }
969
970 llvm_unreachable("Unknown SchedStageID.");
971}
972
974 // Collect all scheduling regions. The actual scheduling is performed in
975 // GCNScheduleDAGMILive::finalizeSchedule.
976 Regions.push_back(std::pair(RegionBegin, RegionEnd));
977}
978
980GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const {
982 RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second,
983 &LiveIns[RegionIdx]);
984 return RPTracker.moveMaxPressure();
985}
986
988 MachineBasicBlock::iterator RegionEnd) {
989 assert(RegionBegin != RegionEnd && "Region must not be empty");
990 return &*skipDebugInstructionsBackward(std::prev(RegionEnd), RegionBegin);
991}
992
993void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
994 const MachineBasicBlock *MBB) {
995 GCNDownwardRPTracker RPTracker(*LIS);
996
997 // If the block has only one successor, then the live-ins of that successor
998 // are the live-outs of the current block. We can reuse the calculated live
999 // set if the successor will be sent to scheduling after the current block.
1000
1001 // However, due to a bug in LiveInterval analysis it may happen that two
1002 // predecessors of the same successor block have different lane bitmasks for
1003 // a live-out register. Work around that by sticking to a one-to-one
1004 // relationship, i.e. one predecessor with one successor block.
1005 const MachineBasicBlock *OnlySucc = nullptr;
1006 if (MBB->succ_size() == 1) {
1007 auto *Candidate = *MBB->succ_begin();
1008 if (!Candidate->empty() && Candidate->pred_size() == 1) {
1009 SlotIndexes *Ind = LIS->getSlotIndexes();
1010 if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
1011 OnlySucc = Candidate;
1012 }
1013 }
1014
1015 // Scheduler sends regions from the end of the block upwards.
1016 size_t CurRegion = RegionIdx;
1017 for (size_t E = Regions.size(); CurRegion != E; ++CurRegion)
1018 if (Regions[CurRegion].first->getParent() != MBB)
1019 break;
1020 --CurRegion;
1021
1022 auto I = MBB->begin();
1023 auto LiveInIt = MBBLiveIns.find(MBB);
1024 auto &Rgn = Regions[CurRegion];
1025 auto *NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1026 if (LiveInIt != MBBLiveIns.end()) {
1027 auto LiveIn = std::move(LiveInIt->second);
1028 RPTracker.reset(*MBB->begin(), &LiveIn);
1029 MBBLiveIns.erase(LiveInIt);
1030 } else {
1031 I = Rgn.first;
1032 auto LRS = BBLiveInMap.lookup(NonDbgMI);
1033#ifdef EXPENSIVE_CHECKS
1034 assert(isEqual(getLiveRegsBefore(*NonDbgMI, *LIS), LRS));
1035#endif
1036 RPTracker.reset(*I, &LRS);
1037 }
1038
1039 for (;;) {
1040 I = RPTracker.getNext();
1041
1042 if (Regions[CurRegion].first == I || NonDbgMI == I) {
1043 LiveIns[CurRegion] = RPTracker.getLiveRegs();
1044 RPTracker.clearMaxPressure();
1045 }
1046
1047 if (Regions[CurRegion].second == I) {
1048 Pressure[CurRegion] = RPTracker.moveMaxPressure();
1049 if (CurRegion-- == RegionIdx)
1050 break;
1051 auto &Rgn = Regions[CurRegion];
1052 NonDbgMI = &*skipDebugInstructionsForward(Rgn.first, Rgn.second);
1053 }
1054 RPTracker.advanceToNext();
1055 RPTracker.advanceBeforeNext();
1056 }
1057
1058 if (OnlySucc) {
1059 if (I != MBB->end()) {
1060 RPTracker.advanceToNext();
1061 RPTracker.advance(MBB->end());
1062 }
1063 RPTracker.advanceBeforeNext();
1064 MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs();
1065 }
1066}
1067
1069GCNScheduleDAGMILive::getRegionLiveInMap() const {
1070 assert(!Regions.empty());
1071 std::vector<MachineInstr *> RegionFirstMIs;
1072 RegionFirstMIs.reserve(Regions.size());
1073 for (auto &[RegionBegin, RegionEnd] : reverse(Regions))
1074 RegionFirstMIs.push_back(
1076
1077 return getLiveRegMap(RegionFirstMIs, /*After=*/false, *LIS);
1078}
1079
1081GCNScheduleDAGMILive::getRegionLiveOutMap() const {
1082 assert(!Regions.empty());
1083 std::vector<MachineInstr *> RegionLastMIs;
1084 RegionLastMIs.reserve(Regions.size());
1085 for (auto &[RegionBegin, RegionEnd] : reverse(Regions)) {
1086 // Skip empty regions.
1087 if (RegionBegin == RegionEnd)
1088 continue;
1089 RegionLastMIs.push_back(getLastMIForRegion(RegionBegin, RegionEnd));
1090 }
1091 return getLiveRegMap(RegionLastMIs, /*After=*/true, *LIS);
1092}
1093
1095 IdxToInstruction.clear();
1096
1097 RegionLiveRegMap =
1098 IsLiveOut ? DAG->getRegionLiveOutMap() : DAG->getRegionLiveInMap();
1099 for (unsigned I = 0; I < DAG->Regions.size(); I++) {
1100 auto &[RegionBegin, RegionEnd] = DAG->Regions[I];
1101 // Skip empty regions.
1102 if (RegionBegin == RegionEnd)
1103 continue;
1104 MachineInstr *RegionKey =
1105 IsLiveOut ? getLastMIForRegion(RegionBegin, RegionEnd) : &*RegionBegin;
1106 IdxToInstruction[I] = RegionKey;
1107 }
1108}
1109
1111 // Start actual scheduling here. This function is called by the base
1112 // MachineScheduler after all regions have been recorded by
1113 // GCNScheduleDAGMILive::schedule().
1114 LiveIns.resize(Regions.size());
1115 Pressure.resize(Regions.size());
1116 RegionsWithHighRP.resize(Regions.size());
1117 RegionsWithExcessRP.resize(Regions.size());
1118 RegionsWithIGLPInstrs.resize(Regions.size());
1119 RegionsWithHighRP.reset();
1120 RegionsWithExcessRP.reset();
1121 RegionsWithIGLPInstrs.reset();
1122
1123 runSchedStages();
1124}
1125
1126void GCNScheduleDAGMILive::runSchedStages() {
1127 LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
1128
1129 if (!Regions.empty()) {
1130 BBLiveInMap = getRegionLiveInMap();
1131 if (GCNTrackers)
1132 RegionLiveOuts.buildLiveRegMap();
1133 }
1134
1135#ifdef DUMP_MAX_REG_PRESSURE
1139 LIS->dump();
1140 }
1141#endif
1142
1143 GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
1144 while (S.advanceStage()) {
1145 auto Stage = createSchedStage(S.getCurrentStage());
1146 if (!Stage->initGCNSchedStage())
1147 continue;
1148
1149 for (auto Region : Regions) {
1150 RegionBegin = Region.first;
1151 RegionEnd = Region.second;
1152 // Setup for scheduling the region and check whether it should be skipped.
1153 if (!Stage->initGCNRegion()) {
1154 Stage->advanceRegion();
1155 exitRegion();
1156 continue;
1157 }
1158
1159 if (GCNTrackers) {
1160 GCNDownwardRPTracker *DownwardTracker = S.getDownwardTracker();
1161 GCNUpwardRPTracker *UpwardTracker = S.getUpwardTracker();
1162 GCNRPTracker::LiveRegSet *RegionLiveIns =
1163 &LiveIns[Stage->getRegionIdx()];
1164
1165 reinterpret_cast<GCNRPTracker *>(DownwardTracker)
1166 ->reset(MRI, *RegionLiveIns);
1167 reinterpret_cast<GCNRPTracker *>(UpwardTracker)
1168 ->reset(MRI, RegionLiveOuts.getLiveRegsForRegionIdx(
1169 Stage->getRegionIdx()));
1170 }
1171
1173 Stage->finalizeGCNRegion();
1174 Stage->advanceRegion();
1175 exitRegion();
1176 }
1177
1178 Stage->finalizeGCNSchedStage();
1179 }
1180
1181#ifdef DUMP_MAX_REG_PRESSURE
1185 LIS->dump();
1186 }
1187#endif
1188}
1189
1190#ifndef NDEBUG
1192 switch (StageID) {
1194 OS << "Max Occupancy Initial Schedule";
1195 break;
1197 OS << "Instruction Rewriting Reschedule";
1198 break;
1200 OS << "Unclustered High Register Pressure Reschedule";
1201 break;
1203 OS << "Clustered Low Occupancy Reschedule";
1204 break;
1206 OS << "Pre-RA Rematerialize";
1207 break;
1209 OS << "Max ILP Initial Schedule";
1210 break;
1212 OS << "Max memory clause Initial Schedule";
1213 break;
1214 }
1215
1216 return OS;
1217}
1218#endif
1219
1223
1225 if (!DAG.LIS)
1226 return false;
1227
1228 LLVM_DEBUG(dbgs() << "Starting scheduling stage: " << StageID << "\n");
1229 return true;
1230}
1231
1232void RewriteMFMAFormStage::findReachingDefs(
1233 MachineOperand &UseMO, LiveIntervals *LIS,
1234 SmallVectorImpl<SlotIndex> &DefIdxs) {
1235 MachineInstr *UseMI = UseMO.getParent();
1236 LiveInterval &UseLI = LIS->getInterval(UseMO.getReg());
1237 VNInfo *VNI = UseLI.getVNInfoAt(LIS->getInstructionIndex(*UseMI));
1238
1239 // If the def is not a PHI, then it must be the only reaching def.
1240 if (!VNI->isPHIDef()) {
1241 DefIdxs.push_back(VNI->def);
1242 return;
1243 }
1244
1245 SmallPtrSet<MachineBasicBlock *, 8> Visited = {UseMI->getParent()};
1246 SmallVector<MachineBasicBlock *, 8> Worklist;
1247
1248 // Mark the predecessor blocks for traversal
1249 for (MachineBasicBlock *PredMBB : UseMI->getParent()->predecessors()) {
1250 Worklist.push_back(PredMBB);
1251 Visited.insert(PredMBB);
1252 }
1253
1254 while (!Worklist.empty()) {
1255 MachineBasicBlock *CurrMBB = Worklist.pop_back_val();
1256
1257 SlotIndex CurrMBBEnd = LIS->getMBBEndIdx(CurrMBB);
1258 VNInfo *VNI = UseLI.getVNInfoAt(CurrMBBEnd.getPrevSlot());
1259
1260 MachineBasicBlock *DefMBB = LIS->getMBBFromIndex(VNI->def);
1261
1262 // If there is a def in this block, then add it to the list. This is the
1263 // reaching def of this path.
1264 if (!VNI->isPHIDef()) {
1265 DefIdxs.push_back(VNI->def);
1266 continue;
1267 }
1268
1269 for (MachineBasicBlock *PredMBB : DefMBB->predecessors()) {
1270 if (Visited.insert(PredMBB).second)
1271 Worklist.push_back(PredMBB);
1272 }
1273 }
1274}
1275
1276void RewriteMFMAFormStage::findReachingUses(
1278 SmallVectorImpl<MachineOperand *> &ReachingUses) {
1279 SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI);
1280 for (MachineOperand &UseMO :
1281 DAG.MRI.use_nodbg_operands(DefMI->getOperand(0).getReg())) {
1282 SmallVector<SlotIndex, 8> ReachingDefIndexes;
1283 findReachingDefs(UseMO, LIS, ReachingDefIndexes);
1284
1285 // If we find a use that contains this DefMI in its reachingDefs, then it is
1286 // a reaching use.
1287 if (any_of(ReachingDefIndexes, [DefIdx](SlotIndex RDIdx) {
1288 return SlotIndex::isSameInstr(RDIdx, DefIdx);
1289 }))
1290 ReachingUses.push_back(&UseMO);
1291 }
1292}
1293
1295 // We only need to run this pass if the architecture supports AGPRs.
1296 // Additionally, we don't use AGPRs at occupancy levels above 1 so there
1297 // is no need for this pass in that case, either.
1298 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1299 if (!ST.hasGFX90AInsts() || MFI.getMinWavesPerEU() > 1)
1300 return false;
1301
1302 RegionsWithExcessArchVGPR.resize(DAG.Regions.size());
1303 RegionsWithExcessArchVGPR.reset();
1304 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
1306 if (PressureBefore.getArchVGPRNum() > ST.getAddressableNumArchVGPRs())
1307 RegionsWithExcessArchVGPR[Region] = true;
1308 }
1309
1310 if (RegionsWithExcessArchVGPR.none())
1311 return false;
1312
1313 TII = ST.getInstrInfo();
1314 SRI = ST.getRegisterInfo();
1315
1316 std::vector<std::pair<MachineInstr *, unsigned>> RewriteCands;
1319
1320 if (!initHeuristics(RewriteCands, CopyForUse, CopyForDef))
1321 return false;
1322
1323 int64_t Cost = getRewriteCost(RewriteCands, CopyForUse, CopyForDef);
1324
1325 // If we haven't found the beneficial conditions, prefer the VGPR form, which
1326 // may result in fewer cross-RC copies.
1327 if (Cost > 0)
1328 return false;
1329
1330 return rewrite(RewriteCands);
1331}
1332
1335 return false;
1336
1338 return false;
1339
1340 if (DAG.RegionsWithHighRP.none() && DAG.RegionsWithExcessRP.none())
1341 return false;
1342
1343 SavedMutations.swap(DAG.Mutations);
1344 DAG.addMutation(
1346
1347 InitialOccupancy = DAG.MinOccupancy;
1348 // Aggressively try to reduce register pressure in the unclustered high RP
1349 // stage. Temporarily increase occupancy target in the region.
1350 TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
1351 ? InitialOccupancy + 1
1352 : InitialOccupancy;
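  // For example, if this stage starts with DAG.MinOccupancy = 4 and the
  // function's waves-per-EU attribute allows more than 4 waves, the temporary
  // target becomes 5; otherwise it stays at the initial occupancy.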
1353 IsAnyRegionScheduled = false;
1354 S.SGPRLimitBias = S.HighRPSGPRBias;
1355 S.VGPRLimitBias = S.HighRPVGPRBias;
1356
1357 LLVM_DEBUG(
1358 dbgs()
1359 << "Retrying function scheduling without clustering. "
1360 "Aggressively try to reduce register pressure to achieve occupancy "
1361 << TempTargetOccupancy << ".\n");
1362
1363 return true;
1364}
1365
1368 return false;
1369
1371 return false;
1372
1373 // Don't bother trying to improve ILP in lower RP regions if occupancy has not
1374 // been dropped. All regions will have already been scheduled with the ideal
1375 // occupancy targets.
1376 if (DAG.StartingOccupancy <= DAG.MinOccupancy)
1377 return false;
1378
1379 LLVM_DEBUG(
1380 dbgs() << "Retrying function scheduling with lowest recorded occupancy "
1381 << DAG.MinOccupancy << ".\n");
1382 return true;
1383}
1384
1385/// Allows easy filtering of this stage's debug output.
1386#define REMAT_PREFIX "[PreRARemat] "
1387#define REMAT_DEBUG(X) LLVM_DEBUG(dbgs() << REMAT_PREFIX; X;)
1388
1390 // FIXME: This pass will invalidate the cached BBLiveInMap and MBBLiveIns for
1391 // regions in between the defs and the region we sank the def to. This will
1392 // need to be fixed if another pass runs after this one.
1393 assert(!S.hasNextStage());
1394
1395 if (!GCNSchedStage::initGCNSchedStage() || DAG.Regions.size() == 1)
1396 return false;
1397
1398 // Before performing any IR modification record the parent region of each MI
1399 // and the parent MBB of each region.
1400 const unsigned NumRegions = DAG.Regions.size();
1401 RegionBB.reserve(NumRegions);
1402 for (unsigned I = 0; I < NumRegions; ++I) {
1403 RegionBoundaries Region = DAG.Regions[I];
1404 for (auto MI = Region.first; MI != Region.second; ++MI)
1405 MIRegion.insert({&*MI, I});
1406 RegionBB.push_back(Region.first->getParent());
1407 }
1408
1409 if (!canIncreaseOccupancyOrReduceSpill())
1410 return false;
1411
1412 // Rematerialize identified instructions and update scheduler's state.
1413 rematerialize();
1414 if (GCNTrackers)
1415 DAG.RegionLiveOuts.buildLiveRegMap();
1416 REMAT_DEBUG({
1417 dbgs() << "Retrying function scheduling with new min. occupancy of "
1418 << AchievedOcc << " from rematerializing (original was "
1419 << DAG.MinOccupancy;
1420 if (TargetOcc)
1421 dbgs() << ", target was " << *TargetOcc;
1422 dbgs() << ")\n";
1423 });
1424
1425 if (AchievedOcc > DAG.MinOccupancy) {
1426 DAG.MinOccupancy = AchievedOcc;
1428 MFI.increaseOccupancy(MF, DAG.MinOccupancy);
1429 }
1430 return true;
1431}
1432
1434 DAG.finishBlock();
1435 LLVM_DEBUG(dbgs() << "Ending scheduling stage: " << StageID << "\n");
1436}
1437
1439 SavedMutations.swap(DAG.Mutations);
1440 S.SGPRLimitBias = S.VGPRLimitBias = 0;
1441 if (DAG.MinOccupancy > InitialOccupancy) {
1442 assert(IsAnyRegionScheduled);
1444 << " stage successfully increased occupancy to "
1445 << DAG.MinOccupancy << '\n');
1446 } else if (!IsAnyRegionScheduled) {
1447 assert(DAG.MinOccupancy == InitialOccupancy);
1449 << ": No regions scheduled, min occupancy stays at "
1450 << DAG.MinOccupancy << ", MFI occupancy stays at "
1451 << MFI.getOccupancy() << ".\n");
1452 }
1453
1455}
1456
1458 // Check whether this new region is also a new block.
1459 if (DAG.RegionBegin->getParent() != CurrentMBB)
1460 setupNewBlock();
1461
1462 unsigned NumRegionInstrs = std::distance(DAG.begin(), DAG.end());
1463 DAG.enterRegion(CurrentMBB, DAG.begin(), DAG.end(), NumRegionInstrs);
1464
1465 // Skip empty scheduling regions (0 or 1 schedulable instructions).
1466 if (DAG.begin() == DAG.end() || DAG.begin() == std::prev(DAG.end()))
1467 return false;
1468
1469 LLVM_DEBUG(dbgs() << "********** MI Scheduling **********\n");
1470 LLVM_DEBUG(dbgs() << MF.getName() << ":" << printMBBReference(*CurrentMBB)
1471 << " " << CurrentMBB->getName()
1472 << "\n From: " << *DAG.begin() << " To: ";
1473 if (DAG.RegionEnd != CurrentMBB->end()) dbgs() << *DAG.RegionEnd;
1474 else dbgs() << "End";
1475 dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
1476
1477 // Save original instruction order before scheduling for possible revert.
1478 Unsched.clear();
1479 Unsched.reserve(DAG.NumRegionInstrs);
1482 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG.TII);
1483 for (auto &I : DAG) {
1484 Unsched.push_back(&I);
1485 if (SII->isIGLPMutationOnly(I.getOpcode()))
1486 DAG.RegionsWithIGLPInstrs[RegionIdx] = true;
1487 }
1488 } else {
1489 for (auto &I : DAG)
1490 Unsched.push_back(&I);
1491 }
1492
1493 PressureBefore = DAG.Pressure[RegionIdx];
1494
1495 LLVM_DEBUG(
1496 dbgs() << "Pressure before scheduling:\nRegion live-ins:"
1497 << print(DAG.LiveIns[RegionIdx], DAG.MRI)
1498 << "Region live-in pressure: "
1499 << print(llvm::getRegPressure(DAG.MRI, DAG.LiveIns[RegionIdx]))
1500 << "Region register pressure: " << print(PressureBefore));
1501
1502 S.HasHighPressure = false;
1503 S.KnownExcessRP = isRegionWithExcessRP();
1504
1505 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1507 SavedMutations.clear();
1508 SavedMutations.swap(DAG.Mutations);
1509 bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
1511 DAG.addMutation(createIGroupLPDAGMutation(
1512 IsInitialStage ? AMDGPU::SchedulingPhase::Initial
1514 }
1515
1516 return true;
1517}
1518
1520 // Only reschedule regions that have excess register pressure (i.e. spilling)
1521 // or had minimum occupancy at the beginning of the stage (as long as
1522 // rescheduling of previous regions did not make occupancy drop back down to
1523 // the initial minimum).
1524 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1525 // If no region has been scheduled yet, the DAG has not yet been updated with
1526 // the occupancy target. So retrieve it from the temporary.
1527 unsigned CurrentTargetOccupancy =
1528 IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
1529 if (!DAG.RegionsWithExcessRP[RegionIdx] &&
1530 (CurrentTargetOccupancy <= InitialOccupancy ||
1531 DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
1532 InitialOccupancy))
1533 return false;
1534
1535 bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
1536 // If this is the first region scheduled during this stage, make the target
1537 // occupancy changes in the DAG and MFI.
1538 if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
1539 IsAnyRegionScheduled = true;
1540 if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
1541 DAG.MinOccupancy = TempTargetOccupancy;
1542 MFI.increaseOccupancy(MF, TempTargetOccupancy);
1543 }
1544 }
1545 return IsSchedulingThisRegion;
1546}
1547
1549 // We may need to reschedule this region if it wasn't rescheduled in the last
1550 // stage, or if we found it was testing critical register pressure limits in
1551 // the unclustered reschedule stage. The latter is because we may not have
1552 // been able to raise the min occupancy in the previous stage, so the region
1553 // may be overly constrained even if it was already rescheduled.
1554 if (!DAG.RegionsWithHighRP[RegionIdx])
1555 return false;
1556
1558}
1559
1561 return RescheduleRegions[RegionIdx] && GCNSchedStage::initGCNRegion();
1562}
1563
1565 if (CurrentMBB)
1566 DAG.finishBlock();
1567
1568 CurrentMBB = DAG.RegionBegin->getParent();
1569 DAG.startBlock(CurrentMBB);
1570 // Get the real RP for the region if it hasn't been calculated before. After
1571 // the initial schedule stage, the real RP will be collected after scheduling.
1575 DAG.computeBlockPressure(RegionIdx, CurrentMBB);
1576}
1577
1579 DAG.Regions[RegionIdx] = std::pair(DAG.RegionBegin, DAG.RegionEnd);
1580 if (S.HasHighPressure)
1581 DAG.RegionsWithHighRP[RegionIdx] = true;
1582
1583 // Revert scheduling if we have dropped occupancy or there is some other
1584 // reason that the original schedule is better.
1586
1587 if (DAG.RegionsWithIGLPInstrs[RegionIdx] &&
1589 SavedMutations.swap(DAG.Mutations);
1590}
1591
1593 // Check the results of scheduling.
1594 PressureAfter = DAG.getRealRegPressure(RegionIdx);
1595
1596 LLVM_DEBUG(dbgs() << "Pressure after scheduling: " << print(PressureAfter));
1597 LLVM_DEBUG(dbgs() << "Region: " << RegionIdx << ".\n");
1598
1599 unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
1600
1601 if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit &&
1602 PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) <= S.VGPRCriticalLimit) {
1603 DAG.Pressure[RegionIdx] = PressureAfter;
1604
1605 // Early out if we have achieved the occupancy target.
1606 LLVM_DEBUG(dbgs() << "Pressure in desired limits, done.\n");
1607 return;
1608 }
1609
1610 unsigned TargetOccupancy = std::min(
1611 S.getTargetOccupancy(), ST.getOccupancyWithWorkGroupSizes(MF).second);
1612 unsigned WavesAfter = std::min(
1613 TargetOccupancy, PressureAfter.getOccupancy(ST, DynamicVGPRBlockSize));
1614 unsigned WavesBefore = std::min(
1615 TargetOccupancy, PressureBefore.getOccupancy(ST, DynamicVGPRBlockSize));
1616 LLVM_DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore
1617 << ", after " << WavesAfter << ".\n");
1618
1619 // We may not be able to keep the current target occupancy because of the just
1620 // scheduled region. We might still be able to revert scheduling if the
1621 // occupancy before was higher, or if the current schedule has register
1622 // pressure higher than the excess limits which could lead to more spilling.
1623 unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
1624
1625 // Allow memory bound functions to drop to 4 waves if not limited by an
1626 // attribute.
1627 if (WavesAfter < WavesBefore && WavesAfter < DAG.MinOccupancy &&
1628 WavesAfter >= MFI.getMinAllowedOccupancy()) {
1629 LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
1630 << MFI.getMinAllowedOccupancy() << " waves\n");
1631 NewOccupancy = WavesAfter;
1632 }
1633
1634 if (NewOccupancy < DAG.MinOccupancy) {
1635 DAG.MinOccupancy = NewOccupancy;
1636 MFI.limitOccupancy(DAG.MinOccupancy);
1637 LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
1638 << DAG.MinOccupancy << ".\n");
1639 }
1640 // The maximum number of arch VGPRs on a non-unified register file, or the
1641 // maximum number of VGPRs + AGPRs in the unified register file case.
1642 unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
1643 // The maximum number of arch VGPRs for both unified and non-unified register
1644 // files.
1645 unsigned MaxArchVGPRs = std::min(MaxVGPRs, ST.getAddressableNumArchVGPRs());
1646 unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
1647
1648 if (PressureAfter.getVGPRNum(ST.hasGFX90AInsts()) > MaxVGPRs ||
1649 PressureAfter.getArchVGPRNum() > MaxArchVGPRs ||
1650 PressureAfter.getAGPRNum() > MaxArchVGPRs ||
1651 PressureAfter.getSGPRNum() > MaxSGPRs) {
1652 DAG.RegionsWithHighRP[RegionIdx] = true;
1653 DAG.RegionsWithExcessRP[RegionIdx] = true;
1654 }
1655
1656 // Revert if this region's schedule would cause a drop in occupancy or
1657 // spilling.
1658 if (shouldRevertScheduling(WavesAfter)) {
1660 std::tie(DAG.RegionBegin, DAG.RegionEnd) = DAG.Regions[RegionIdx];
1661 } else {
1662 DAG.Pressure[RegionIdx] = PressureAfter;
1663 }
1664}
1665
1666unsigned
1667GCNSchedStage::computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle,
1668 DenseMap<unsigned, unsigned> &ReadyCycles,
1669 const TargetSchedModel &SM) {
1670 unsigned ReadyCycle = CurrCycle;
1671 for (auto &D : SU.Preds) {
1672 if (D.isAssignedRegDep()) {
1673 MachineInstr *DefMI = D.getSUnit()->getInstr();
1674 unsigned Latency = SM.computeInstrLatency(DefMI);
1675 unsigned DefReady = ReadyCycles[DAG.getSUnit(DefMI)->NodeNum];
1676 ReadyCycle = std::max(ReadyCycle, DefReady + Latency);
1677 }
1678 }
1679 ReadyCycles[SU.NodeNum] = ReadyCycle;
1680 return ReadyCycle;
1681}
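// Worked example (hypothetical latencies): if an SU's only register-dependence
// predecessor was recorded ready at cycle 10 and has a computed latency of 4,
// the SU's ready cycle becomes max(CurrCycle, 10 + 4) = 14 whenever CurrCycle
// is smaller than 14.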
1682
1683#ifndef NDEBUG
1685 bool operator()(std::pair<MachineInstr *, unsigned> A,
1686 std::pair<MachineInstr *, unsigned> B) const {
1687 return A.second < B.second;
1688 }
1689};
1690
1691static void printScheduleModel(std::set<std::pair<MachineInstr *, unsigned>,
1692 EarlierIssuingCycle> &ReadyCycles) {
1693 if (ReadyCycles.empty())
1694 return;
1695 unsigned BBNum = ReadyCycles.begin()->first->getParent()->getNumber();
1696 dbgs() << "\n################## Schedule time ReadyCycles for MBB : " << BBNum
1697 << " ##################\n# Cycle #\t\t\tInstruction "
1698 " "
1699 " \n";
1700 unsigned IPrev = 1;
1701 for (auto &I : ReadyCycles) {
1702 if (I.second > IPrev + 1)
1703 dbgs() << "****************************** BUBBLE OF " << I.second - IPrev
1704 << " CYCLES DETECTED ******************************\n\n";
1705 dbgs() << "[ " << I.second << " ] : " << *I.first << "\n";
1706 IPrev = I.second;
1707 }
1708}
1709#endif
1710
1711ScheduleMetrics
1712GCNSchedStage::getScheduleMetrics(const std::vector<SUnit> &InputSchedule) {
1713#ifndef NDEBUG
1714 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1715 ReadyCyclesSorted;
1716#endif
1717 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1718 unsigned SumBubbles = 0;
1719 DenseMap<unsigned, unsigned> ReadyCycles;
1720 unsigned CurrCycle = 0;
1721 for (auto &SU : InputSchedule) {
1722 unsigned ReadyCycle =
1723 computeSUnitReadyCycle(SU, CurrCycle, ReadyCycles, SM);
1724 SumBubbles += ReadyCycle - CurrCycle;
1725#ifndef NDEBUG
1726 ReadyCyclesSorted.insert(std::make_pair(SU.getInstr(), ReadyCycle));
1727#endif
1728 CurrCycle = ++ReadyCycle;
1729 }
1730#ifndef NDEBUG
1731 LLVM_DEBUG(
1732 printScheduleModel(ReadyCyclesSorted);
1733 dbgs() << "\n\t"
1734 << "Metric: "
1735 << (SumBubbles
1736 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1737 : 1)
1738 << "\n\n");
1739#endif
1740
1741 return ScheduleMetrics(CurrCycle, SumBubbles);
1742}
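// A minimal sketch of how the loop above collapses into a single score; the
// helper below is purely illustrative (it is not part of this file) and only
// mirrors the SumBubbles/CurrCycle scaling used by getScheduleMetrics:
//
//   static unsigned exampleScheduleMetric(unsigned SumBubbles,
//                                         unsigned CurrCycle) {
//     return SumBubbles
//                ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
//                : 1;
//   }
//
// For instance, 40 stall cycles over a 200-cycle schedule score
// (40 * 100) / 200 = 20, while a bubble-free schedule reports 1.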
1743
1744ScheduleMetrics
1745GCNSchedStage::getScheduleMetrics(const GCNScheduleDAGMILive &DAG) {
1746#ifndef NDEBUG
1747 std::set<std::pair<MachineInstr *, unsigned>, EarlierIssuingCycle>
1748 ReadyCyclesSorted;
1749#endif
1750 const TargetSchedModel &SM = ST.getInstrInfo()->getSchedModel();
1751 unsigned SumBubbles = 0;
1752 DenseMap<unsigned, unsigned> ReadyCycles;
1753 unsigned CurrCycle = 0;
1754 for (auto &MI : DAG) {
1755 SUnit *SU = DAG.getSUnit(&MI);
1756 if (!SU)
1757 continue;
1758 unsigned ReadyCycle =
1759 computeSUnitReadyCycle(*SU, CurrCycle, ReadyCycles, SM);
1760 SumBubbles += ReadyCycle - CurrCycle;
1761#ifndef NDEBUG
1762 ReadyCyclesSorted.insert(std::make_pair(SU->getInstr(), ReadyCycle));
1763#endif
1764 CurrCycle = ++ReadyCycle;
1765 }
1766#ifndef NDEBUG
1767 LLVM_DEBUG(
1768 printScheduleModel(ReadyCyclesSorted);
1769 dbgs() << "\n\t"
1770 << "Metric: "
1771 << (SumBubbles
1772 ? (SumBubbles * ScheduleMetrics::ScaleFactor) / CurrCycle
1773 : 1)
1774 << "\n\n");
1775#endif
1776
1777 return ScheduleMetrics(CurrCycle, SumBubbles);
1778}
1779
1780bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
1781 if (WavesAfter < DAG.MinOccupancy)
1782 return true;
1783
1784 // For dynamic VGPR mode, we don't want to waste any VGPR blocks.
1785 if (DAG.MFI.isDynamicVGPREnabled()) {
1786 unsigned BlocksBefore = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1787 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1788 PressureBefore.getVGPRNum(false));
1789 unsigned BlocksAfter = AMDGPU::IsaInfo::getAllocatedNumVGPRBlocks(
1790 &ST, DAG.MFI.getDynamicVGPRBlockSize(),
1791 PressureAfter.getVGPRNum(false));
1792 if (BlocksAfter > BlocksBefore)
1793 return true;
1794 }
1795
1796 return false;
1797}
1798
1799bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1800 if (PressureAfter == PressureBefore)
1801 return false;
1802
1803 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1804 return true;
1805
1806 if (mayCauseSpilling(WavesAfter))
1807 return true;
1808
1809 return false;
1810}
1811
1812bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
1813 // If RP is not reduced in the unclustered reschedule stage, revert to the
1814 // old schedule.
1815 if ((WavesAfter <=
1816 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()) &&
1817 mayCauseSpilling(WavesAfter)) ||
1818 GCNSchedStage::shouldRevertScheduling(WavesAfter)) {
1819 LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
1820 return true;
1821 }
1822
1823 // Do not attempt to relax schedule even more if we are already spilling.
1824 if (isRegionWithExcessRP())
1825 return false;
1826
1827 LLVM_DEBUG(
1828 dbgs()
1829 << "\n\t *** In shouldRevertScheduling ***\n"
1830 << " *********** BEFORE UnclusteredHighRPStage ***********\n");
1831 ScheduleMetrics MBefore = getScheduleMetrics(DAG.SUnits);
1832 LLVM_DEBUG(
1833 dbgs()
1834 << "\n *********** AFTER UnclusteredHighRPStage ***********\n");
1835 ScheduleMetrics MAfter = getScheduleMetrics(DAG);
1836 unsigned OldMetric = MBefore.getMetric();
1837 unsigned NewMetric = MAfter.getMetric();
1838 unsigned WavesBefore = std::min(
1839 S.getTargetOccupancy(),
1840 PressureBefore.getOccupancy(ST, DAG.MFI.getDynamicVGPRBlockSize()));
1841 unsigned Profit =
1842 ((WavesAfter * ScheduleMetrics::ScaleFactor) / WavesBefore *
1843 ((OldMetric + ScheduleMetricBias) * ScheduleMetrics::ScaleFactor) /
1844 NewMetric) /
1845 ScheduleMetrics::ScaleFactor;
1846 LLVM_DEBUG(dbgs() << "\tMetric before " << MBefore << "\tMetric after "
1847 << MAfter << "Profit: " << Profit << "\n");
1848 return Profit < ScheduleMetrics::ScaleFactor;
1849}
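// Illustrative numbers for the profitability check above (assumed values, not
// measurements): with WavesBefore = WavesAfter = 4, OldMetric = 20,
// ScheduleMetricBias = 10 (the default) and NewMetric = 50,
//   Profit = ((4 * 100) / 4 * ((20 + 10) * 100) / 50) / 100 = 60,
// which is below ScaleFactor (100), so the unclustered schedule is reverted.
// The relaxed schedule is only kept when its latency improvement outweighs the
// occupancy-weighted bias.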
1850
1851bool ClusteredLowOccStage::shouldRevertScheduling(unsigned WavesAfter) {
1852 if (PressureAfter == PressureBefore)
1853 return false;
1854
1855 if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
1856 return true;
1857
1858 if (mayCauseSpilling(WavesAfter))
1859 return true;
1860
1861 return false;
1862}
1863
1864bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
1865 return GCNSchedStage::shouldRevertScheduling(WavesAfter) ||
1866 mayCauseSpilling(WavesAfter) || (TargetOcc && WavesAfter < TargetOcc);
1867}
1868
1869bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
1870 if (mayCauseSpilling(WavesAfter))
1871 return true;
1872
1873 return false;
1874}
1875
1876bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
1877 unsigned WavesAfter) {
1878 return mayCauseSpilling(WavesAfter);
1879}
1880
1881bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
1882 if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
1883 !PressureAfter.less(MF, PressureBefore)) {
1884 LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
1885 return true;
1886 }
1887
1888 return false;
1889}
1890
1891void GCNSchedStage::modifyRegionSchedule(unsigned RegionIdx,
1892 MachineBasicBlock *MBB,
1893 ArrayRef<MachineInstr *> MIOrder) {
1894 assert(static_cast<size_t>(std::distance(DAG.Regions[RegionIdx].first,
1895 DAG.Regions[RegionIdx].second)) ==
1896 MIOrder.size() &&
1897 "instruction number mismatch");
1898 if (MIOrder.empty())
1899 return;
1900
1901 LLVM_DEBUG(dbgs() << "Reverting scheduling for region " << RegionIdx << '\n');
1902
1903 // Reconstruct MI sequence by moving instructions in desired order before
1904 // the current region's start.
1905 MachineBasicBlock::iterator RegionEnd = DAG.Regions[RegionIdx].first;
1906 for (MachineInstr *MI : MIOrder) {
1907 // Either move the next MI in order before the end of the region or move the
1908 // region end past the MI if it is at the correct position.
1909 MachineBasicBlock::iterator MII = MI->getIterator();
1910 if (MII != RegionEnd) {
1911 // Will subsequent splice move MI up past a non-debug instruction?
1912 bool NonDebugReordered =
1913 !MI->isDebugInstr() &&
1914 skipDebugInstructionsForward(RegionEnd, MII) != MII;
1915 MBB->splice(RegionEnd, MBB, MI);
1916 // Only update LiveIntervals information if non-debug instructions are
1917 // reordered. Otherwise debug instructions could cause code generation to
1918 // change.
1919 if (NonDebugReordered)
1920 DAG.LIS->handleMove(*MI, true);
1921 } else {
1922 ++RegionEnd;
1923 }
1924 if (MI->isDebugInstr()) {
1925 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1926 continue;
1927 }
1928
1929 // Reset read-undef flags and update them later.
1930 for (MachineOperand &Op : MI->all_defs())
1931 Op.setIsUndef(false);
1932 RegisterOperands RegOpers;
1933 RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
1934 if (DAG.ShouldTrackLaneMasks) {
1935 // Adjust liveness and add missing dead+read-undef flags.
1936 SlotIndex SlotIdx = DAG.LIS->getInstructionIndex(*MI).getRegSlot();
1937 RegOpers.adjustLaneLiveness(*DAG.LIS, DAG.MRI, SlotIdx, MI);
1938 } else {
1939 // Adjust for missing dead-def flags.
1940 RegOpers.detectDeadDefs(*MI, *DAG.LIS);
1941 }
1942 LLVM_DEBUG(dbgs() << "Scheduling " << *MI);
1943 }
1944
1945 // The region end doesn't change throughout scheduling since it itself is
1946 // outside the region (whether that is an MBB end or a terminator MI).
1947 assert(RegionEnd == DAG.Regions[RegionIdx].second && "region end mismatch");
1948 DAG.Regions[RegionIdx].first = MIOrder.front();
1949}
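// Conceptual walk-through of the splice loop above (illustrative only): for a
// region currently ordered [A, B, C] and MIOrder = [B, A, C], the first
// iteration splices B in front of A (RegionEnd still points at A), the second
// finds A already at RegionEnd and simply advances past it, and the third does
// the same for C. The result is [B, A, C], with LiveIntervals updated only for
// B, the one non-debug instruction that actually moved past another one.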
1950
1951bool RewriteMFMAFormStage::isRewriteCandidate(MachineInstr *MI) const {
1952
1953 if (!static_cast<const SIInstrInfo *>(DAG.TII)->isMAI(*MI))
1954 return false;
1955 return AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode()) != -1;
1956}
1957
1958bool RewriteMFMAFormStage::initHeuristics(
1959 std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
1960 DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
1961 SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
1962 bool Changed = false;
1963
1964 // Prepare for the heuristics
1965 for (MachineBasicBlock &MBB : MF) {
1966 for (MachineInstr &MI : MBB) {
1967 if (!isRewriteCandidate(&MI))
1968 continue;
1969
1970 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode());
1971 assert(ReplacementOp != -1);
1972
1973 RewriteCands.push_back({&MI, MI.getOpcode()});
1974 MI.setDesc(TII->get(ReplacementOp));
1975
1976 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1977 if (Src2->isReg()) {
1978 SmallVector<SlotIndex, 8> Src2ReachingDefs;
1979 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
1980
1981 // For any definition of the src2 register which is non-MFMA, we
1982 // insert a copy.
1983 for (SlotIndex RDIdx : Src2ReachingDefs) {
1984 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIdx);
1985 if (!TII->isMAI(*RD))
1986 CopyForDef.insert(RD);
1987 }
1988 }
1989
1990 MachineOperand &Dst = MI.getOperand(0);
1991 SmallVector<MachineOperand *, 8> DstReachingUses;
1992
1993 findReachingUses(&MI, DAG.LIS, DstReachingUses);
1994
1995 for (MachineOperand *RUOp : DstReachingUses) {
1996 if (TII->isMAI(*RUOp->getParent()))
1997 continue;
1998
1999 // For any user of the result of the MFMA which is not an MFMA, we
2000 // insert a copy. For a given register, we will only insert one copy
2001 // per user block.
2002 CopyForUse[RUOp->getParent()->getParent()].insert(RUOp->getReg());
2003
2004 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2005 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2006
2007 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2008 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2009 if (TII->isMAI(*RD))
2010 continue;
2011
2012 // For any definition of the user of the MFMA which is not an MFMA,
2013 // we insert a copy. We do this to transform all the reaching defs
2014 // of this use to AGPR. By doing this, we can insert a copy from
2015 // AGPR to VGPR at the user rather than after the MFMA.
2016 CopyForDef.insert(RD);
2017 }
2018 }
2019
2020 // Do the rewrite to allow for updated RP calculation.
2021 const TargetRegisterClass *VGPRRC = DAG.MRI.getRegClass(Dst.getReg());
2022 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(VGPRRC);
2023 DAG.MRI.setRegClass(Dst.getReg(), AGPRRC);
2024 if (Src2->isReg())
2025 DAG.MRI.setRegClass(Src2->getReg(), AGPRRC);
2026 Changed = true;
2027 }
2028 }
2029
2030 return Changed;
2031}
2032
2033int64_t RewriteMFMAFormStage::getRewriteCost(
2034 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands,
2035 const DenseMap<MachineBasicBlock *, std::set<Register>> &CopyForUse,
2036 const SmallPtrSetImpl<MachineInstr *> &CopyForDef) {
2037 MachineBlockFrequencyInfo *MBFI = DAG.MBFI;
2038
2039 int64_t BestSpillCost = 0;
2040 int64_t Cost = 0;
2041 uint64_t EntryFreq = MBFI->getEntryFreq().getFrequency();
2042
2043 std::pair<unsigned, unsigned> MaxVectorRegs =
2044 ST.getMaxNumVectorRegs(MF.getFunction());
2045 unsigned ArchVGPRThreshold = MaxVectorRegs.first;
2046 unsigned AGPRThreshold = MaxVectorRegs.second;
2047 unsigned CombinedThreshold = ST.getMaxNumVGPRs(MF);
2048
2049 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2050 if (!RegionsWithExcessArchVGPR[Region])
2051 continue;
2052
2053 GCNRegPressure &PressureBefore = DAG.Pressure[Region];
2054 unsigned SpillCostBefore = PressureBefore.getVGPRSpills(
2055 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2056
2057 // For the cases we care about (i.e. ArchVGPR usage is greater than the
2058 // addressable limit), rewriting alone should bring pressure to manageable
2059 // level. If we find any such region, then the rewrite is potentially
2060 // beneficial.
2061 GCNRegPressure PressureAfter = DAG.getRealRegPressure(Region);
2062 unsigned SpillCostAfter = PressureAfter.getVGPRSpills(
2063 MF, ArchVGPRThreshold, AGPRThreshold, CombinedThreshold);
2064
2065 uint64_t BlockFreq =
2066 MBFI->getBlockFreq(DAG.Regions[Region].first->getParent())
2067 .getFrequency();
2068
2069 bool RelativeFreqIsDenom = EntryFreq > BlockFreq;
2070 uint64_t RelativeFreq = EntryFreq && BlockFreq
2071 ? (RelativeFreqIsDenom ? EntryFreq / BlockFreq
2072 : BlockFreq / EntryFreq)
2073 : 1;
2074
2075 // This assumes perfect spilling / splitting -- using one spill / copy
2076 // instruction and one restoreFrom / copy for each excess register.
2077 int64_t SpillCost = ((int)SpillCostAfter - (int)SpillCostBefore) * 2;
2078
2079 // Also account for the block frequency.
2080 if (RelativeFreqIsDenom)
2081 SpillCost /= (int64_t)RelativeFreq;
2082 else
2083 SpillCost *= (int64_t)RelativeFreq;
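// Example with assumed numbers: if the rewrite removes 3 spills from a region
// whose block runs 4x more often than the entry block, SpillCost is
// (0 - 3) * 2 = -6 before scaling and -6 * 4 = -24 after it, so hot loop
// bodies dominate the decision; the same reduction in a block 4x colder than
// the entry would only contribute -6 / 4 = -1.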
2084
2085 // If we have increased spilling in any block, just bail.
2086 if (SpillCost > 0)
2087 return SpillCost;
2088
2089 if (SpillCost < BestSpillCost)
2090 BestSpillCost = SpillCost;
2091 }
2092
2093 // Set the cost to the largest decrease in spill cost in order to not double
2094 // count spill reductions.
2095 Cost = BestSpillCost;
2096 assert(Cost <= 0);
2097
2098 unsigned CopyCost = 0;
2099
2100 // For each CopyForDef, increase the cost by the register size while
2101 // accounting for block frequency.
2102 for (MachineInstr *DefMI : CopyForDef) {
2103 Register DefReg = DefMI->getOperand(0).getReg();
2104 uint64_t DefFreq =
2105 EntryFreq
2106 ? MBFI->getBlockFreq(DefMI->getParent()).getFrequency() / EntryFreq
2107 : 1;
2108
2109 const TargetRegisterClass *RC = DAG.MRI.getRegClass(DefReg);
2110 CopyCost += RC->getCopyCost() * DefFreq;
2111 }
2112
2113 // Account for CopyForUse copies in each block that the register is used.
2114 for (auto &[UseBlock, UseRegs] : CopyForUse) {
2115 uint64_t UseFreq =
2116 EntryFreq ? MBFI->getBlockFreq(UseBlock).getFrequency() / EntryFreq : 1;
2117
2118 for (Register UseReg : UseRegs) {
2119 const TargetRegisterClass *RC = DAG.MRI.getRegClass(UseReg);
2120 CopyCost += RC->getCopyCost() * UseFreq;
2121 }
2122 }
2123
2124 // Reset the classes that were changed to AGPR for better RB analysis.
2125 // We must do rewriting after copy-insertion, as some defs of the register
2126 // may require VGPR. Additionally, if we bail out and don't perform the
2127 // rewrite then these need to be restored anyway.
2128 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2129 assert(TII->isMAI(*MI));
2130 const TargetRegisterClass *AGPRRC =
2131 DAG.MRI.getRegClass(MI->getOperand(0).getReg());
2132 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(AGPRRC);
2133
2134 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2135 assert(Src2);
2136
2137 if (Src2->isReg())
2138 DAG.MRI.setRegClass(Src2->getReg(), VGPRRC);
2139 DAG.MRI.setRegClass(MI->getOperand(0).getReg(), VGPRRC);
2140 MI->setDesc(TII->get(OriginalOpcode));
2141 }
2142
2143 return Cost + CopyCost;
2144}
2145
2146bool RewriteMFMAFormStage::rewrite(
2147 const std::vector<std::pair<MachineInstr *, unsigned>> &RewriteCands) {
2148 DenseMap<MachineInstr *, unsigned> FirstMIToRegion;
2149 DenseMap<MachineInstr *, unsigned> LastMIToRegion;
2150
2151 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++) {
2152 RegionBoundaries Entry = DAG.Regions[Region];
2153 if (Entry.first == Entry.second)
2154 continue;
2155
2156 FirstMIToRegion[&*Entry.first] = Region;
2157 if (Entry.second != Entry.first->getParent()->end())
2158 LastMIToRegion[&*Entry.second] = Region;
2159 }
2160
2161 // Rewrite the MFMAs to AGPR, and insert any copies as needed.
2162 // The general assumption of the algorithm (and the previous cost calculation)
2163 // is that it is better to insert the copies in the MBB of the def of the src2
2164 // operands, and in the MBB of the user of the dest operands. This is based on
2165 // the assumption that the MFMAs are likely to appear in loop bodies, while
2166 // the src2 and dest operands are live-in / live-out of the loop. Due to this
2167 // design, the algorithm for finding copy insertion points is more
2168 // complicated.
2169 //
2170 // There are three main cases to handle: 1. the reaching defs of the src2
2171 // operands, 2. the reaching uses of the dst operands, and 3. the reaching
2172 // defs of the reaching uses of the dst operand.
2173 //
2174 // In the first case, we simply insert copies after each of the reaching
2175 // definitions. In the second case, we collect all the uses of a given dest
2176 // and organize them by MBB. Then, we insert 1 copy for each MBB before the
2177 // earliest use. Since the use may have multiple reaching defs, and since we
2178 // want to replace the register it is using with the result of the copy, we
2179 // must handle case 3. In the third case, we simply insert a copy after each
2180 // of the reaching defs to connect to the copy of the reaching uses of the dst
2181 // reg. This allows us to avoid inserting copies next to the MFMAs.
2182 //
2183 // While inserting the copies, we maintain a map of operands which will use
2184 // different regs (i.e. the result of the copies). For example, a case 1 src2
2185 // operand will use the register result of the copies after the reaching defs,
2186 // as opposed to the original register. Now that we have completed our copy
2187 // analysis and placement, we can bulk update the registers. We do this
2188 // separately as to avoid complicating the reachingDef and reachingUse
2189 // queries.
2190 //
2191 // While inserting the copies, we also maintain a list of registers which we
2192 // will want to reclassify as AGPR. After doing the copy insertion and the
2193 // register replacement, we can finally do the reclassification. This uses the
2194 // redef map, as the registers we are interested in reclassifying may be
2195 // replaced by the result of a copy. We must do this after the copy analysis
2196 // and placement as we must have an accurate redef map -- otherwise we may end
2197 // up creating illegal instructions.
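//
// As a schematic example (illustrative pseudo-MIR, not taken from a real
// test):
//
//   bb.0:
//     %acc = <non-MFMA def>      ; case 1: a COPY to the remapped register is
//                                ;         inserted right after this def
//   bb.1 (loop body):
//     %acc = MFMA %a, %b, %acc   ; rewritten in place to its AGPR-dst form
//   bb.2:
//     <non-MFMA use of %acc>     ; case 2: one COPY per block is inserted
//                                ;         before the earliest such use
//
// so all cross-class copies stay out of the (presumably hot) MFMA block, which
// is exactly what the cost model above assumes.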
2198
2199 // The original registers of the MFMA that need to be reclassified as AGPR.
2200 DenseSet<Register> RewriteRegs;
2201 // The map of an original register in the MFMA to a new register (result of a
2202 // copy) that it should be replaced with.
2203 DenseMap<Register, Register> RedefMap;
2204 // The map of the original MFMA registers to the relevant MFMA operands.
2205 DenseMap<Register, DenseSet<MachineOperand *>> ReplaceMap;
2206 // The map of reaching defs for a given register -- to avoid duplicate copies.
2207 DenseMap<Register, SmallPtrSet<MachineInstr *, 8>> ReachingDefCopyMap;
2208 // The map of reaching uses for a given register by basic block -- to avoid
2209 // duplicate copies and to calculate per MBB insert pts.
2210 DenseMap<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>
2211 ReachingUseTracker;
2212
2213 for (auto &[MI, OriginalOpcode] : RewriteCands) {
2214 int ReplacementOp = AMDGPU::getMFMASrcCVDstAGPROp(MI->getOpcode());
2215 if (ReplacementOp == -1)
2216 continue;
2217 MI->setDesc(TII->get(ReplacementOp));
2218
2219 // Case 1: insert copies for the reaching defs of the Src2Reg.
2220 MachineOperand *Src2 = TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
2221 if (Src2->isReg()) {
2222 Register Src2Reg = Src2->getReg();
2223 if (!Src2Reg.isVirtual())
2224 return false;
2225
2226 Register MappedReg = Src2->getReg();
2227 SmallVector<SlotIndex, 8> Src2ReachingDefs;
2228 findReachingDefs(*Src2, DAG.LIS, Src2ReachingDefs);
2229 SmallSetVector<MachineInstr *, 8> Src2DefsReplace;
2230
2231 for (SlotIndex RDIndex : Src2ReachingDefs) {
2232 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2233 if (TII->isMAI(*RD))
2234 continue;
2235
2236 // If there is a non-MAI reaching def, then we need a copy.
2237 Src2DefsReplace.insert(RD);
2238 }
2239
2240 if (!Src2DefsReplace.empty()) {
2241 DenseMap<Register, Register>::iterator RI = RedefMap.find(Src2Reg);
2242 if (RI != RedefMap.end()) {
2243 MappedReg = RI->second;
2244 } else {
2245 assert(!ReachingDefCopyMap.contains(Src2Reg));
2246 const TargetRegisterClass *Src2RC = DAG.MRI.getRegClass(Src2Reg);
2247 const TargetRegisterClass *VGPRRC =
2248 SRI->getEquivalentVGPRClass(Src2RC);
2249
2250 // Track the mapping of the original register to the new register.
2251 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2252 RedefMap[Src2Reg] = MappedReg;
2253 }
2254
2255 // If none exists, create a copy from this reaching def.
2256 // We may have inserted a copy already in an earlier iteration.
2257 for (MachineInstr *RD : Src2DefsReplace) {
2258 // Do not create redundant copies.
2259 if (ReachingDefCopyMap[Src2Reg].insert(RD).second) {
2260 MachineInstrBuilder VGPRCopy =
2261 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2262 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2263 .addDef(MappedReg, {}, 0)
2264 .addUse(Src2Reg, {}, 0);
2265 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2266
2267 // If this reaching def was the last MI in the region, update the
2268 // region boundaries.
2269 if (LastMIToRegion.contains(RD)) {
2270 unsigned UpdateRegion = LastMIToRegion[RD];
2271 DAG.Regions[UpdateRegion].second = VGPRCopy;
2272 LastMIToRegion.erase(RD);
2273 }
2274 }
2275 }
2276 }
2277
2278 // Track the register for reclassification
2279 RewriteRegs.insert(Src2Reg);
2280
2281 // Always insert the operand for replacement. If this corresponds with a
2282 // chain of tied defs, we may not see the VGPR requirement until later.
2283 ReplaceMap[Src2Reg].insert(Src2);
2284 }
2285
2286 // Case 2 and Case 3: insert copies before the reaching uses of the dsts,
2287 // and after the reaching defs of the reaching uses of the dsts.
2288
2289 MachineOperand *Dst = &MI->getOperand(0);
2290 Register DstReg = Dst->getReg();
2291 if (!DstReg.isVirtual())
2292 return false;
2293
2294 Register MappedReg = DstReg;
2295 SmallVector<MachineOperand *, 8> DstReachingUses;
2296
2297 SmallVector<MachineOperand *, 8> DstReachingUseCopies;
2298 SmallVector<MachineInstr *, 8> DstUseDefsReplace;
2299
2300 findReachingUses(MI, DAG.LIS, DstReachingUses);
2301
2302 for (MachineOperand *RUOp : DstReachingUses) {
2303 if (TII->isMAI(*RUOp->getParent()))
2304 continue;
2305
2306 // If there is a non-MAI reaching use, then we need a copy.
2307 if (find(DstReachingUseCopies, RUOp) == DstReachingUseCopies.end())
2308 DstReachingUseCopies.push_back(RUOp);
2309 SmallVector<SlotIndex, 8> DstUsesReachingDefs;
2310 findReachingDefs(*RUOp, DAG.LIS, DstUsesReachingDefs);
2311
2312 for (SlotIndex RDIndex : DstUsesReachingDefs) {
2313 MachineInstr *RD = DAG.LIS->getInstructionFromIndex(RDIndex);
2314 if (TII->isMAI(*RD))
2315 continue;
2316
2317 // If there is a non-MAI reaching def of this reaching use, then we will
2318 // need a copy.
2319 if (find(DstUseDefsReplace, RD) == DstUseDefsReplace.end())
2320 DstUseDefsReplace.push_back(RD);
2321 }
2322 }
2323
2324 if (!DstUseDefsReplace.empty()) {
2325 DenseMap<Register, Register>::iterator RI = RedefMap.find(DstReg);
2326 if (RI != RedefMap.end()) {
2327 MappedReg = RI->second;
2328 } else {
2329 assert(!ReachingDefCopyMap.contains(DstReg));
2330 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2331 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2332
2333 // Track the mapping of the original register to the new register.
2334 MappedReg = DAG.MRI.createVirtualRegister(VGPRRC);
2335 RedefMap[DstReg] = MappedReg;
2336 }
2337
2338 // If none exists, create a copy from this reaching def.
2339 // We may have inserted a copy already in an earlier iteration.
2340 for (MachineInstr *RD : DstUseDefsReplace) {
2341 // Do not create redundant copies.
2342 if (ReachingDefCopyMap[DstReg].insert(RD).second) {
2343 MachineInstrBuilder VGPRCopy =
2344 BuildMI(*RD->getParent(), std::next(RD->getIterator()),
2345 RD->getDebugLoc(), TII->get(TargetOpcode::COPY))
2346 .addDef(MappedReg, {}, 0)
2347 .addUse(DstReg, {}, 0);
2348 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2349
2350 // If this reaching def was the last MI in the region, update the
2351 // region boundaries.
2352 DenseMap<MachineInstr *, unsigned>::iterator LMI =
2353 LastMIToRegion.find(RD);
2354 if (LMI != LastMIToRegion.end()) {
2355 unsigned UpdateRegion = LMI->second;
2356 DAG.Regions[UpdateRegion].second = VGPRCopy;
2357 LastMIToRegion.erase(RD);
2358 }
2359 }
2360 }
2361 }
2362
2363 DenseSet<MachineOperand *> &DstRegSet = ReplaceMap[DstReg];
2364 for (MachineOperand *RU : DstReachingUseCopies) {
2365 MachineBasicBlock *RUBlock = RU->getParent()->getParent();
2366 // Just keep track of the reaching use of this register by block. After we
2367 // have scanned all the MFMAs we can find optimal insert pts.
2368 if (RUBlock != MI->getParent()) {
2369 ReachingUseTracker[RUBlock->getNumber()][DstReg].insert(RU);
2370 continue;
2371 }
2372
2373 // Special case, the use is in the same block as the MFMA. Insert the copy
2374 // just before the use.
2375 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(DstReg);
2376 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2377 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2378 MachineInstr *UseInst = RU->getParent();
2379 MachineInstrBuilder VGPRCopy =
2380 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2381 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2382 .addDef(NewUseReg, {}, 0)
2383 .addUse(DstReg, {}, 0);
2384 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2385 // Since we know this use has only one reaching def, we can replace the
2386 // use reg.
2387 RU->setReg(NewUseReg);
2388 // Track the copy source operand for replacement.
2389 DstRegSet.insert(&VGPRCopy->getOperand(1));
2390 }
2391
2392 // Track the register for reclassification
2393 RewriteRegs.insert(DstReg);
2394
2395 // Insert the dst operand for replacement. If this dst is in a chain of
2396 // tied-def MFMAs, and the first src2 needs to be replaced with a new reg,
2397 // all the corresponding operands need to be replaced.
2398 DstRegSet.insert(Dst);
2399 }
2400
2401 // Handle the copies for dst uses.
2402 using RUBType =
2403 std::pair<unsigned, DenseMap<Register, SmallPtrSet<MachineOperand *, 8>>>;
2404 for (RUBType RUBlockEntry : ReachingUseTracker) {
2405 using RUDType = std::pair<Register, SmallPtrSet<MachineOperand *, 8>>;
2406 for (RUDType RUDst : RUBlockEntry.second) {
2407 MachineOperand *OpBegin = *RUDst.second.begin();
2408 SlotIndex InstPt = DAG.LIS->getInstructionIndex(*OpBegin->getParent());
2409
2410 // Find the earliest use in this block.
2411 for (MachineOperand *User : RUDst.second) {
2412 SlotIndex NewInstPt = DAG.LIS->getInstructionIndex(*User->getParent());
2413 if (SlotIndex::isEarlierInstr(NewInstPt, InstPt))
2414 InstPt = NewInstPt;
2415 }
2416
2417 const TargetRegisterClass *DstRC = DAG.MRI.getRegClass(RUDst.first);
2418 const TargetRegisterClass *VGPRRC = SRI->getEquivalentVGPRClass(DstRC);
2419 Register NewUseReg = DAG.MRI.createVirtualRegister(VGPRRC);
2420 MachineInstr *UseInst = DAG.LIS->getInstructionFromIndex(InstPt);
2421
2422 MachineInstrBuilder VGPRCopy =
2423 BuildMI(*UseInst->getParent(), UseInst->getIterator(),
2424 UseInst->getDebugLoc(), TII->get(TargetOpcode::COPY))
2425 .addDef(NewUseReg, {}, 0)
2426 .addUse(RUDst.first, {}, 0);
2427 DAG.LIS->InsertMachineInstrInMaps(*VGPRCopy);
2428
2429 // If this UseInst was the first MI in the region, update the region
2430 // boundaries.
2431 DenseMap<MachineInstr *, unsigned>::iterator FI =
2432 FirstMIToRegion.find(UseInst);
2433 if (FI != FirstMIToRegion.end()) {
2434 unsigned UpdateRegion = FI->second;
2435 DAG.Regions[UpdateRegion].first = VGPRCopy;
2436 FirstMIToRegion.erase(UseInst);
2437 }
2438
2439 // Replace the operand for all users.
2440 for (MachineOperand *User : RUDst.second) {
2441 User->setReg(NewUseReg);
2442 }
2443
2444 // Track the copy source operand for replacement.
2445 ReplaceMap[RUDst.first].insert(&VGPRCopy->getOperand(1));
2446 }
2447 }
2448
2449 // We may have needed to insert copies after the reaching defs of the MFMAs.
2450 // Replace the original register with the result of the copy for all relevant
2451 // operands.
2452 for (std::pair<Register, Register> NewDef : RedefMap) {
2453 Register OldReg = NewDef.first;
2454 Register NewReg = NewDef.second;
2455
2456 // Replace the register for any associated operand in the MFMA chain.
2457 for (MachineOperand *ReplaceOp : ReplaceMap[OldReg])
2458 ReplaceOp->setReg(NewReg);
2459 }
2460
2461 // Finally, do the reclassification of the MFMA registers.
2462 for (Register RewriteReg : RewriteRegs) {
2463 Register RegToRewrite = RewriteReg;
2464
2465 // Be sure to update the replacement register and not the original.
2466 DenseMap<Register, Register>::iterator RI = RedefMap.find(RewriteReg);
2467 if (RI != RedefMap.end())
2468 RegToRewrite = RI->second;
2469
2470 const TargetRegisterClass *CurrRC = DAG.MRI.getRegClass(RegToRewrite);
2471 const TargetRegisterClass *AGPRRC = SRI->getEquivalentAGPRClass(CurrRC);
2472
2473 DAG.MRI.setRegClass(RegToRewrite, AGPRRC);
2474 }
2475
2476 // Bulk update the LIS.
2477 DAG.LIS->reanalyze(DAG.MF);
2478 // Liveins may have been modified for cross RC copies
2479 RegionPressureMap LiveInUpdater(&DAG, false);
2480 LiveInUpdater.buildLiveRegMap();
2481
2482 for (unsigned Region = 0; Region < DAG.Regions.size(); Region++)
2483 DAG.LiveIns[Region] = LiveInUpdater.getLiveRegsForRegionIdx(Region);
2484
2485 DAG.Pressure[RegionIdx] = DAG.getRealRegPressure(RegionIdx);
2486
2487 return true;
2488}
2489
2490bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
2491 const Function &F = MF.getFunction();
2492
2493 // Maps optimizable regions (i.e., regions at minimum and register-limited
2494 // occupancy, or regions with spilling) to the target RP we would like to
2495 // reach.
2496 DenseMap<unsigned, GCNRPTarget> OptRegions;
2497 unsigned MaxSGPRs = ST.getMaxNumSGPRs(F);
2498 unsigned MaxVGPRs = ST.getMaxNumVGPRs(F);
2499 bool HasVectorRegisterExcess;
2500
2501 auto ResetTargetRegions = [&]() {
2502 OptRegions.clear();
2503 HasVectorRegisterExcess = false;
2504 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2505 const GCNRegPressure &RP = DAG.Pressure[I];
2506 GCNRPTarget Target(MaxSGPRs, MaxVGPRs, MF, RP);
2507 if (!Target.satisfied())
2508 OptRegions.insert({I, Target});
2509 // We are willing to consider a RP that does not satisfy the target
2510 // as long as it doesn't result in spilling to memory. We
2511 // tolerate SGPR spills to VGPR since these have a lower impact on
2512 // performance.
2513 // VGPR/AGPR excess could be alleviated by copying to an AGPR/VGPR.
2514 // Currently, we consider any spilling as bad (we should consider this
2515 // case in the future).
2516 HasVectorRegisterExcess |= Target.hasVectorRegisterExcess();
2517 }
2518 };
2519
2520 ResetTargetRegions();
2521 if (HasVectorRegisterExcess || DAG.MinOccupancy >= MFI.getMaxWavesPerEU()) {
2522 // In addition to register usage being above addressable limits, occupancy
2523 // below the minimum is considered like "spilling" as well.
2524 TargetOcc = std::nullopt;
2525 } else {
2526 // There is no spilling and room to improve occupancy; set up "increased
2527 // occupancy targets" for all regions.
2528 TargetOcc = DAG.MinOccupancy + 1;
2529 unsigned VGPRBlockSize =
2530 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
2531 MaxSGPRs = ST.getMaxNumSGPRs(*TargetOcc, false);
2532 MaxVGPRs = ST.getMaxNumVGPRs(*TargetOcc, VGPRBlockSize);
2533 ResetTargetRegions();
2534 }
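// For instance (illustrative numbers): with DAG.MinOccupancy == 4 and no
// vector-register excess, TargetOcc becomes 5 and the SGPR/VGPR budgets are
// recomputed for 5 waves, so every region whose pressure fits 4 waves but not
// 5 becomes an optimization target for rematerialization.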
2535 REMAT_DEBUG({
2536 dbgs() << "Analyzing ";
2537 MF.getFunction().printAsOperand(dbgs(), false);
2538 dbgs() << ": ";
2539 if (OptRegions.empty()) {
2540 dbgs() << "no objective to achieve, occupancy is maximal at "
2541 << MFI.getMaxWavesPerEU();
2542 } else if (!TargetOcc) {
2543 dbgs() << "reduce spilling (minimum target occupancy is "
2544 << MFI.getMinWavesPerEU() << ')';
2545 } else {
2546 dbgs() << "increase occupancy from " << DAG.MinOccupancy << " to "
2547 << TargetOcc;
2548 }
2549 dbgs() << '\n';
2550 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2551 if (auto OptIt = OptRegions.find(I); OptIt != OptRegions.end()) {
2552 dbgs() << REMAT_PREFIX << " [" << I << "] " << OptIt->getSecond()
2553 << '\n';
2554 }
2555 }
2556 });
2557 if (OptRegions.empty())
2558 return false;
2559
2560 // Accounts for a reduction in RP in an optimizable region. Returns whether we
2561 // estimate that we have identified enough rematerialization opportunities to
2562 // achieve our goal, and sets Progress to true when this particular reduction
2563 // in pressure was helpful toward that goal.
2564 auto ReduceRPInRegion = [&](auto OptIt, Register Reg, LaneBitmask Mask,
2565 bool &Progress) -> bool {
2566 GCNRPTarget &Target = OptIt->getSecond();
2567 if (!Target.isSaveBeneficial(Reg))
2568 return false;
2569 Progress = true;
2570 Target.saveReg(Reg, Mask, DAG.MRI);
2571 if (Target.satisfied())
2572 OptRegions.erase(OptIt->getFirst());
2573 return OptRegions.empty();
2574 };
2575
2576 // We need up-to-date live-out information to query live-out register masks in
2577 // regions containing rematerializable instructions.
2578 DAG.RegionLiveOuts.buildLiveRegMap();
2579
2580 // Cache set of registers that are going to be rematerialized.
2581 DenseSet<unsigned> RematRegs;
2582
2583 // Identify rematerializable instructions in the function.
2584 for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
2585 auto Region = DAG.Regions[I];
2586 for (auto MI = Region.first; MI != Region.second; ++MI) {
2587 // The instruction must be rematerializable.
2588 MachineInstr &DefMI = *MI;
2589 if (!isReMaterializable(DefMI))
2590 continue;
2591
2592 // We only support rematerializing virtual registers with one definition.
2593 Register Reg = DefMI.getOperand(0).getReg();
2594 if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
2595 continue;
2596
2597 // We only care to rematerialize the instruction if it has a single
2598 // non-debug user in a different region. The using MI may not belong to a
2599 // region if it is a lone region terminator.
2600 MachineInstr *UseMI = DAG.MRI.getOneNonDBGUser(Reg);
2601 if (!UseMI)
2602 continue;
2603 auto UseRegion = MIRegion.find(UseMI);
2604 if (UseRegion != MIRegion.end() && UseRegion->second == I)
2605 continue;
2606
2607 // Do not rematerialize an instruction if it uses or is used by an
2608 // instruction that we have designated for rematerialization.
2609 // FIXME: Allow for rematerialization chains: this requires 1. updating
2610 // remat points to account for uses that are rematerialized, and 2. either
2611 // rematerializing the candidates in careful ordering, or deferring the
2612 // MBB RP walk until the entire chain has been rematerialized.
2613 if (Rematerializations.contains(UseMI) ||
2614 llvm::any_of(DefMI.operands(), [&RematRegs](MachineOperand &MO) {
2615 return MO.isReg() && RematRegs.contains(MO.getReg());
2616 }))
2617 continue;
2618
2619 // Do not rematerialize an instruction if it uses registers that aren't
2620 // available at its use. This ensures that we are not extending any live
2621 // range while rematerializing.
2622 SlotIndex UseIdx = DAG.LIS->getInstructionIndex(*UseMI).getRegSlot(true);
2623 if (!VirtRegAuxInfo::allUsesAvailableAt(&DefMI, UseIdx, *DAG.LIS, DAG.MRI,
2624 *DAG.TII))
2625 continue;
2626
2627 REMAT_DEBUG(dbgs() << "Region " << I << ": remat instruction " << DefMI);
2628 RematInstruction &Remat =
2629 Rematerializations.try_emplace(&DefMI, UseMI).first->second;
2630
2631 bool RematUseful = false;
2632 if (auto It = OptRegions.find(I); It != OptRegions.end()) {
2633 // Optimistically consider that moving the instruction out of its
2634 // defining region will reduce RP in the latter; this assumes that
2635 // maximum RP in the region is reached somewhere between the defining
2636 // instruction and the end of the region.
2637 REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
2638 LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
2639 if (ReduceRPInRegion(It, Reg, Mask, RematUseful))
2640 return true;
2641 }
2642
2643 for (unsigned LIRegion = 0; LIRegion != E; ++LIRegion) {
2644 // We are only collecting regions in which the register is a live-in
2645 // (and may be live-through).
2646 auto It = DAG.LiveIns[LIRegion].find(Reg);
2647 if (It == DAG.LiveIns[LIRegion].end() || It->second.none())
2648 continue;
2649 Remat.LiveInRegions.insert(LIRegion);
2650
2651 // Account for the reduction in RP due to the rematerialization in an
2652 // optimizable region in which the defined register is a live-in. This
2653 // is exact for live-through regions but optimistic in the using region,
2654 // where RP is actually reduced only if maximum RP is reached somewhere
2655 // between the beginning of the region and the rematerializable
2656 // instruction's use.
2657 if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
2658 REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
2659 if (ReduceRPInRegion(It, Reg, DAG.LiveIns[LIRegion][Reg],
2660 RematUseful))
2661 return true;
2662 }
2663 }
2664
2665 // If the instruction is not a live-in or live-out in any optimizable
2666 // region then there is no point in rematerializing it.
2667 if (!RematUseful) {
2668 Rematerializations.pop_back();
2669 REMAT_DEBUG(dbgs() << " No impact, not rematerializing instruction\n");
2670 } else {
2671 RematRegs.insert(Reg);
2672 }
2673 }
2674 }
2675
2676 if (TargetOcc) {
2677 // We were trying to increase occupancy but failed, abort the stage.
2678 REMAT_DEBUG(dbgs() << "Cannot increase occupancy\n");
2679 Rematerializations.clear();
2680 return false;
2681 }
2682 REMAT_DEBUG(dbgs() << "Can reduce but not eliminate spilling\n");
2683 return !Rematerializations.empty();
2684}
2685
2686void PreRARematStage::rematerialize() {
2687 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2688
2689 // Collect regions whose RP changes in unpredictable way; we will have to
2690 // fully recompute their RP after all rematerializations.
2691 DenseSet<unsigned> RecomputeRP;
2692
2693 // Rematerialize all instructions.
2694 for (auto &[DefMI, Remat] : Rematerializations) {
2695 MachineBasicBlock::iterator InsertPos(Remat.UseMI);
2696 Register Reg = DefMI->getOperand(0).getReg();
2697 unsigned DefRegion = MIRegion.at(DefMI);
2698
2699 // Rematerialize DefMI to its use block.
2700 TII->reMaterialize(*InsertPos->getParent(), InsertPos, Reg,
2701 AMDGPU::NoSubRegister, *DefMI);
2702 Remat.RematMI = &*std::prev(InsertPos);
2703 DAG.LIS->InsertMachineInstrInMaps(*Remat.RematMI);
2704
2705 // Update region boundaries in regions we sank from (remove defining MI)
2706 // and to (insert MI rematerialized in use block). Only then can we erase
2707 // the original MI.
2708 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], DefMI, nullptr);
2709 auto UseRegion = MIRegion.find(Remat.UseMI);
2710 if (UseRegion != MIRegion.end()) {
2711 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], InsertPos,
2712 Remat.RematMI);
2713 }
2714 DAG.LIS->RemoveMachineInstrFromMaps(*DefMI);
2715 DefMI->eraseFromParent();
2716
2717 // Collect all regions impacted by the rematerialization and update their
2718 // live-in/RP information.
2719 for (unsigned I : Remat.LiveInRegions) {
2720 ImpactedRegions.insert({I, DAG.Pressure[I]});
2721 GCNRPTracker::LiveRegSet &RegionLiveIns = DAG.LiveIns[I];
2722
2723#ifdef EXPENSIVE_CHECKS
2724 // All uses are known to be available / live at the remat point. Thus, the
2725 // uses should already be live in to the region.
2726 for (MachineOperand &MO : DefMI->operands()) {
2727 if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
2728 continue;
2729
2730 Register UseReg = MO.getReg();
2731 if (!UseReg.isVirtual())
2732 continue;
2733
2734 LiveInterval &LI = DAG.LIS->getInterval(UseReg);
2735 LaneBitmask LM = DAG.MRI.getMaxLaneMaskForVReg(MO.getReg());
2736 if (LI.hasSubRanges() && MO.getSubReg())
2737 LM = DAG.TRI->getSubRegIndexLaneMask(MO.getSubReg());
2738
2739 LaneBitmask LiveInMask = RegionLiveIns.at(UseReg);
2740 LaneBitmask UncoveredLanes = LM & ~(LiveInMask & LM);
2741 // If this register has lanes not covered by the LiveIns, be sure they
2742 // do not map to any subrange. ref:
2743 // machine-scheduler-sink-trivial-remats.mir::omitted_subrange
2744 if (UncoveredLanes.any()) {
2745 assert(LI.hasSubRanges());
2746 for (LiveInterval::SubRange &SR : LI.subranges())
2747 assert((SR.LaneMask & UncoveredLanes).none());
2748 }
2749 }
2750#endif
2751
2752 // The register is no longer a live-in in all regions but the one that
2753 // contains the single use. In live-through regions, maximum register
2754 // pressure decreases predictably so we can directly update it. In the
2755 // using region, maximum RP may or may not decrease, so we will mark it
2756 // for re-computation after all materializations have taken place.
2757 LaneBitmask PrevMask = RegionLiveIns[Reg];
2758 RegionLiveIns.erase(Reg);
2759 RegMasks.insert({{I, Remat.RematMI->getOperand(0).getReg()}, PrevMask});
2760 if (Remat.UseMI->getParent() != DAG.Regions[I].first->getParent())
2761 DAG.Pressure[I].inc(Reg, PrevMask, LaneBitmask::getNone(), DAG.MRI);
2762 else
2763 RecomputeRP.insert(I);
2764 }
2765 // RP in the region from which the instruction was rematerialized may or may
2766 // not decrease.
2767 ImpactedRegions.insert({DefRegion, DAG.Pressure[DefRegion]});
2768 RecomputeRP.insert(DefRegion);
2769
2770 // Recompute live interval to reflect the register's rematerialization.
2771 Register RematReg = Remat.RematMI->getOperand(0).getReg();
2772 DAG.LIS->removeInterval(RematReg);
2773 DAG.LIS->createAndComputeVirtRegInterval(RematReg);
2774 }
2775
2776 // All regions impacted by at least one rematerialization must be rescheduled.
2777 // Maximum pressure must also be recomputed for all regions where it changed
2778 // non-predictably and checked against the target occupancy.
2779 unsigned DynamicVGPRBlockSize =
2780 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
2781 AchievedOcc = MFI.getMaxWavesPerEU();
2782 for (auto &[I, OriginalRP] : ImpactedRegions) {
2783 bool IsEmptyRegion = DAG.Regions[I].first == DAG.Regions[I].second;
2784 RescheduleRegions[I] = !IsEmptyRegion;
2785 if (!RecomputeRP.contains(I))
2786 continue;
2787
2788 GCNRegPressure RP;
2789 if (IsEmptyRegion) {
2790 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2791 } else {
2792 GCNDownwardRPTracker RPT(*DAG.LIS);
2793 auto *NonDbgMI = &*skipDebugInstructionsForward(DAG.Regions[I].first,
2794 DAG.Regions[I].second);
2795 if (NonDbgMI == DAG.Regions[I].second) {
2796 // Region is non-empty but contains only debug instructions.
2797 RP = getRegPressure(DAG.MRI, DAG.LiveIns[I]);
2798 } else {
2799 RPT.reset(*NonDbgMI, &DAG.LiveIns[I]);
2800 RPT.advance(DAG.Regions[I].second);
2801 RP = RPT.moveMaxPressure();
2802 }
2803 }
2804 DAG.Pressure[I] = RP;
2805 AchievedOcc =
2806 std::min(AchievedOcc, RP.getOccupancy(ST, DynamicVGPRBlockSize));
2807 }
2808 REMAT_DEBUG(dbgs() << "Achieved occupancy " << AchievedOcc << "\n");
2809}
2810
2811// Copied from MachineLICM
2812bool PreRARematStage::isReMaterializable(const MachineInstr &MI) {
2813 if (!DAG.TII->isReMaterializable(MI))
2814 return false;
2815
2816 for (const MachineOperand &MO : MI.all_uses()) {
2817 // We can't remat physreg uses, unless it is a constant or an ignorable
2818 // use (e.g. implicit exec use on VALU instructions)
2819 if (MO.getReg().isPhysical()) {
2820 if (DAG.MRI.isConstantPhysReg(MO.getReg()) || DAG.TII->isIgnorableUse(MO))
2821 continue;
2822 return false;
2823 }
2824 }
2825
2826 return true;
2827}
2828
2829void PreRARematStage::finalizeGCNSchedStage() {
2830 // We consider that reducing spilling is always beneficial so we never
2831 // rollback rematerializations in such cases. It's also possible that
2832 // rescheduling lowers occupancy over the one achieved just through remats, in
2833 // which case we do not want to rollback either (the rescheduling was already
2834 // reverted in PreRARematStage::shouldRevertScheduling in such cases).
2835 unsigned MaxOcc = std::max(AchievedOcc, DAG.MinOccupancy);
2836 if (!TargetOcc || MaxOcc >= *TargetOcc)
2837 return;
2838
2839 REMAT_DEBUG(dbgs() << "Rolling back all rematerializations\n");
2840 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2841
2842 // Rollback the rematerializations.
2843 for (const auto &[DefMI, Remat] : Rematerializations) {
2844 MachineInstr &RematMI = *Remat.RematMI;
2845 unsigned DefRegion = MIRegion.at(DefMI);
2846 MachineBasicBlock::iterator InsertPos(DAG.Regions[DefRegion].second);
2847 MachineBasicBlock *MBB = RegionBB[DefRegion];
2848 Register Reg = RematMI.getOperand(0).getReg();
2849
2850 // Re-rematerialize MI at the end of its original region. Note that it may
2851 // not be rematerialized exactly in the same position as originally within
2852 // the region, but it should not matter much.
2853 TII->reMaterialize(*MBB, InsertPos, Reg, AMDGPU::NoSubRegister, RematMI);
2854 MachineInstr *NewMI = &*std::prev(InsertPos);
2855 DAG.LIS->InsertMachineInstrInMaps(*NewMI);
2856
2857 auto UseRegion = MIRegion.find(Remat.UseMI);
2858 if (UseRegion != MIRegion.end()) {
2859 DAG.updateRegionBoundaries(DAG.Regions[UseRegion->second], RematMI,
2860 nullptr);
2861 }
2862 DAG.updateRegionBoundaries(DAG.Regions[DefRegion], InsertPos, NewMI);
2863
2864 // Erase rematerialized MI.
2865 DAG.LIS->RemoveMachineInstrFromMaps(RematMI);
2866 RematMI.eraseFromParent();
2867
2868 // Recompute live interval for the re-rematerialized register
2869 DAG.LIS->removeInterval(Reg);
2870 DAG.LIS->createAndComputeVirtRegInterval(Reg);
2871
2872 // Re-add the register as a live-in in all regions it used to be one in.
2873 for (unsigned LIRegion : Remat.LiveInRegions)
2874 DAG.LiveIns[LIRegion].insert({Reg, RegMasks.at({LIRegion, Reg})});
2875 }
2876
2877 // Reset RP in all impacted regions.
2878 for (auto &[I, OriginalRP] : ImpactedRegions)
2879 DAG.Pressure[I] = OriginalRP;
2880
2881 GCNSchedStage::finalizeGCNSchedStage();
2882}
2883
2884void GCNScheduleDAGMILive::updateRegionBoundaries(
2885 RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI,
2886 MachineInstr *NewMI) {
2887 assert((!NewMI || NewMI != RegionBounds.second) &&
2888 "cannot remove at region end");
2889
2890 if (RegionBounds.first == RegionBounds.second) {
2891 assert(NewMI && "cannot remove from an empty region");
2892 RegionBounds.first = NewMI;
2893 return;
2894 }
2895
2896 // We only care for modifications at the beginning of a non-empty region since
2897 // the upper region boundary is exclusive.
2898 if (MI != RegionBounds.first)
2899 return;
2900 if (!NewMI)
2901 RegionBounds.first = std::next(MI); // Removal
2902 else
2903 RegionBounds.first = NewMI; // Insertion
2904}
2905
2906static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
2907 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
2908 return any_of(*DAG, [SII](MachineBasicBlock::iterator MI) {
2909 return SII->isIGLPMutationOnly(MI->getOpcode());
2910 });
2911}
2912
2913GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
2914 MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
2915 bool RemoveKillFlags)
2916 : ScheduleDAGMI(C, std::move(S), RemoveKillFlags) {}
2917
2918void GCNPostScheduleDAGMILive::schedule() {
2919 HasIGLPInstrs = hasIGLPInstrs(this);
2920 if (HasIGLPInstrs) {
2921 SavedMutations.clear();
2922 SavedMutations.swap(Mutations);
2923 addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
2924 }
2925
2926 ScheduleDAGMI::schedule();
2927}
2928
2929void GCNPostScheduleDAGMILive::finalizeSchedule() {
2930 if (HasIGLPInstrs)
2931 SavedMutations.swap(Mutations);
2932
2933 ScheduleDAGMI::finalizeSchedule();
2934}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the GCNRegPressure class, which tracks registry pressure by bookkeeping number of S...
static cl::opt< bool > GCNTrackers("amdgpu-use-amdgpu-trackers", cl::Hidden, cl::desc("Use the AMDGPU specific RPTrackers during scheduling"), cl::init(false))
static cl::opt< bool > DisableClusteredLowOccupancy("amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden, cl::desc("Disable clustered low occupancy " "rescheduling for ILP scheduling stage."), cl::init(false))
#define REMAT_PREFIX
Allows to easily filter for this stage's debug output.
static MachineInstr * getLastMIForRegion(MachineBasicBlock::iterator RegionBegin, MachineBasicBlock::iterator RegionEnd)
static bool shouldCheckPending(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< bool > RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden, cl::desc("Relax occupancy targets for kernels which are memory " "bound (amdgpu-membound-threshold), or " "Wave Limited (amdgpu-limit-wave-threshold)."), cl::init(false))
#define REMAT_DEBUG(X)
static cl::opt< bool > DisableUnclusterHighRP("amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden, cl::desc("Disable unclustered high register pressure " "reduction scheduling stage."), cl::init(false))
static void printScheduleModel(std::set< std::pair< MachineInstr *, unsigned >, EarlierIssuingCycle > &ReadyCycles)
static cl::opt< bool > PrintMaxRPRegUsageAfterScheduler("amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure after scheduling."), cl::init(false))
static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG)
static bool canUsePressureDiffs(const SUnit &SU)
Checks whether SU can use the cached DAG pressure diffs to compute the current register pressure.
static cl::opt< unsigned > PendingQueueLimit("amdgpu-scheduler-pending-queue-limit", cl::Hidden, cl::desc("Max (Available+Pending) size to inspect pending queue (0 disables)"), cl::init(256))
static void getRegisterPressures(bool AtTop, const RegPressureTracker &RPTracker, SUnit *SU, std::vector< unsigned > &Pressure, std::vector< unsigned > &MaxPressure, GCNDownwardRPTracker &DownwardTracker, GCNUpwardRPTracker &UpwardTracker, ScheduleDAGMI *DAG, const SIRegisterInfo *SRI)
static cl::opt< bool > PrintMaxRPRegUsageBeforeScheduler("amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden, cl::desc("Print a list of live registers along with their def/uses at the " "point of maximum register pressure before scheduling."), cl::init(false))
static SUnit * pickOnlyChoice(SchedBoundary &Zone, const TargetSchedModel *SchedModel)
static cl::opt< unsigned > ScheduleMetricBias("amdgpu-schedule-metric-bias", cl::Hidden, cl::desc("Sets the bias which adds weight to occupancy vs latency. Set it to " "100 to chase the occupancy only."), cl::init(10))
static cl::opt< bool > DisableRewriteMFMAFormSchedStage("amdgpu-disable-rewrite-mfma-form-sched-stage", cl::Hidden, cl::desc("Disable rewrie mfma rewrite scheduling stage"), cl::init(true))
static Register UseReg(const MachineOperand &MO)
const HexagonInstrInfo * TII
static constexpr std::pair< StringLiteral, StringLiteral > ReplaceMap[]
IRTranslator LLVM IR MI
A common definition of LaneBitmask for use in TableGen and CodeGen.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
if(PassOpts->AAPipeline)
This file contains some templates that are useful if you are working with the STL at all.
#define LLVM_DEBUG(...)
Definition Debug.h:114
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
uint64_t getFrequency() const
Returns the frequency as a fixpoint number scaled by the entry frequency.
bool shouldRevertScheduling(unsigned WavesAfter) override
ValueT & at(const_arg_type_t< KeyT > Val)
at - Return the entry for the specified key, or abort if no such entry exists.
Definition DenseMap.h:224
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
bool erase(const KeyT &Val)
Definition DenseMap.h:330
DenseMapIterator< KeyT, ValueT, KeyInfoT, BucketT > iterator
Definition DenseMap.h:74
bool empty() const
Definition DenseMap.h:109
iterator end()
Definition DenseMap.h:81
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:169
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
GCNRegPressure bumpDownwardPressure(const MachineInstr *MI, const SIRegisterInfo *TRI) const
Mostly copy/paste from CodeGen/RegisterPressure.cpp Calculate the impact MI will have on CurPressure ...
GCNMaxILPSchedStrategy(const MachineSchedContext *C)
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
Apply a set of heuristics to a new candidate.
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const override
GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as much as possible.
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C)
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C, bool IsLegacyScheduler=false)
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNPostScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
GCNRegPressure getPressure() const
DenseMap< unsigned, LaneBitmask > LiveRegSet
GCNSchedStrategy & S
GCNRegPressure PressureBefore
bool isRegionWithExcessRP() const
bool mayCauseSpilling(unsigned WavesAfter)
ScheduleMetrics getScheduleMetrics(const std::vector< SUnit > &InputSchedule)
GCNScheduleDAGMILive & DAG
const GCNSchedStageID StageID
std::vector< MachineInstr * > Unsched
GCNRegPressure PressureAfter
MachineFunction & MF
virtual void finalizeGCNRegion()
SIMachineFunctionInfo & MFI
unsigned computeSUnitReadyCycle(const SUnit &SU, unsigned CurrCycle, DenseMap< unsigned, unsigned > &ReadyCycles, const TargetSchedModel &SM)
virtual void finalizeGCNSchedStage()
virtual bool initGCNSchedStage()
virtual bool shouldRevertScheduling(unsigned WavesAfter)
std::vector< std::unique_ptr< ScheduleDAGMutation > > SavedMutations
GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
void modifyRegionSchedule(unsigned RegionIdx, MachineBasicBlock *MBB, ArrayRef< MachineInstr * > MIOrder)
Sets the schedule of region RegionIdx in block MBB to MIOrder.
MachineBasicBlock * CurrentMBB
const GCNSubtarget & ST
This is a minimal scheduler strategy.
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
std::vector< unsigned > MaxPressure
SUnit * pickNodeBidirectional(bool &IsTopNode, bool &PickedPending)
GCNSchedStageID getCurrentStage()
bool tryPendingCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Evaluates instructions in the pending queue using a subset of scheduling heuristics.
SmallVectorImpl< GCNSchedStageID >::iterator CurrentStage
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
GCNDownwardRPTracker * getDownwardTracker()
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
GCNUpwardRPTracker UpwardTracker
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &IsPending, bool IsBottomUp)
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
GCNUpwardRPTracker * getUpwardTracker()
GCNSchedStageID getNextStage() const
void finalizeSchedule() override
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
void schedule() override
Orders nodes according to selected style.
GCNScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
void recede(const MachineInstr &MI)
Move to the state of RP just before the MI .
void traceCandidate(const SchedCandidate &Cand)
LLVM_ABI void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone, SchedBoundary *OtherZone)
Set the CandPolicy given a scheduling zone given the current resources and latencies inside and outsi...
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
const MachineSchedContext * Context
const TargetRegisterInfo * TRI
SchedCandidate BotCand
Candidate last picked from Bot boundary.
SchedCandidate TopCand
Candidate last picked from Top boundary.
virtual bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone) const
Apply a set of heuristics to a new candidate.
ScheduleDAGMILive * DAG
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Update the scheduler's state after scheduling a node.
GenericScheduler(const MachineSchedContext *C)
bool shouldRevertScheduling(unsigned WavesAfter) override
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
SlotIndex getMBBEndIdx(const MachineBasicBlock *mbb) const
Return the last index in the given basic block.
LiveInterval & getInterval(Register Reg)
LLVM_ABI void dump() const
MachineBasicBlock * getMBBFromIndex(SlotIndex index) const
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
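A short, hedged sketch of the query pattern these LiveIntervals entries support; MI (a MachineInstr), Reg (a virtual register), and LIS are assumed to be in scope:
  SlotIndex Idx = LIS.getInstructionIndex(MI);
  LiveInterval &LI = LIS.getInterval(Reg);
  if (const VNInfo *VNI = LI.getVNInfoAt(Idx)) {
    if (VNI->isPHIDef()) {
      // The live value is merged at the top of the block rather than defined
      // by a real instruction; VNI->def is the block-entry SlotIndex.
    }
  }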
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const
getBlockFreq - Return block frequency.
LLVM_ABI BlockFrequency getEntryFreq() const
Divide a block's BlockFrequency::getFrequency() value by this value to obtain the entry block - relat...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
const MachineBasicBlock * getParent() const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
mop_range operands()
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool readsReg() const
readsReg - Returns true if this operand reads the previous value of its register.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Register getReg() const
getReg - Returns the register number.
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool shouldRevertScheduling(unsigned WavesAfter) override
bool initGCNSchedStage() override
Capture a change in pressure for a single pressure set.
Helpers for implementing custom MachineSchedStrategy classes.
unsigned size() const
Track the current register pressure at some position in the instruction stream, and remember the high...
LLVM_ABI void advance()
Advance across the current instruction.
LLVM_ABI void getDownwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction top-down.
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
LLVM_ABI void getUpwardPressure(const MachineInstr *MI, std::vector< unsigned > &PressureResult, std::vector< unsigned > &MaxPressureResult)
Get the pressure of each PSet after traversing this instruction bottom-up.
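A hedged sketch of how a strategy typically probes a candidate with these two calls (it mirrors the shape of initCandidate above; SU, AtTop, and a non-const RPTracker are assumed to be in scope, since both calls temporarily modify the tracker):
  std::vector<unsigned> Pressure, MaxPressure;
  if (AtTop)   // top-down zone: pressure after issuing SU at the top boundary
    RPTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
  else         // bottom-up zone: pressure just above SU at the bottom boundary
    RPTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
  unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
  unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];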
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
List of registers defined and used by a machine instruction.
LLVM_ABI void collect(const MachineInstr &MI, const TargetRegisterInfo &TRI, const MachineRegisterInfo &MRI, bool TrackLaneMasks, bool IgnoreDead)
Analyze the given instruction MI and fill in the Uses, Defs and DeadDefs list based on the MachineOpe...
LLVM_ABI void adjustLaneLiveness(const LiveIntervals &LIS, const MachineRegisterInfo &MRI, SlotIndex Pos, MachineInstr *AddFlagsMI=nullptr)
Use liveness information to find out which uses/defs are partially undefined/dead and adjust the VReg...
LLVM_ABI void detectDeadDefs(const MachineInstr &MI, const LiveIntervals &LIS)
Use liveness information to find dead defs not marked with a dead flag and move them to the DeadDefs ...
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
bool isIGLPMutationOnly(unsigned Opcode) const
static bool isMAI(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Scheduling unit. This is a node in the scheduling DAG.
bool isInstr() const
Returns true if this SUnit refers to a machine instruction as opposed to an SDNode.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned short Latency
Node latency.
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
unsigned BotReadyCycle
Cycle relative to end when node is ready.
bool isBottomReady() const
bool isTopReady() const
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI void releasePending()
Release pending ready nodes into the available queue.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
LLVM_ABI void bumpCycle(unsigned NextCycle)
Move the boundary of scheduled code by one cycle.
unsigned getCurrMOps() const
Micro-ops issued in the current cycle.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
LLVM_ABI bool checkHazard(SUnit *SU)
Does this SU have a hazard within the current instruction group.
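As a hedged illustration of how these boundary queries combine, a hypothetical helper (name and policy invented for this page) that prefers a hazard-free candidate with no operand stall:
  #include "llvm/CodeGen/MachineScheduler.h"
  using namespace llvm;

  static SUnit *pickHazardFree(SchedBoundary &Zone) {
    if (SUnit *Only = Zone.pickOnlyChoice())    // single candidate: no heuristics needed
      return Only;
    SUnit *Fallback = nullptr;
    for (SUnit *SU : Zone.Available) {
      if (Zone.checkHazard(SU))                 // would violate the current issue group
        continue;
      if (Zone.getLatencyStallCycles(SU) == 0)  // operands already available this cycle
        return SU;
      if (!Fallback)
        Fallback = SU;                          // hazard-free but stalled: keep as backup
    }
    return Fallback;                            // may be nullptr if everything hazards
  }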
A ScheduleDAG for scheduling lists of MachineInstr.
bool ScheduleSingleMIRegions
True if regions with a single MI should be scheduled.
MachineBasicBlock::iterator RegionEnd
The end of the range to be scheduled.
virtual void finalizeSchedule()
Allow targets to perform final scheduling actions at the level of the whole MachineFunction.
virtual void exitRegion()
Called when the scheduler has finished scheduling the current region.
const MachineLoopInfo * MLI
bool RemoveKillFlags
True if the DAG builder should remove kill flags (in preparation for rescheduling).
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMILive(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S)
RegPressureTracker RPTracker
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
void addMutation(std::unique_ptr< ScheduleDAGMutation > Mutation)
Add a postprocessing step to the DAG builder.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr< MachineSchedStrategy > S, bool RemoveKillFlags)
std::vector< std::unique_ptr< ScheduleDAGMutation > > Mutations
Ordered list of DAG postprocessing steps.
MachineRegisterInfo & MRI
Virtual/real register map.
const TargetInstrInfo * TII
Target instruction information.
MachineFunction & MF
Machine function.
static const unsigned ScaleFactor
unsigned getMetric() const
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
static bool isSameInstr(SlotIndex A, SlotIndex B)
isSameInstr - Return true if A and B refer to the same instruction.
static bool isEarlierInstr(SlotIndex A, SlotIndex B)
isEarlierInstr - Return true if A refers to an instruction earlier than B.
SlotIndex getPrevSlot() const
Returns the previous slot in the index list.
SlotIndex getMBBStartIdx(unsigned Num) const
Returns the first index in the given basic block number.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
uint8_t getCopyCost() const
Return the cost of copying a value between two registers in this class.
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI bool hasInstrSchedModel() const
Return true if this machine model includes an instruction-level scheduling model.
unsigned getMicroOpBufferSize() const
Number of micro-ops that may be buffered for OOO execution.
bool shouldRevertScheduling(unsigned WavesAfter) override
VNInfo - Value Number Information.
SlotIndex def
The index of the defining instruction.
bool isPHIDef() const
Returns true if this value is defined by a PHI instruction (or was, PHI instructions may have been el...
static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, const LiveIntervals &LIS, const MachineRegisterInfo &MRI, const TargetInstrInfo &TII)
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAllocatedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READONLY int getMFMASrcCVDstAGPROp(uint16_t Opcode)
unsigned getDynamicVGPRBlockSize(const Function &F)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI, Range &&LiveRegs)
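A hedged one-off usage sketch that combines this helper with getLiveRegsBefore (listed further down) to inspect pressure at a single program point; MI, MRI, and LIS are assumed to be in scope:
  GCNRegPressure RP = getRegPressure(MRI, getLiveRegsBefore(MI, LIS));
  unsigned SGPRs = RP.getSGPRNum();       // accessors listed near the end of this page
  unsigned VGPRs = RP.getArchVGPRNum();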
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifies whether or not this is a reentry into the IGroupLPDAGMutation.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
std::pair< MachineBasicBlock::iterator, MachineBasicBlock::iterator > RegionBoundaries
A region's boundaries, i.e. a pair of iterators marking the first instruction of the region and the end of its range.
IterT skipDebugInstructionsForward(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It until it points to a non-debug instruction or to End and return the resulting iterator.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
LLVM_ABI cl::opt< bool > VerifyScheduling
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
IterT skipDebugInstructionsBackward(IterT It, IterT Begin, bool SkipPseudoOp=true)
Decrement It until it points to a non-debug instruction or to Begin and return the resulting iterator...
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster IDs are the same and valid.
DWARFExpression::Operation Op
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915
DenseMap< MachineInstr *, GCNRPTracker::LiveRegSet > getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS)
Creates a map MachineInstr -> LiveRegSet. R - range of iterators on instructions; After - upon entry or...
GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI, const LiveIntervals &LIS)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
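These helpers are meant to be chained inside a tryCandidate() override. The sketch below shows the usual pattern for a hypothetical GenericScheduler subclass (MyStrategy is invented; field and enum names follow GenericSchedulerBase):
  bool MyStrategy::tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
                                SchedBoundary *Zone) const {
    if (!Cand.isValid()) {            // the first candidate always wins
      TryCand.Reason = NodeOrder;
      return true;
    }
    // Keep register pressure below the excess limits before anything else.
    if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
                    RegExcess, TRI, DAG->MF))
      return TryCand.Reason != NoCand;
    // Prefer nodes tied to physical registers so their live ranges stay short.
    if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
                   biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
      return TryCand.Reason != NoCand;
    // Finally, fall back to latency within the active scheduling zone.
    if (Zone && tryLatency(TryCand, Cand, *Zone))
      return TryCand.Reason != NoCand;
    return false;                     // no heuristic decided; keep Cand
  }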
LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF, GCNRegPressure::RegKind Kind, LiveIntervals &LIS, const MachineLoopInfo *MLI)
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop)
Minimize physical register live ranges.
Implement std::hash so that hash_code can be used in STL containers.
Definition BitVector.h:870
bool operator()(std::pair< MachineInstr *, unsigned > A, std::pair< MachineInstr *, unsigned > B) const
unsigned getArchVGPRNum() const
unsigned getAGPRNum() const
unsigned getSGPRNum() const
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
void reset(const CandPolicy &NewPolicy)
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
constexpr bool any() const
Definition LaneBitmask.h:53
static constexpr LaneBitmask getNone()
Definition LaneBitmask.h:81
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...