LLVM 23.0.0git
AMDGPUCoExecSchedStrategy.cpp
Go to the documentation of this file.
1//===- AMDGPUCoExecSchedStrategy.cpp - CoExec Scheduling Strategy ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
15#include "llvm/Support/Debug.h"
16
17using namespace llvm;
18using namespace llvm::AMDGPU;
19
20#define DEBUG_TYPE "machine-scheduler"
21
namespace {

/// A do-nothing post-RA scheduling DAG.
///
/// Used to disable post-RA scheduling with function level granularity: the
/// scheduler pass still runs, but schedule() leaves the instruction order
/// untouched.
class GCNNoopPostScheduleDAG final : public ScheduleDAGInstrs {
public:
  explicit GCNNoopPostScheduleDAG(MachineSchedContext *C)
      : ScheduleDAGInstrs(*C->MF, C->MLI, /*RemoveKillFlags=*/true) {}

  // Intentionally do nothing: keep the existing instruction order.
  void schedule() override {}
};

} // namespace
35
37 // pickOnlyChoice() releases pending instructions and checks for new hazards.
38 SUnit *OnlyChoice = Zone.pickOnlyChoice();
39 if (!Zone.Pending.empty())
40 return nullptr;
41
42 return OnlyChoice;
43}
44
46 const SIInstrInfo &SII) {
47 if (MI.isDebugInstr())
49
50 unsigned Opc = MI.getOpcode();
51
52 // Check for specific opcodes first.
53 if (Opc == AMDGPU::ATOMIC_FENCE || Opc == AMDGPU::S_WAIT_ASYNCCNT ||
54 Opc == AMDGPU::S_WAIT_TENSORCNT || Opc == AMDGPU::S_BARRIER_WAIT ||
55 Opc == AMDGPU::S_BARRIER_SIGNAL_IMM)
57
58 if (SII.isLDSDMA(MI))
60
61 if (SII.isMFMAorWMMA(MI))
63
64 if (SII.isTRANS(MI))
66
67 if (SII.isVALU(MI))
69
70 if (SII.isDS(MI))
72
73 if (SII.isFLAT(MI) || SII.isFLATGlobal(MI) || SII.isFLATScratch(MI))
75
76 if (SII.isSALU(MI))
78
80}
81
83 for (auto *PrioritySU : PrioritySUs) {
84 if (!PrioritySU->isTopReady())
85 return PrioritySU;
86 }
87
88 if (!LookDeep)
89 return nullptr;
90
91 unsigned MinDepth = std::numeric_limits<unsigned int>::max();
92 SUnit *TargetSU = nullptr;
93 for (auto *SU : AllSUs) {
94 if (SU->isScheduled)
95 continue;
96
97 if (SU->isTopReady())
98 continue;
99
100 if (SU->getDepth() < MinDepth) {
101 MinDepth = SU->getDepth();
102 TargetSU = SU;
103 }
104 }
105 return TargetSU;
106}
107
108void HardwareUnitInfo::insert(SUnit *SU, unsigned BlockingCycles) {
109#ifndef NDEBUG
110 bool Inserted = AllSUs.insert(SU);
111 assert(Inserted);
112#else
113 AllSUs.insert(SU);
114#endif
115
116 TotalCycles += BlockingCycles;
117
118 if (PrioritySUs.empty()) {
119 PrioritySUs.insert(SU);
120 return;
121 }
122 unsigned SUDepth = SU->getDepth();
123 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
124 if (SUDepth > CurrDepth)
125 return;
126
127 if (SUDepth == CurrDepth) {
128 PrioritySUs.insert(SU);
129 return;
130 }
131
132 // SU is lower depth and should be prioritized.
133 PrioritySUs.clear();
134 PrioritySUs.insert(SU);
135}
136
137void HardwareUnitInfo::markScheduled(SUnit *SU, unsigned BlockingCycles) {
138 // We may want to ignore some HWUIs (e.g. InstructionFlavor::Other). To do so,
139 // we just clear the HWUI. However, we still have instructions which map to
140 // this HWUI. Don't bother managing the state for these HWUI.
141 if (TotalCycles == 0)
142 return;
143
144 AllSUs.remove(SU);
145 PrioritySUs.remove(SU);
146
147 TotalCycles -= BlockingCycles;
148
149 if (AllSUs.empty())
150 return;
151 if (PrioritySUs.empty()) {
152 for (auto SU : AllSUs) {
153 if (PrioritySUs.empty()) {
154 PrioritySUs.insert(SU);
155 continue;
156 }
157 unsigned SUDepth = SU->getDepth();
158 unsigned CurrDepth = (*PrioritySUs.begin())->getDepth();
159 if (SUDepth > CurrDepth)
160 continue;
161
162 if (SUDepth == CurrDepth) {
163 PrioritySUs.insert(SU);
164 continue;
165 }
166
167 // SU is lower depth and should be prioritized.
168 PrioritySUs.clear();
169 PrioritySUs.insert(SU);
170 }
171 }
172}
173
176 for (auto &HWUICand : HWUInfo) {
177 if (HWUICand.getType() == Flavor) {
178 return &HWUICand;
179 }
180 }
181 return nullptr;
182}
183
185 assert(SchedModel && SchedModel->hasInstrSchedModel());
186 unsigned ReleaseAtCycle = 0;
187 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
188 for (TargetSchedModel::ProcResIter PI = SchedModel->getWriteProcResBegin(SC),
189 PE = SchedModel->getWriteProcResEnd(SC);
190 PI != PE; ++PI) {
191 ReleaseAtCycle = std::max(ReleaseAtCycle, (unsigned)PI->ReleaseAtCycle);
192 }
193 return ReleaseAtCycle;
194}
195
202
205 const TargetRegisterInfo *TRI) {
206 DAG = SchedDAG;
208 assert(SchedModel && SchedModel->hasInstrSchedModel());
209
210 SRI = static_cast<const SIRegisterInfo *>(TRI);
211 SII = static_cast<const SIInstrInfo *>(DAG->TII);
212
214
215 for (unsigned I = 0; I < HWUInfo.size(); I++) {
216 HWUInfo[I].reset();
217 HWUInfo[I].setType(I);
218 }
219
220 HWUInfo[(int)InstructionFlavor::WMMA].setProducesCoexecWindow(true);
221 HWUInfo[(int)InstructionFlavor::MultiCycleVALU].setProducesCoexecWindow(true);
222 HWUInfo[(int)InstructionFlavor::TRANS].setProducesCoexecWindow(true);
223
225}
226
228 if (!SchedModel || !SchedModel->hasInstrSchedModel())
229 return;
230
231 for (auto &SU : DAG->SUnits) {
232 const InstructionFlavor Flavor = classifyFlavor(*SU.getInstr(), *SII);
233 HWUInfo[(int)(Flavor)].insert(&SU, getHWUICyclesForInst(&SU));
234 }
235
237}
238
240 MachineBasicBlock *BB = DAG->begin()->getParent();
241 dbgs() << "\n=== Region: " << DAG->MF.getName() << " BB" << BB->getNumber()
242 << " (" << DAG->SUnits.size() << " SUs) ===\n";
243
244 dbgs() << "\nHWUI Resource Pressure:\n";
245 for (auto &HWUI : HWUInfo) {
246 if (HWUI.getTotalCycles() == 0)
247 continue;
248
249 StringRef Name = getFlavorName(HWUI.getType());
250 dbgs() << " " << Name << ": " << HWUI.getTotalCycles() << " cycles, "
251 << HWUI.size() << " instrs\n";
252 }
253 dbgs() << "\n";
254}
255
257 // Highest priority should be first.
259 // Prefer CoexecWindow producers
260 if (A.producesCoexecWindow() != B.producesCoexecWindow())
261 return A.producesCoexecWindow();
262
263 // Prefer more demanded resources
264 if (A.getTotalCycles() != B.getTotalCycles())
265 return A.getTotalCycles() > B.getTotalCycles();
266
267 // In ties -- prefer the resource with more instructions
268 if (A.size() != B.size())
269 return A.size() < B.size();
270
271 // Default to Flavor order
272 return (unsigned)A.getType() < (unsigned)B.getType();
273 });
274}
275
279
280 auto HasPrioritySU = [this, &Cand, &TryCand](unsigned ResourceIdx) {
281 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
282
283 auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
284 auto TryCandFlavor = classifyFlavor(*TryCand.SU->getInstr(), *SII);
285 bool LookDeep = (CandFlavor == InstructionFlavor::DS ||
286 TryCandFlavor == InstructionFlavor::DS) &&
288 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
289
290 // If we do not have a TargetSU for this resource, then it is not critical.
291 if (!TargetSU)
292 return false;
293
294 return true;
295 };
296
297 auto TryEnablesResource = [&Cand, &TryCand, this](unsigned ResourceIdx) {
298 const HardwareUnitInfo &HWUI = HWUInfo[ResourceIdx];
299 auto CandFlavor = classifyFlavor(*Cand.SU->getInstr(), *SII);
300
301 // We want to ensure our DS order matches WMMA order.
302 bool LookDeep = CandFlavor == InstructionFlavor::DS &&
304 auto *TargetSU = HWUI.getNextTargetSU(LookDeep);
305
306 bool CandEnables =
307 TargetSU != Cand.SU && DAG->IsReachable(TargetSU, Cand.SU);
308 bool TryCandEnables =
309 TargetSU != TryCand.SU && DAG->IsReachable(TargetSU, TryCand.SU);
310
311 if (!CandEnables && !TryCandEnables)
312 return false;
313
314 if (CandEnables && !TryCandEnables) {
317
318 return true;
319 }
320
321 if (!CandEnables && TryCandEnables) {
323 return true;
324 }
325
326 // Both enable, prefer the critical path.
327 unsigned CandHeight = Cand.SU->getHeight();
328 unsigned TryCandHeight = TryCand.SU->getHeight();
329
330 if (CandHeight > TryCandHeight) {
333
334 return true;
335 }
336
337 if (CandHeight < TryCandHeight) {
339 return true;
340 }
341
342 // Same critical path, just prefer original candidate.
345
346 return true;
347 };
348
349 for (unsigned I = 0; I < HWUInfo.size(); I++) {
350 // If we have encountered a resource that is not critical, then neither
351 // candidate enables a critical resource
352 if (!HasPrioritySU(I))
353 continue;
354
355 bool Enabled = TryEnablesResource(I);
356 // If neither has enabled the resource, continue to the next resource
357 if (Enabled)
358 return true;
359 }
360 return false;
361}
362
366 for (unsigned I = 0; I < HWUInfo.size(); I++) {
367 const HardwareUnitInfo &HWUI = HWUInfo[I];
368
369 bool CandUsesCrit = HWUI.contains(Cand.SU);
370 bool TryCandUsesCrit = HWUI.contains(TryCand.SU);
371
372 if (!CandUsesCrit && !TryCandUsesCrit)
373 continue;
374
375 if (CandUsesCrit != TryCandUsesCrit) {
376 if (CandUsesCrit) {
379 return true;
380 }
382 return true;
383 }
384
385 // Otherwise, both use the critical resource
386 // For longer latency InstructionFlavors, we should prioritize first by
387 // their enablement of critical resources
388 if (HWUI.getType() == InstructionFlavor::DS) {
389 if (tryCriticalResourceDependency(TryCand, Cand, Zone))
390 return true;
391 }
392
393 // Prioritize based on HWUI priorities.
394 SUnit *Match = HWUI.getHigherPriority(Cand.SU, TryCand.SU);
395 if (Match) {
396 if (Match == Cand.SU) {
399 return true;
400 }
402 return true;
403 }
404 }
405
406 return false;
407}
408
417
420 unsigned NumRegionInstrs) {
424 "coexec scheduler only supports top-down scheduling");
425 RegionPolicy.OnlyTopDown = true;
426 RegionPolicy.OnlyBottomUp = false;
427 RegionPolicy.ShouldTrackLaneMasks = true;
428}
429
431 // Coexecution scheduling strategy is only done top-down to support new
432 // resource balancing heuristics.
433 RegionPolicy.OnlyTopDown = true;
434 RegionPolicy.OnlyBottomUp = false;
435
437 Heurs.initialize(DAG, SchedModel, TRI);
438}
439
441 Heurs.updateForScheduling(SU);
442 GCNSchedStrategy::schedNode(SU, IsTopNode);
443}
444
446 assert(RegionPolicy.OnlyTopDown && !RegionPolicy.OnlyBottomUp &&
447 "coexec scheduler only supports top-down scheduling");
448
449 if (DAG->top() == DAG->bottom()) {
450 assert(Top.Available.empty() && Top.Pending.empty() &&
451 Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
452 return nullptr;
453 }
454
455 bool PickedPending = false;
456 SUnit *SU = nullptr;
457#ifndef NDEBUG
458 SchedCandidate *PickedCand = nullptr;
459#endif
460 do {
461 PickedPending = false;
462 SU = pickOnlyChoice(Top);
463 if (!SU) {
464 CandPolicy NoPolicy;
465 TopCand.reset(NoPolicy);
466 pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand,
467 PickedPending, /*IsBottomUp=*/false);
468 assert(TopCand.Reason != NoCand && "failed to find a candidate");
469 SU = TopCand.SU;
470#ifndef NDEBUG
471 PickedCand = &TopCand;
472#endif
473 }
474 IsTopNode = true;
475 } while (SU->isScheduled);
476
477 LLVM_DEBUG(if (PickedCand) dumpPickSummary(SU, IsTopNode, *PickedCand));
478
479 if (PickedPending) {
480 unsigned ReadyCycle = SU->TopReadyCycle;
481 unsigned CurrentCycle = Top.getCurrCycle();
482 if (ReadyCycle > CurrentCycle)
483 Top.bumpCycle(ReadyCycle);
484
485 // checkHazard() does not expose the exact cycle where the hazard clears.
486 while (Top.checkHazard(SU))
487 Top.bumpCycle(Top.getCurrCycle() + 1);
488
489 Top.releasePending();
490 }
491
492 if (SU->isTopReady())
493 Top.removeReady(SU);
494 if (SU->isBottomReady())
495 Bot.removeReady(SU);
496
497 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
498 << *SU->getInstr());
499
500 assert(IsTopNode && "coexec scheduler must only schedule from top boundary");
501 return SU;
502}
503
505 SchedBoundary &Zone, const CandPolicy &ZonePolicy,
506 const RegPressureTracker &RPTracker, SchedCandidate &Cand,
507 bool &PickedPending, bool IsBottomUp) {
508 assert(Zone.isTop() && "coexec scheduler only supports top boundary");
509 assert(!IsBottomUp && "coexec scheduler only supports top-down scheduling");
510
511 const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo *>(TRI);
513 unsigned SGPRPressure = 0;
514 unsigned VGPRPressure = 0;
515 PickedPending = false;
516 if (DAG->isTrackingPressure()) {
517 if (!useGCNTrackers()) {
518 SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
519 VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
520 } else {
521 SGPRPressure = DownwardTracker.getPressure().getSGPRNum();
522 VGPRPressure = DownwardTracker.getPressure().getArchVGPRNum();
523 }
524 }
525
526 auto EvaluateQueue = [&](ReadyQueue &Q, bool FromPending) {
527 for (SUnit *SU : Q) {
528 SchedCandidate TryCand(ZonePolicy);
529 initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
530 VGPRPressure, IsBottomUp);
531 SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
532 tryCandidateCoexec(Cand, TryCand, ZoneArg);
533 if (TryCand.Reason != NoCand) {
534 if (TryCand.ResDelta == SchedResourceDelta())
535 TryCand.initResourceDelta(Zone.DAG, SchedModel);
536 LLVM_DEBUG(printCandidateDecision(Cand, TryCand));
537 PickedPending = FromPending;
538 Cand.setBest(TryCand);
539 } else {
540 LLVM_DEBUG(printCandidateDecision(TryCand, Cand));
541 }
542 }
543 };
544
545 LLVM_DEBUG(dbgs() << "Available Q:\n");
546 EvaluateQueue(Zone.Available, /*FromPending=*/false);
547
548 LLVM_DEBUG(dbgs() << "Pending Q:\n");
549 EvaluateQueue(Zone.Pending, /*FromPending=*/true);
550}
551
552#ifndef NDEBUG
554 SchedCandidate &Cand) {
555 const SIInstrInfo *SII = static_cast<const SIInstrInfo *>(DAG->TII);
556 unsigned Cycle = IsTopNode ? Top.getCurrCycle() : Bot.getCurrCycle();
557
558 dbgs() << "=== Pick @ Cycle " << Cycle << " ===\n";
559
560 const InstructionFlavor Flavor = classifyFlavor(*SU->getInstr(), *SII);
561 dbgs() << "Picked: SU(" << SU->NodeNum << ") ";
562 SU->getInstr()->print(dbgs(), /*IsStandalone=*/true, /*SkipOpers=*/false,
563 /*SkipDebugLoc=*/true);
564 dbgs() << " [" << getFlavorName(Flavor) << "]\n";
565
566 dbgs() << " Reason: ";
569 else if (Cand.Reason != NoCand)
571 else
572 dbgs() << "Unknown";
573 dbgs() << "\n\n";
574
576}
577#endif
578
580 SchedCandidate &TryCand,
581 SchedBoundary *Zone) {
582 // Initialize the candidate if needed.
583 if (!Cand.isValid()) {
584 TryCand.Reason = FirstValid;
585 return true;
586 }
587
588 // Bias PhysReg Defs and copies to their uses and defined respectively.
589 if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
590 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
591 return TryCand.Reason != NoCand;
592
593 // Avoid exceeding the target's limit.
594 if (DAG->isTrackingPressure() &&
595 tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
596 RegExcess, TRI, DAG->MF))
597 return TryCand.Reason != NoCand;
598
599 // We only compare a subset of features when comparing nodes between
600 // Top and Bottom boundary. Some properties are simply incomparable, in many
601 // other instances we should only override the other boundary if something
602 // is a clear good pick on one boundary. Skip heuristics that are more
603 // "tie-breaking" in nature.
604 bool SameBoundary = Zone != nullptr;
605 if (SameBoundary) {
606 // Compare candidates by the stall they would introduce if
607 // scheduled in the current cycle.
608 if (tryEffectiveStall(Cand, TryCand, *Zone))
609 return TryCand.Reason != NoCand;
610
611 Heurs.sortHWUIResources();
612 if (Heurs.tryCriticalResource(TryCand, Cand, Zone)) {
614 return TryCand.Reason != NoCand;
615 }
616
617 if (Heurs.tryCriticalResourceDependency(TryCand, Cand, Zone)) {
619 return TryCand.Reason != NoCand;
620 }
621 }
622
623 // Keep clustered nodes together to encourage downstream peephole
624 // optimizations which may reduce resource requirements.
625 //
626 // This is a best effort to set things up for a post-RA pass. Optimizations
627 // like generating loads of multiple registers should ideally be done within
628 // the scheduler pass by combining the loads during DAG postprocessing.
629 unsigned CandZoneCluster = Cand.AtTop ? TopClusterID : BotClusterID;
630 unsigned TryCandZoneCluster = TryCand.AtTop ? TopClusterID : BotClusterID;
631 bool CandIsClusterSucc =
632 isTheSameCluster(CandZoneCluster, Cand.SU->ParentClusterIdx);
633 bool TryCandIsClusterSucc =
634 isTheSameCluster(TryCandZoneCluster, TryCand.SU->ParentClusterIdx);
635
636 if (tryGreater(TryCandIsClusterSucc, CandIsClusterSucc, TryCand, Cand,
637 Cluster))
638 return TryCand.Reason != NoCand;
639
640 if (SameBoundary) {
641 // Weak edges are for clustering and other constraints.
642 if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
643 getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
644 return TryCand.Reason != NoCand;
645 }
646
647 // Avoid increasing the max pressure of the entire region.
648 if (DAG->isTrackingPressure() &&
649 tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
650 Cand, RegMax, TRI, DAG->MF))
651 return TryCand.Reason != NoCand;
652
653 if (SameBoundary) {
654 // Avoid serializing long latency dependence chains.
655 // For acyclic path limited loops, latency was already checked above.
656 if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
657 !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
658 return TryCand.Reason != NoCand;
659
660 // Fall through to original instruction order.
661 if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
662 (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
663 TryCand.Reason = NodeOrder;
664 return true;
665 }
666 }
667
668 return false;
669}
670
672 SchedCandidate &TryCand,
673 SchedBoundary &Zone) const {
674 // Treat structural and latency stalls as a single scheduling cost for the
675 // current cycle.
676 struct StallCosts {
677 unsigned Ready = 0;
678 unsigned Structural = 0;
679 unsigned Latency = 0;
680 unsigned Effective = 0;
681 };
682
683 unsigned CurrCycle = Zone.getCurrCycle();
684 auto GetStallCosts = [&](SUnit *SU) {
685 unsigned ReadyCycle = Zone.isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
686 StallCosts Costs;
687 Costs.Ready = ReadyCycle > CurrCycle ? ReadyCycle - CurrCycle : 0;
688 Costs.Structural = getStructuralStallCycles(Zone, SU);
689 Costs.Latency = Zone.getLatencyStallCycles(SU);
690 Costs.Effective = std::max({Costs.Ready, Costs.Structural, Costs.Latency});
691 return Costs;
692 };
693
694 StallCosts TryCosts = GetStallCosts(TryCand.SU);
695 StallCosts CandCosts = GetStallCosts(Cand.SU);
696
697 LLVM_DEBUG(if (TryCosts.Effective || CandCosts.Effective) {
698 dbgs() << "Effective stalls: try=" << TryCosts.Effective
699 << " (ready=" << TryCosts.Ready << ", struct=" << TryCosts.Structural
700 << ", lat=" << TryCosts.Latency << ") cand=" << CandCosts.Effective
701 << " (ready=" << CandCosts.Ready
702 << ", struct=" << CandCosts.Structural
703 << ", lat=" << CandCosts.Latency << ")\n";
704 });
705
706 return tryLess(TryCosts.Effective, CandCosts.Effective, TryCand, Cand, Stall);
707}
708
711 LLVM_DEBUG(dbgs() << "AMDGPU coexec preRA scheduler selected for "
712 << C->MF->getName() << '\n');
713 return new GCNScheduleDAGMILive(
714 C, std::make_unique<AMDGPUCoExecSchedStrategy>(C));
715}
716
719 LLVM_DEBUG(dbgs() << "AMDGPU nop postRA scheduler selected for "
720 << C->MF->getName() << '\n');
721 return new GCNNoopPostScheduleDAG(C);
722}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SUnit * pickOnlyChoice(SchedBoundary &Zone)
Coexecution-focused scheduling strategy for AMDGPU.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
#define LLVM_DEBUG(...)
Definition Debug.h:114
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand)
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
void updateForScheduling(SUnit *SU)
Update the state to reflect that SU is going to be scheduled.
HardwareUnitInfo * getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor)
Given a Flavor , find the corresponding HardwareUnit.
void sortHWUIResources()
Sort the HWUInfo vector.
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for critical resource consumption.
bool tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for dependencies of instructions that use prioritized HardwareUnits.
SmallVector< HardwareUnitInfo, 8 > HWUInfo
const TargetSchedModel * SchedModel
void collectHWUIPressure()
Walk over the region and collect total usage per HardwareUnit.
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, const TargetRegisterInfo *TRI)
unsigned getHWUICyclesForInst(SUnit *SU)
Compute the blocking cycles for the appropriate HardwareUnit given an SU.
GCNDownwardRPTracker DownwardTracker
GCNSchedStrategy(const MachineSchedContext *C)
SmallVector< GCNSchedStageID, 4 > SchedStages
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
std::vector< unsigned > Pressure
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void printCandidateDecision(const SchedCandidate &Current, const SchedCandidate &Preferred)
unsigned getStructuralStallCycles(SchedBoundary &Zone, SUnit *SU) const
Estimate how many cycles SU must wait due to structural hazards at the current boundary cycle.
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop, const RegPressureTracker &RPTracker, const SIRegisterInfo *SRI, unsigned SGPRPressure, unsigned VGPRPressure, bool IsBottomUp)
MachineSchedPolicy RegionPolicy
const TargetSchedModel * SchedModel
static const char * getReasonStr(GenericSchedulerBase::CandReason Reason)
const TargetRegisterInfo * TRI
SchedCandidate TopCand
Candidate last picked from Top boundary.
ScheduleDAGMILive * DAG
HardwareUnitInfo is a wrapper class which maps to some real hardware resource.
void markScheduled(SUnit *SU, unsigned BlockingCycles)
Update the state for SU being scheduled by removing it from the AllSus and reducing its BlockingCycle...
SUnit * getNextTargetSU(bool LookDeep=false) const
void insert(SUnit *SU, unsigned BlockingCycles)
Insert the SU into the AllSUs and account its BlockingCycles into the TotalCycles.
AMDGPU::InstructionFlavor getType() const
SUnit * getHigherPriority(SUnit *SU, SUnit *Other) const
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
MachineInstrBundleIterator< MachineInstr > iterator
Representation of each machine instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
virtual void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs)
Optionally override the per-region scheduling policy.
Helpers for implementing custom MachineSchedStrategy classes.
Track the current register pressure at some position in the instruction stream, and remember the high...
const std::vector< unsigned > & getRegSetPressureAtPos() const
Get the register set pressure at the current position, which may be less than the pressure across the...
static bool isDS(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isSALU(const MachineInstr &MI)
static bool isMFMAorWMMA(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isTRANS(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isVALU(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
Scheduling unit. This is a node in the scheduling DAG.
unsigned TopReadyCycle
Cycle relative to start when node is ready.
unsigned NodeNum
Entry # of node in the node vector.
unsigned getHeight() const
Returns the height of this node, which is the length of the maximum path down to any node which has n...
unsigned getDepth() const
Returns the depth of this node, which is the length of the maximum path up to any node which has no p...
bool isScheduled
True once scheduled.
unsigned ParentClusterIdx
The parent cluster id.
bool isBottomReady() const
bool isTopReady() const
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Each Scheduling boundary is associated with ready queues.
LLVM_ABI unsigned getLatencyStallCycles(SUnit *SU)
Get the difference between the given SUnit's ready time and the current cycle.
LLVM_ABI SUnit * pickOnlyChoice()
Call this before applying any other heuristics to the Available queue.
unsigned getCurrCycle() const
Number of cycles to issue the instructions scheduled in this zone.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
const MCWriteProcResEntry * ProcResIter
InstructionFlavor classifyFlavor(const MachineInstr &MI, const SIInstrInfo &SII)
StringRef getReasonName(AMDGPUSchedReason R)
StringRef getFlavorName(InstructionFlavor F)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI int biasPhysReg(const SUnit *SU, bool isTop, bool BiasPRegsExtra=false)
Minimize physical register live ranges.
LLVM_ABI unsigned getWeakLeft(const SUnit *SU, bool isTop)
CycleInfo::CycleT Cycle
Definition CycleInfo.h:26
LLVM_ABI bool tryPressure(const PressureChange &TryP, const PressureChange &CandP, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason, const TargetRegisterInfo *TRI, const MachineFunction &MF)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
LLVM_ABI bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary &Zone)
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
bool isTheSameCluster(unsigned A, unsigned B)
Return whether the input cluster ID's are the same and valid.
LLVM_ABI bool tryGreater(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
LLVM_ABI bool tryLess(int TryVal, int CandVal, GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, GenericSchedulerBase::CandReason Reason)
Return true if this heuristic determines order.
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
Definition DWP.h:32
LLVM_ABI cl::opt< MISched::Direction > PreRADirection
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
LLVM_ABI void initResourceDelta(const ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel)
Status of an instruction's critical resource consumption.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:123
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...