#define DEBUG_TYPE "igrouplp"

    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),

    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

    cl::desc("The amount of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
enum class SchedGroupMask {
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,

class InstructionRule {
  std::optional<SmallVector<SUnit *, 4>> Cache;

                  bool NeedsCache = false)

  virtual ~InstructionRule() = default;

  SchedGroupMask SGMask;

  std::optional<unsigned> MaxSize;

  static unsigned NumSchedGroups;

  bool canAddSU(SUnit &SU) const;

  void link(SUnit &SU, bool MakePred = false);

  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  void link(SchedGroup &OtherGroup);

  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  void addRule(std::shared_ptr<InstructionRule> NewRule) {

  bool allowedByRules(const SUnit *SU,
    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))

  void add(SUnit &SU) {
               << format_hex((int)SGMask, 10, true) << " adding "

  void pop() { Collection.pop_back(); }

  void initSchedGroup();

  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;

  while (!SU.Preds.empty())

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
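// The two while-loops above are the tail of the edge-reset helper: they strip
// every remaining predecessor and successor edge from SU. Judging by its later
// callers, this is how SCHED_BARRIER / SCHED_GROUP_BARRIER / IGLP_OPT pseudo
// instructions are detached from the DAG before SchedGroup edges are re-added.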
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;

class PipelineSolver {
  bool NeedsSolver = false;

  unsigned computeProblemSize();

  int CurrConflInstNo = 0;
  int CurrSyncGroupIdx = 0;
  int BeginSyncGroupIdx = 0;

  bool IsBottomUp = true;

  void advancePosition();
  void retreatPosition();

  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,

  template <typename T>

  template <typename T> void linkSchedGroups(T I, T E);

               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);

  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  void convertSyncMapsToArrays();

      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
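// The constructor above only does setup: it records whether any synchronized
// instruction actually has candidate SchedGroups (NeedsSolver), flattens the
// DenseMaps into PipelineInstrs / BestPipeline via convertSyncMapsToArrays(),
// and skips over empty sync groups. The actual fitting happens later in
// solve().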
void PipelineSolver::reset() {
  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SG.Collection.clear();
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);

  CurrSyncGroupIdx = BeginSyncGroupIdx;

void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));

      auto *SortPosition = PipelineInstrs[PipelineIDx].begin();

      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)

      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));

template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    for (auto J = std::next(I); J != E; ++J) {
void PipelineSolver::makePipeline() {
  for (auto &SyncPipeline : BestPipeline) {
    for (auto &SG : SyncPipeline) {
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {

      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());

int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
  bool MakePred = false;
    if (I->getSGID() == SGID) {
    AddedCost += Group.link(*SU, MakePred, AddedEdges);

int PipelineSolver::addEdges(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),

void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
void PipelineSolver::advancePosition() {
  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)

void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {

  if (CurrConflInstNo == 0) {
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)

    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;

bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
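// checkOptimal() is the bounding step of the exact search: a completed
// assignment (CurrSyncGroupIdx past the last pipeline) updates BestPipeline if
// it beats BestCost, and exploration stops early once BestCost reaches 0 or the
// MaxBranchesExplored budget is exhausted.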
void PipelineSolver::populateReadyList(
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
      return SG.getSGID() == CandSGID;

      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);

      ReadyList.push_back(std::pair(*I, -1));

    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;

  assert(ReadyList.size() == CurrSU.second.size());
bool PipelineSolver::solveExact() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());

  auto *I = ReadyList.begin();
  auto *E = ReadyList.end();
  for (; I != E; ++I) {
    if (BestCost != -1 && (CurrCost + I->second > BestCost))

    int CandSGID = I->first;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];

    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)

      if (!Match->allowedByRules(CurrSU.first, SyncPipeline))

                        << (int)Match->getMask() << " and ID " << CandSGID

      Match->add(*CurrSU.first);
      AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
      CurrCost += AddedCost;

      bool FinishedExploring = false;
      if (CurrCost < BestCost || BestCost == -1) {
          FinishedExploring = BestCost != 0;
          if (!FinishedExploring)

      CurrCost -= AddedCost;
      removeEdges(AddedEdges);

      CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
      if (FinishedExploring)

  CurrCost += MissPenalty;

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
      bool FinishedExploring = BestCost != 0;
      if (!FinishedExploring)

  CurrCost -= MissPenalty;
  return FinishedExploring;
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;
  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
      return SG.getSGID() == CandSGID;

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");

    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);

    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;

    removeEdges(AddedEdges);
    if (BestNodeCost == 0)

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;

    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
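// greedyFind() assigns the current conflicted SU to the candidate SchedGroup
// with the lowest immediate edge cost (BestNodeCost), tentatively adding and
// then removing edges to measure each candidate; a miss (no legal, non-full
// group) is charged MissPenalty instead.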
bool PipelineSolver::solveGreedy() {
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());

  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);

unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();

void PipelineSolver::solve() {
  unsigned ProblemSize = computeProblemSize();

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleaveID = 2,
  MFMAExpSimpleInterleaveID = 3

  virtual bool applyIGLPStrategy(

  bool IsBottomUp = true;

  virtual ~IGLPStrategy() = default;

class MFMASmallGemmOpt final : public IGLPStrategy {
  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMASmallGemmOpt::applyIGLPStrategy(
  unsigned MFMACount = 0;
    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
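// A sketch of the shape MFMASmallGemmOpt requests: for each MFMA/WMMA in the
// region it appends "2 x DS, then 1 x MFMA" groups, i.e. roughly what a kernel
// could ask for by hand with llvm.amdgcn.sched.group.barrier intrinsics
// (illustrative only -- the mask values below assume the usual DS = 1 << 7 and
// MFMA = 1 << 3 bit assignments, which are not spelled out in this excerpt):
//   __builtin_amdgcn_sched_group_barrier(0x0080, 2, 0); // 2 DS ops
//   __builtin_amdgcn_sched_group_barrier(0x0008, 1, 0); // 1 MFMA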
class MFMAExpInterleaveOpt final : public IGLPStrategy {
  static unsigned TransPipeCount;
  static unsigned MFMAPipeCount;
  static unsigned AddPipeCount;
  static unsigned MFMAEnablement;
  static unsigned ExpRequirement;
  static unsigned MFMAChains;
  static unsigned MFMAChainLength;
  static bool HasChainBetweenCvt;
  static std::optional<unsigned> FirstPipeDSR;

  class IsPipeExp final : public InstructionRule {
      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);

      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {

    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class EnablesNthMFMA final : public InstructionRule {
      bool FoundTrans = false;
      unsigned Counter = 1;
      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
              Cache->push_back(&*I);

          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))

                   bool NeedsCache = false)

  class EnablesNthMFMAInChain final : public InstructionRule {
      auto *DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

          Cache->push_back(TempSU);

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          bool NeedsCache = false)
          ChainSeed(ChainSeed) {}
  class LessThanNSuccs final : public InstructionRule {
    bool HasIntermediary = false;

      if (!SyncPipe.size())

      auto SuccSize = std::count_if(
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
          if (SuccSize >= Size)

    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}

  class GreaterThanOrEqualToNSuccs final : public InstructionRule {
    bool HasIntermediary = false;

      if (!SyncPipe.size())

      auto SuccSize = std::count_if(
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
          if (SuccSize >= Size)

    GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
                               unsigned SGID, bool HasIntermediary = false,
                               bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  class IsCvt final : public InstructionRule {
      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;

    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsFMA final : public InstructionRule {
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsPipeAdd final : public InstructionRule {
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  class IsSuccOfPrevNthGroup final : public InstructionRule {
    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)

                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}

  class IsReachableFromPrevNthGroup final : public InstructionRule {
    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      auto *DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)

    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  class OccursAtOrAfterNode final : public InstructionRule {
                        bool NeedsCache = false)

  class IsExactMFMA final : public InstructionRule {
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

          Cache->push_back(TempSU);

      return (*Cache)[0] == SU;

                unsigned SGID, bool NeedsCache = false)
          ChainSeed(ChainSeed) {}

  class OccursAfterExp final : public InstructionRule {
      auto *DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)

      return SU->NodeNum > (*Cache)[0]->NodeNum;

                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

    if (TII->isTRANS(Opc)) {
      if (SU.Succs.size() >= 7)
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {

  if (!(TempExp && TempMFMA))

  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());

  for (auto &SuccSU : MFMAPipeCands) {
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;

    for (auto &PredSU : ExpPipeCands) {

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {

  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {

  for (auto &MFMAPipeSU : MFMAPipeSUs) {
          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
      MFMAChainSeeds.push_back(MFMAPipeSU);

  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;

  MFMAChainLength = MFMAPipeCount / MFMAChains;

  unsigned PackSuccCount = std::count_if(
      PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
        return DAG->IsReachable(VPack, *TempExp);

  unsigned PackPredCount =
      std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                    [&isBitPack](SDep &Pred) {
                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                      return isBitPack(Opc);

      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                   [&isBitPack](SDep &Pred) {
                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                     return isBitPack(Opc);

  if (PackPred == (*TempMFMA)->Preds.end())

      std::count_if(PackPred->getSUnit()->Succs.begin(),
                    PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
                      return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());

  MFMAEnablement *= PackSuccCount;

      std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
                    [this, &PackPred](SUnit *ExpBase) {
                      return DAG->IsReachable(PackPred->getSUnit(), ExpBase);

  ExpRequirement *= PackPredCount;

  if (Phase != AMDGPU::SchedulingPhase::PostRA)
    MFMAChainSeeds.clear();
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
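// The analysis above derives the static pipeline parameters -- TransPipeCount,
// MFMAPipeCount, MFMAEnablement, ExpRequirement, MFMAChains, FirstPipeDSR and
// friends -- by pattern-matching the exp -> (cvt/pack) -> MFMA chains in the
// DAG; the shouldApplyStrategy() fragment at the end only re-runs that
// analysis outside the PostRA phase.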
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;

  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;

  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains) {
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
        std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains) {
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        getNextTransPositionInChain(),
        MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                 SG->getSGID(), true));
  SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
        true));
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();

  for (unsigned I = 0; I < ExpRequirement; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
    if (HasChainBetweenCvt)
      SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
          1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
          1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                   TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;

  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
  unsigned ExpLoopCount = RemainingExp / ExpRatio;

  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
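// Worked example (illustrative, using the IsLargeKernelType parameters above:
// MFMAEnablement = 4, ExpRequirement = 4, TransPipeCount = 64):
//   MFMARatio    = 1    (MFMAEnablement is not greater than ExpRequirement)
//   ExpRatio     = 1    (ExpRequirement / MFMAEnablement)
//   RemainingExp = 64 - 2 * 4 = 56, so ExpLoopCount = 56
//   MFMAInLoop   = MFMAPipeCount - 2 * MFMAEnablement, MFMALoopCount follows,
//   and LoopSize = min(ExpLoopCount, MFMALoopCount) bounds the main loop below.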
  for (unsigned I = 0; I < LoopSize; I++) {
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
      auto DSROffset = I / 4 + 1;
      auto MaxDSROffset = MaxMFMAOffset / 4;
      auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
      auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                           std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            CurrentOffset, TII, SG->getSGID()));
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
  unsigned MFMACount = 0;
    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
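// MFMAExpSimpleInterleaveOpt is the stripped-down variant: no rules at all,
// just alternating "1 x TRANS, 1 x MFMA" groups, repeated MFMACount * 3 times.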
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
  class EnablesInitialMFMA final : public InstructionRule {
      if (!SyncPipe.size())

      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
              Cache->push_back(&Elt);

      auto *DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {

                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsPermForDSW final : public InstructionRule {
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)

      bool FitsInGroup = false;
      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();
            Cache->push_back(SuccUnit);

                 return ThisSucc.getSUnit() == Elt;

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsSuccOfPrevGroup final : public InstructionRule {
      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
        return any_of(Elt->Succs,
                      [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });

                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class VMEMSize final : public InstructionRule {
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
      if (!Collection.size())

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));

      if (NumBits < 128) {
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class SharesPredWithPrevNthGroup final : public InstructionRule {
    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      if (!Cache->size()) {
        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;

        if (!OtherGroup->Collection.size())

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());

      auto *DAG = SyncPipe[0].DAG;

    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");

  for (auto &SU : DAG->SUnits) {
    auto *I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))
    else if (TII->isDS(*I)) {
      else if (I->mayStore() && IsInitial) {
        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {

  DSWWithPermCount = DSWithPerms.size();
  auto *I = DSWithPerms.begin();
  auto *E = DSWithPerms.end();

  for (; I != E; I++) {
    SUnit *Cand = nullptr;
    bool MissedAny = false;
    for (auto &Pred : (*I)->Preds) {
      if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)

      for (auto &Succ : Pred.getSUnit()->Succs) {
        auto *MI = Succ.getSUnit()->getInstr();
        if (!TII->isVMEM(*MI) || !MI->mayLoad())

        if (MissedAny || !VMEMLookup.size()) {
          VMEMLookup[MI] = *I;

    if (!MissedAny && Cand) {
      DSWWithSharedVMEMCount += 2;

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
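// At this point the counters appear to mean: DSWCount, the DS writes in the
// region; DSWWithPermCount, those whose stored value comes through a
// V_PERM_B32_e64; and DSWWithSharedVMEMCount (incremented in pairs), those
// V_PERM-fed DS writes whose VMEM loads are shared with another such DS write.
// The assert simply sanity-checks that relationship.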
  unsigned PipelineSyncID = 0;
  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSRCount - 4; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
static std::unique_ptr<IGLPStrategy>
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  case MFMAExpInterleaveID:
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
  case MFMAExpSimpleInterleaveID:
    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);

  void addSchedBarrierEdges(SUnit &SU);

  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  bool initIGLPOpt(SUnit &SU);

  bool IsBottomUp = true;

  IGroupLPDAGMutation() = default;
unsigned SchedGroup::NumSchedGroups = 0;

  if (MI.isMetaInstruction())
  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&

      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);
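// The classifier above (canAddMI) accepts an instruction only if it matches one
// of the category bits set in SGMask; meta instructions are filtered out first,
// and the compound categories (ALU, VMEM, DS) are tested before their
// read/write refinements.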
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

    bool Added = tryAddEdge(A, B);
      AddedEdges.emplace_back(A, B);

void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

void SchedGroup::link(SUnit &SU,
  for (auto *A : Collection) {

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)

bool SchedGroup::canAddSU(SUnit &SU) const {
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  while (E != MBB->end() && E->isBundledWithPred())

void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
      SyncedInstrs[&SU].push_back(SGID);

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
      SyncedInstrs[&SU].push_back(SGID);
  if (!TSchedModel || DAGInstrs->SUnits.empty())

  TII = ST.getInstrInfo();

  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool FoundSB = false;
  bool FoundIGLP = false;
  bool ShouldApplyIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();

    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      if (!FoundSB && !FoundIGLP) {
        ShouldApplyIGLP = initIGLPOpt(*R);

  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  resetEdges(SchedBarrier, DAG);
  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                    << MI.getOperand(0).getImm() << "\n");
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();

      [](const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  SchedGroupMask InvertedMask = ~Mask;

  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask

  return InvertedMask;
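// Example of the inversion (hedged -- only the rules above are authoritative):
// a SCHED_BARRIER whose mask permits only ALU-type instructions to cross it
// yields an inverted group made of the memory categories (VMEM/DS and their
// read/write forms), so the barrier gets artificial edges to every memory op
// around it while ALU ops remain free to move. In kernel code the barrier
// would typically come from something like
//   __builtin_amdgcn_sched_barrier(mask)  // llvm.amdgcn.sched.barrier(i32)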
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  resetEdges(*RIter, DAG);

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);

bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (!S->shouldApplyStrategy(DAG, Phase))

  IsBottomUp = S->IsBottomUp;
  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);

std::unique_ptr<ScheduleDAGMutation>
  return std::make_unique<IGroupLPDAGMutation>(Phase);
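// A minimal usage sketch (assuming the usual AMDGPU scheduler wiring; the
// createGenericSchedLive call is illustrative, not taken from this file):
//   ScheduleDAGMILive *DAG = createGenericSchedLive(C);
//   DAG->addMutation(
//       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
// The mutation then rebuilds its SchedGroups from the SCHED_BARRIER,
// SCHED_GROUP_BARRIER and IGLP_OPT pseudos each time the DAG is constructed.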