#define DEBUG_TYPE "igrouplp"

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The amount of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
// The instruction categories that a SchedGroup may be restricted to. These
// correspond to the mask operands of SCHED_BARRIER / SCHED_GROUP_BARRIER.
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0,
  VALU = 1u << 1,
  SALU = 1u << 2,
  MFMA = 1u << 3,
  VMEM = 1u << 4,
  VMEM_READ = 1u << 5,
  VMEM_WRITE = 1u << 6,
  DS = 1u << 7,
  DS_READ = 1u << 8,
  DS_WRITE = 1u << 9,
  TRANS = 1u << 10,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
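// Editorial sketch (not part of the original file): because each enumerator is
// a single bit, a barrier mask is just a bitwise OR of categories, and
// membership is tested against NONE. For example:
//
//   SchedGroupMask M = SchedGroupMask::MFMA | SchedGroupMask::DS_READ;
//   bool WantsMFMA = (M & SchedGroupMask::MFMA) != SchedGroupMask::NONE; // true
//   bool WantsSALU = (M & SchedGroupMask::SALU) != SchedGroupMask::NONE; // false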
// A rule that a SchedGroup may optionally impose on candidate instructions,
// beyond what SGMask already requires.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  unsigned SGID;
  // A cache of SUnits that the rule may populate once and reuse across
  // queries.
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  virtual bool apply(const SUnit *, const ArrayRef<SUnit *>,
                     SmallVectorImpl<SchedGroup> &) {
    return true;
  }

  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache)
      Cache = SmallVector<SUnit *, 4>();
  }

  virtual ~InstructionRule() = default;
};
// A collection of SUnits that must be scheduled together, identified by a
// mask, an optional maximum size, and a SyncID.
class SchedGroup {
private:
  // Mask defining the instruction types that may be classified into this
  // SchedGroup.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the
  // same SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups.
  unsigned SGID;

  // The rules each instruction in this SchedGroup must conform to.
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGIDs.
  static unsigned NumSchedGroups;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;
  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies between all SUnits in this SchedGroup and SU. If
  // MakePred is true, SU will be a predecessor of the SUnits in this group,
  // otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Same as above, but also track which edges were added and return the count
  // of missed edges.
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies such that the SUnits in this group are ordered before
  // the SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
  // Append NewRule to the set of rules for this SchedGroup.
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU is permitted in the group by all of the rules.
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))
        return false;
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove the last element in the SchedGroup.
  void pop() { Collection.pop_back(); }
  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter. RIter
  // will be added to the SchedGroup as well, and dependencies will be added
  // so that RIter is always scheduled at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
// Assigns the conflicted SUnits (those with multiple candidate SchedGroups)
// to SchedGroups, minimizing the number of missed pipeline edges.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups.
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SmallVector<SUToCandSGsPair, 4>, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline and the best pipeline found so far.
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of the search tree.
  unsigned computeProblemSize();

  // The cost penalty of not assigning a SU to a SchedGroup.
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add.
  int BestCost = -1;
  int CurrCost = 0;

  // How many branches we have explored.
  uint64_t BranchesExplored = 0;

  // Index pointing to the conflicting instruction that is currently being
  // fitted.
  int CurrConflInstNo = 0;
  // Index of the pipeline that is currently being fitted.
  int CurrSyncGroupIdx = 0;
  // The first non-trivial pipeline.
  int BeginSyncGroupIdx = 0;

  // The direction in which we process the candidate SchedGroups per SU.
  bool IsBottomUp = true;

  // Advance the indices to the next conflicting instruction.
  void advancePosition();
  // Recede the indices to retry the previous conflicting instruction.
  void retreatPosition();

  // The exponential-time algorithm which finds the provably best fit.
  bool solveExact();
  // The polynomial-time algorithm which attempts to find a good fit.
  bool solveGreedy();
  // Find the best SchedGroup for the current SU using the cost heuristic.
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);
  // Whether or not the current solution is optimal.
  bool checkOptimal();
  // Populate the ready list, prioritizing fewest missed edges first.
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);
  // Add edges corresponding to the SchedGroups as assigned by the solver.
  void makePipeline();
  // Link the SchedGroups in the best found pipeline.
  template <typename T> void linkSchedGroups(T I, T E);
  // Link the pipeline as if \p SU was in the SchedGroup with SGID \p SGID,
  // returning the cost (in terms of missed pipeline edges) and recording the
  // edges added in \p AddedEdges.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
  // Remove the edges passed via \p AddedEdges.
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the passed-in maps to arrays for bidirectional iterators.
  void convertSyncMapsToArrays();

  void reset();

public:
  // Invoke the solver to map instructions to instruction groups. The cutoff
  // heuristic and command line options determine whether the exact or greedy
  // algorithm is used.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
void PipelineSolver::reset() {
  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for a good parsing order in
      // the greedy algorithm.
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}
template <typename T>
void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    auto &GroupA = *I;
    for (auto J = std::next(I); J != E; ++J) {
      auto &GroupB = *J;
      GroupA.link(GroupB);
    }
  }
}
void PipelineSolver::makePipeline() {
  // Preserve the order of the barrier for subsequent SchedGroupBarrier
  // mutations.
  for (auto &SyncPipeline : BestPipeline) {
    for (auto &SG : SyncPipeline) {
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
      }
      // Command line requested IGroupLP doesn't have a SGBarr.
      if (!SGBarr)
        continue;
      SG.link(*SGBarr, false);
    }
  }

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
  }
}
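// Editorial note: makePipeline() materializes the chosen assignment as
// artificial DAG edges. Within one SyncID, linkSchedGroups() links every
// SchedGroup against every group that follows it in the processing order
// (reversed when IsBottomUp), so the instructions placed in one group are
// forced to schedule on the correct side of the instructions in all later
// groups, while unassigned instructions stay unconstrained.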
template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    if (I->getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  return AddedCost;
}

int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                                SyncPipeline.rend())
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
                                SyncPipeline.end());
}
void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we have added when testing the SU.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto *Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}
void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to the next non-trivial pipeline.
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}

void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored the entire tree.
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to the previous non-trivial pipeline.
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}
template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    if (UseCostHeur) {
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  if (UseCostHeur) {
    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;
              });
  }

  assert(ReadyList.size() == CurrSU.second.size());
}
bool PipelineSolver::solveExact() {
  if (checkOptimal())
    return true;

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
    return false;

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // SchedGroup -> Cost pairs, prioritized in terms of lowest cost first.
  SmallVector<std::pair<int, int>, 4> ReadyList;
  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());

  auto *I = ReadyList.begin();
  auto *E = ReadyList.end();
  for (; I != E; ++I) {
    // If we are trying SchedGroups in least-cost order and the current one is
    // already cost infeasible, so are all the remaining candidates.
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
      return false;

    int CandSGID = I->first;
    int AddedCost = 0;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    SchedGroup *Match;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (Match->isFull())
      continue;

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
      continue;

    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                      << (int)Match->getMask() << " and ID " << CandSGID
                      << "\n");
    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;
    advancePosition();
    ++BranchesExplored;
    bool FinishedExploring = false;
    // If the cost after adding edges is greater than a known solution,
    // backtrack.
    if (CurrCost < BestCost || BestCost == -1) {
      if (solveExact()) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)
          return true;
      }
    }

    retreatPosition();
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)
      return true;
  }

  // Try the pipeline where the current instruction is omitted. Potentially,
  // if we omit a problematic instruction from the pipeline, all the other
  // instructions can nicely fit.
  CurrCost += MissPenalty;
  advancePosition();

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
    if (solveExact()) {
      bool FinishedExploring = BestCost != 0;
      if (!FinishedExploring)
        return true;
    }
  }

  retreatPosition();
  CurrCost -= MissPenalty;
  return FinishedExploring;
}
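// Editorial note: solveExact() is a depth-first branch-and-bound over the
// conflicted SUs. Because candidates are visited in ascending cost order when
// the cost heuristic is enabled, a branch is pruned as soon as
// CurrCost + I->second exceeds a known BestCost, and the "leave this SU
// unassigned" branch is charged MissPenalty instead of an edge cost.
// BranchesExplored caps the total search when MaxBranchesExplored is set.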
template <typename T>
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;
  int TempCost;
  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match);

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
      continue;
    }
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
      continue;
    }
    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestGroup = Match;
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;
    }
    removeEdges(AddedEdges);
    if (BestNodeCost == 0)
      break;
  }

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;
  } else
    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
}

bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    IsBottomUp
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
  return false;
}
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}

void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  LLVM_DEBUG(DAG->dump());
  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the greedy algorithm by default.
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
  }

  makePipeline();
  LLVM_DEBUG(dbgs() << "After applying mutation\n");
  LLVM_DEBUG(DAG->dump());
}
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleaveID = 2,
  MFMAExpSimpleInterleaveID = 3
};

// Implement a IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;

public:
  // Add SchedGroups to \p SyncedSchedGroups to implement this strategy.
  virtual bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) = 0;

  // Returns true if this strategy should be applied to the DAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                   AMDGPU::SchedulingPhase Phase) = 0;

  bool IsBottomUp = true;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};
class MFMASmallGemmOpt final : public IGLPStrategy {
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = true;
  }
};

bool MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
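// Editorial sketch: the pipeline built above is the repeated pair
// "2 x DS, 1 x MFMA", which is roughly what a kernel could request directly
// with SCHED_GROUP_BARRIERs (builtin spelling and numeric masks assumed, per
// the SchedGroupMask encoding above):
//
//   __builtin_amdgcn_sched_group_barrier(0x080 /*DS*/, 2, 0);
//   __builtin_amdgcn_sched_group_barrier(0x008 /*MFMA*/, 1, 0);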
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
  // The count of TRANS SUs involved in the interleaved pipeline.
  static unsigned TransPipeCount;
  // The count of MFMA SUs involved in the interleaved pipeline.
  static unsigned MFMAPipeCount;
  // The count of Add SUs involved in the interleaved pipeline.
  static unsigned AddPipeCount;
  // The number of transitive MFMA successors of a given TRANS SU.
  static unsigned MFMAEnablement;
  // The number of transitive TRANS predecessors of a given MFMA SU.
  static unsigned ExpRequirement;
  // The count of independent "chains" of MFMA instructions in the pipeline.
  static unsigned MFMAChains;
  // The length of each independent "chain" of MFMA instructions.
  static unsigned MFMAChainLength;
  // Whether or not the pipeline has V_CVT instructions.
  static bool HasCvt;
  // Whether or not there is a potential chain of instructions between the
  // TRANS instruction and its corresponding V_CVT.
  static bool HasChainBetweenCvt;
  // The first occurring DS_READ which feeds an MFMA chain.
  static std::optional<unsigned> FirstPipeDSR;
  // The seed MFMA of each chain.
  static SmallVector<SUnit *, 4> MFMAChainSeeds;
  // Whether the instruction is a transitive predecessor of an MFMA.
  class IsPipeExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);
        }
        if (Cache->empty())
          return false;
      }

      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
      });

      return Reaches;
    }
    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the instruction enables the \p Number th MFMA of the pipeline.
  class EnablesNthMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      bool FoundTrans = false;
      unsigned Counter = 1;
      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.begin();
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
            if (Counter == Number) {
              Cache->push_back(&*I);
              break;
            }
            ++Counter;
          }
          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
            FoundTrans = true;
        }
        if (Cache->empty())
          return false;
      }

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };
  // Whether the instruction enables the \p Number th MFMA of the chain that
  // starts at \p ChainSeed.
  class EnablesNthMFMAInChain final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto *DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found)
            return false;
        }

        Cache->push_back(TempSU);
      }
      assert(!Cache->empty());

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          const SIInstrInfo *TII, unsigned SGID,
                          bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };
  // Whether the instruction has fewer than \p Size data successors. If
  // \p HasIntermediary is true, the check also applies to its successors.
  class LessThanNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      auto SuccSize = std::count_if(
          SU->Succs.begin(), SU->Succs.end(),
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)
        return false;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return false;
        }
      }

      return true;
    }
    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };

  // Whether the instruction has at least \p Size data successors. If
  // \p HasIntermediary is true, the check also applies to its successors.
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      auto SuccSize = std::count_if(
          SU->Succs.begin(), SU->Succs.end(),
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)
        return true;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return true;
        }
      }

      return false;
    }
    GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
                               unsigned SGID, bool HasIntermediary = false,
                               bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };
  // Whether the instruction is a relevant V_CVT.
  class IsCvt final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto Opc = SU->getInstr()->getOpcode();
      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;
    }
    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the instruction is an FMA.
  class IsFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
    }
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the instruction is a V_ADD_F32 of the pipeline.
  class IsPipeAdd final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
    }
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the SU is a data successor of any element of the SchedGroup that
  // is \p Distance groups before this one in the pipeline.
  class IsSuccOfPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
            return true;
        }
      }

      return false;
    }
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

  // Whether the SU is reachable from any element of the SchedGroup that is
  // \p Distance groups before this one in the pipeline.
  class IsReachableFromPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      auto *DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
          return true;

      return false;
    }
    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };
  // Whether the SU occurs at or after the SU with node number \p Number.
  class OccursAtOrAfterNode final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->NodeNum >= Number;
    }
    OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                        bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };

  // Whether the SU is exactly the \p Number th MFMA of the chain starting at
  // \p ChainSeed.
  class IsExactMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found)
            return false;
        }
        Cache->push_back(TempSU);
      }
      assert(!Cache->empty());

      return (*Cache)[0] == SU;
    }

    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };

  // Whether the SU occurs after the first TRANS (exp) of the pipeline.
  class OccursAfterExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto *DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
            Cache->push_back(&SU);
            break;
          }
        if (Cache->empty())
          return false;
      }

      return SU->NodeNum > (*Cache)[0]->NodeNum;
    }

    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override;

  bool analyzeDAG(const SIInstrInfo *TII);

  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};

unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
SmallVector<SUnit *, 4> MFMAExpInterleaveOpt::MFMAChainSeeds;
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;
  SmallVector<SUnit *, 10> PackSUs;
  SmallVector<SUnit *, 10> CvtSUs;

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on.
      if (SU.Succs.size() >= 7)
        continue;
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA.
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
          TempExp = PredSU;
          TempMFMA = SuccSU;
        }
        ++TransPipeCount;
        break;
      }
    }
  }

  if (!(TempExp && TempMFMA))
    return false;

  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
  });

  // Count the number of MFMAs that are reached by an EXP.
  for (auto &SuccSU : MFMAPipeCands) {
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;
        }))
      continue;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        MFMAPipeSUs.push_back(SuccSU);
        break;
      }
    }
  }

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {
      TempCvt = SuccSU;
      break;
    }
  }

  HasCvt = false;
  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {
        HasCvt = true;
        break;
      }
    }
  }

  MFMAChains = 0;
  for (auto &MFMAPipeSU : MFMAPipeSUs) {
    if (is_contained(MFMAChainSeeds, MFMAPipeSU))
      continue;
    if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
        })) {
      MFMAChainSeeds.push_back(MFMAPipeSU);
      ++MFMAChains;
    }
  }

  if (!MFMAChains)
    return false;

  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;
  }

  MFMAChainLength = MFMAPipeCount / MFMAChains;

  // The number of bit pack operations that depend on a single V_EXP.
  unsigned PackSuccCount = std::count_if(
      PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
        return DAG->IsReachable(VPack, *TempExp);
      });

  // The number of bit pack operations an MFMA depends on.
  unsigned PackPredCount =
      std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                    [&isBitPack](SDep &Pred) {
                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                      return isBitPack(Opc);
                    });

  auto *PackPred =
      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                   [&isBitPack](SDep &Pred) {
                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                     return isBitPack(Opc);
                   });

  if (PackPred == (*TempMFMA)->Preds.end())
    return false;

  // How many MFMAs depend on a single bit pack operation.
  MFMAEnablement =
      std::count_if(PackPred->getSUnit()->Succs.begin(),
                    PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
                      return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
                    });

  // The number of MFMAs that depend on a single V_EXP.
  MFMAEnablement *= PackSuccCount;

  // The number of V_EXPs required to resolve all packing operands of an MFMA.
  ExpRequirement =
      std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
                    [this, &PackPred](SUnit *ExpBase) {
                      return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
                    });

  ExpRequirement *= PackPredCount;
  return true;
}

bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                               AMDGPU::SchedulingPhase Phase) {
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Phase != AMDGPU::SchedulingPhase::PostRA)
    MFMAChainSeeds.clear();
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
    return false;

  return true;
}
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {

  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))
    return false;

  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
  };

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);
  };

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;
  };

  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    ++CurrMFMAPosition;
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
  };

  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;

  // PHASE 1: "Prefetch"
  if (UsesFMA) {
    // First round FMA.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    } else
      SG->addRule(
          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    // Second round FMA.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
    } else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  if (UsesDSRead) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                      SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // First round EXP.
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
  else
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();

  // First round CVT, third round FMA, second round EXP; interleaved.
  for (unsigned I = 0; I < ExpRequirement; I++) {
    // First round CVT.
    if (UsesCvt) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      else
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // Third round FMA.
    if (UsesFMA) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains) {
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
      } else
        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                     TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // Second round EXP.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // The "extra" EXP which enables all MFMA.
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  // PHASE 2: main interleaving loop.
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
  unsigned ExpRatio =
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
                              : 0;
  unsigned ExpLoopCount = RemainingExp / ExpRatio;
  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
                            : 0;
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
  unsigned VALUOps =
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);

  for (unsigned I = 0; I < LoopSize; I++) {
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    // MFMA group.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));
    else
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

    if (UsesVALU) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                        SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // CVT, EXP, FMA interleaving.
    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

      // Round N + 1 CVT.
      if (UsesCvt) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;
        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
                             ExpOffset;
        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));
        else
          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
                                                             SG->getSGID()));
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 3 FMA.
      if (UsesFMA) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        if (!IsPostRA && MFMAChains)
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
              getNextTransPositionInChain(),
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
              true));
        else
          SG->addRule(std::make_shared<EnablesNthMFMA>(
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
              TII, SG->getSGID(), true));
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 2 EXP.
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
            true));
      else
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }
  }

  // PHASE 3: remaining MFMAs.
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  return true;
}
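// Editorial note: the loop above interleaves the remaining TRANS (exp) and
// MFMA work according to the measured DAG shape. For the "large kernel"
// profile (MFMAEnablement == 4, ExpRequirement == 4), MFMARatio = 4/4 = 1 and
// ExpRatio = 1, so each iteration schedules one MFMA group, the optional VALU
// and DS_READ fillers, and one EXP group, before the trailing group of
// MFMAEnablement * 2 MFMAs drains the pipeline.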
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = true;
  }
};

bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
  // Whether the DS_READ is a predecessor of the first four MFMAs in program
  // order.
  class EnablesInitialMFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;
      int MFMAsFound = 0;
      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
            ++MFMAsFound;
            if (MFMAsFound > 4)
              break;
            Cache->push_back(&Elt);
          }
        }
      }

      if (Cache->empty())
        return false;

      auto *DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {
        if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
          return true;
      }
      return false;
    }

    EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the SU is a V_PERM that feeds a DS_WRITE shared with this group.
  class IsPermForDSW final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto *MI = SU->getInstr();
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
        return false;

      bool FitsInGroup = false;
      // Since the rules are applied as we build the collection, we only need
      // to check the first member added against this V_PERM's DS_WRITE
      // successors.
      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();
          if (TII->isDS(*SuccUnit->getInstr()) &&
              SuccUnit->getInstr()->mayStore()) {
            Cache->push_back(SuccUnit);
            FitsInGroup = true;
          }
        }
        return FitsInGroup;
      }

      // The V_PERMs in a group must all share a DS_WRITE successor.
      return any_of(*Cache, [&SU](SUnit *Elt) {
        return any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
          return ThisSucc.getSUnit() == Elt;
        });
      });
    }

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };

  // Whether the SU is a successor of any element in the previous SchedGroup.
  class IsSuccOfPrevGroup final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;
        }
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      // Does the previous group have this SU as a successor?
      return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
        return any_of(Elt->Succs,
                      [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
      });
    }
    IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the combined load width of the group would still be at most 128
  // bits after adding this SU.
  class VMEMSize final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto *MI = SU->getInstr();
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
        return false;
      if (!Collection.size())
        return true;

      int NumBits = 0;

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
        auto Size =
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
        NumBits += Size;
      }

      if (NumBits < 128) {
        assert(TII->isVMEM(*MI) && MI->mayLoad());
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=
            128)
          return true;
      }

      return false;
    }

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
  // that is \p Distance groups before this one in the pipeline.
  class SharesPredWithPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      if (!Cache->size()) {
        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;
          }
        }

        if (!OtherGroup)
          return false;
        if (!OtherGroup->Collection.size())
          return true;

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());
          }
        }

        // If the other group has no V_PERM preds, then this group won't share
        // any.
        if (!Cache->size())
          return false;
      }

      auto *DAG = SyncPipe[0].DAG;
      return any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
        return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
      });
    }
    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };

public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};
static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;

bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");
  SmallVector<SUnit *, 6> DSWithPerms;
  for (auto &SU : DAG->SUnits) {
    auto *I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))
      ++MFMACount;
    else if (TII->isDS(*I)) {
      if (I->mayLoad())
        ++DSRCount;
      else if (I->mayStore() && IsInitial) {
        ++DSWCount;
        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {
            DSWithPerms.push_back(&SU);
            break;
          }
        }
      }
    }
  }

  if (IsInitial) {
    DSWWithPermCount = DSWithPerms.size();
    auto *I = DSWithPerms.begin();
    auto *E = DSWithPerms.end();

    // Count the DS_WRITEs with V_PERM predecessors whose V_PERMs share
    // VMEM_READ producers with the V_PERMs of another DS_WRITE; such pairs are
    // co-scheduled later. Partial overlap counts as a miss.
    DenseMap<MachineInstr *, SUnit *> VMEMLookup;
    SmallVector<SUnit *, 6> Counted;
    for (; I != E; I++) {
      SUnit *Cand = nullptr;
      bool MissedAny = false;
      for (auto &Pred : (*I)->Preds) {
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
          continue;

        if (Cand && llvm::is_contained(Counted, Cand))
          break;

        for (auto &Succ : Pred.getSUnit()->Succs) {
          auto *MI = Succ.getSUnit()->getInstr();
          if (!TII->isVMEM(*MI) || !MI->mayLoad())
            continue;

          if (MissedAny || !VMEMLookup.size()) {
            MissedAny = true;
            VMEMLookup[MI] = *I;
            continue;
          }

          auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
          if (Inserted)
            continue;

          Cand = It->second;
          if (llvm::is_contained(Counted, Cand)) {
            MissedAny = true;
            break;
          }
        }
      }
      if (!MissedAny && Cand) {
        DSWWithSharedVMEMCount += 2;
        Counted.push_back(Cand);
        Counted.push_back(*I);
      }
    }
  }

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }
  }

  PipelineSyncID = 1;
  // Phase 1: Break up DS_READ and MFMA clusters. Interleave MFMA with DS_READ
  // prefetch.

  // Make ready initial MFMA.
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  // Interleave MFMA with the remaining DS_READs.
  for (unsigned I = 0; I < DSRCount - 4; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2a: Loop carried dependency with V_PERM. Schedule VPerm & DS_WRITE
  // as closely as possible to the VMEM_READ they depend on.
  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2b: Loop carried dependency without V_PERM.
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
  // ultimately used by two DS_WRITEs, so co-schedule those pairs.
  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
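// Editorial note: the three loops above partition the DS_WRITEs by how their
// data is produced: first the DSWWithPermCount - DSWWithSharedVMEMCount writes
// whose V_PERM operands do not share VMEM loads, then the
// DSWCount - DSWWithPermCount writes with no V_PERM producer at all, and
// finally the DSWWithSharedVMEMCount writes whose V_PERM predecessors share
// VMEM loads and are therefore co-scheduled in pairs.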
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  case MFMAExpInterleaveID:
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
  case MFMAExpSimpleInterleaveID:
    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}
class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  void apply(ScheduleDAGInstrs *DAGInstrs) override;

private:
  // Organize the candidate SchedGroups and instructions by SyncID.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add the SchedGroup edges for a SCHED_BARRIER.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify the instruction SchedGroups that
  // should not be reordered across the barrier.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  bool initIGLPOpt(SUnit &SU);

  // The order in which the PipelineSolver processes the candidate SchedGroups
  // for each instruction.
  bool IsBottomUp = true;

  // The scheduling phase this mutation is being applied to.
  AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;

public:
  IGroupLPDAGMutation() = default;
  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
};

unsigned SchedGroup::NumSchedGroups = 0;
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
            TII->isTRANS(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           TII->isVMEM(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isVMEM(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isVMEM(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
           TII->isTRANS(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;

    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible, otherwise it returns true.
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.emplace_back(A, B);
    else
      ++MissedEdges;
  }

  return MissedEdges;
}
void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (P(A, B))
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}

bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
}
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {
    if (isFull())
      break;

    if (canAddSU(SU))
      add(SU);
  }
}

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
    auto &SU = *RIter;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }

  add(InitSU);
  assert(MaxSize);
  (*MaxSize)++;
}

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
    auto &SU = *I;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }
}
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool FoundSB = false;
  bool FoundIGLP = false;
  bool ShouldApplyIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      FoundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      FoundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      if (!FoundSB && !FoundIGLP) {
        FoundIGLP = true;
        ShouldApplyIGLP = initIGLPOpt(*R);
      }
    }
  }

  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
    // The PipelineSolver performs the mutation by adding the edges it
    // determined to be best.
    PS.solve();
  }
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                    << MI.getOperand(0).getImm() << "\n");
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();

  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert the mask and erase the bits for instruction types that are implied
  // to be allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA, TRANS.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
  // VALU, SALU, MFMA, TRANS imply ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE imply VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE imply DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
                    << "\n");

  return InvertedMask;
}
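// Worked example (editorial): a SCHED_BARRIER whose mask permits only ALU to
// cross inverts to a mask with the ALU bit cleared; because ALU implies its
// subclasses, the first rule above then also clears VALU, SALU, MFMA and
// TRANS. The resulting SchedGroup therefore captures only the remaining
// categories (the VMEM and DS variants), which are exactly the instructions
// that must stay on their side of the barrier, while ALU-class instructions
// remain free to move across it.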
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
  // to the instruction having side effects.
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}
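// Kernel-side sketch (builtin spelling and numeric masks assumed, per the
// SchedGroupMask encoding above): each SCHED_GROUP_BARRIER carries
// (mask, size, syncid), e.g.
//
//   __builtin_amdgcn_sched_group_barrier(0x008 /*MFMA*/, 1, 0);
//   __builtin_amdgcn_sched_group_barrier(0x100 /*DS_READ*/, 2, 0);
//
// would create, for sync group 0, an MFMA group of one instruction followed by
// a DS_READ group of two; initSchedGroupBarrierPipelineStage() turns each such
// pseudo into the corresponding SchedGroup.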
bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (!S->shouldApplyStrategy(DAG, Phase))
    return false;

  IsBottomUp = S->IsBottomUp;
  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
}
/// \p Phase specifies whether or not this is a reentry into the
/// IGroupLPDAGMutation.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
  return std::make_unique<IGroupLPDAGMutation>(Phase);
}