#define DEBUG_TYPE "igrouplp"
static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));
static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));
static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));
static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
enum class SchedGroupMask {
  NONE = 0u,
  ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3,
  VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6,
  DS = 1u << 7, DS_READ = 1u << 8, DS_WRITE = 1u << 9,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;

class SchedGroup {
private:
  // Mask of instruction types that may be classified into this SchedGroup;
  // the types correspond to the mask operands of SCHED_BARRIER and
  // SCHED_GROUP_BARRIER.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups only synchronize with other SchedGroups of the same SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups.
  int SGID;

  // Count of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;

  // Try to add an edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether MI may be classified into this group.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // SUnits classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies between SU and all SUnits in this SchedGroup. If
  // MakePred is true, SU will be a predecessor of the SUnits in the group,
  // otherwise a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies, tracking which edges were added and the count of
  // missed edges.
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies such that SUnits in this group are ordered before
  // SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);
  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }

  // Append SU to this SchedGroup's collection.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove the most recently added element from the SchedGroup.
  void pop() { Collection.pop_back(); }
  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom-up starting from RIter. RIter
  // itself is added as well, with dependencies so that it is always scheduled
  // at the end of the group.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }
  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};
// Remove all existing edges from a SCHED_BARRIER, SCHED_GROUP_BARRIER, or
// IGLP_OPT instruction.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}
typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
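
// The PipelineSolver assigns instructions that have more than one candidate
// SchedGroup ("conflicts") to concrete groups. It keeps two copies of the
// pipeline state: CurrPipeline, the assignment currently being explored, and
// BestPipeline, the cheapest complete assignment found so far, where cost is
// the number of SchedGroup edges that could not be added to the DAG.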
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups.
  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups;

  // Whether we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Estimate the size of the search problem: the total number of conflicting
  // instructions across all sync pipelines.
  unsigned computeProblemSize();

  // The SyncedInstrs and SyncedSchedGroups, organized for the solver.
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Costs are measured as the number of edges we were unable to add.
  int MissPenalty = 0;
  int BestCost = -1;
  int CurrCost = 0;
  uint64_t BranchesExplored = 0;

  // Index of the conflicting instruction currently being fitted.
  int CurrConflInstNo = 0;
  // Index of the sync pipeline currently being fitted.
  int CurrSyncGroupIdx = 0;
  // Index of the first non-trivial sync pipeline.
  int BeginSyncGroupIdx = 0;

  // Update indices to fit the next conflicting instruction.
  void advancePosition();
  // Recede indices to attempt a better fit for the previous instruction.
  void retreatPosition();

  // The exponential-time algorithm that finds the provably best fit.
  bool solveExact();
  // The polynomial-time algorithm that takes the first suitable fit.
  bool solveGreedy();
  // Record a completed pipeline if it improves on the best; decide whether
  // exploration should stop.
  bool checkOptimal();
  // Restore the solver state before re-running a search.
  void reset();
  // Add the edges corresponding to the SchedGroups as assigned by the solver.
  void makePipeline();

  // Rank the candidate SchedGroups of CurrSU by assignment cost.
  void populateReadyList(SUToCandSGsPair &CurrSU,
                         SmallVectorImpl<std::pair<int, int>> &ReadyList,
                         SmallVectorImpl<SchedGroup> &SyncPipeline);
  // Add the edges from SU to the other SchedGroups in the pipeline, returning
  // the number of missed edges.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Remove the edges passed via AddedEdges.
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
  // Convert the synced maps to arrays for simpler, repeated lookup.
  void convertSyncMapsToArrays();
public:
  // Invoke the solver to map instructions to instruction groups.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups) {
    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
void PipelineSolver::reset() {
  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      // Keep only the group's SCHED_GROUP_BARRIER (if any) when clearing.
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      // Insert the instruction into the pipeline in NodeNum order.
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}
void PipelineSolver::makePipeline() {
  // Preserve the order of the barriers for subsequent SchedGroupBarrier
  // mutations.
  for (auto &SyncPipeline : BestPipeline) {
    for (auto &SG : SyncPipeline) {
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
      }
      // Command-line-requested IGroupLP has no SGBarr.
      if (!SGBarr)
        continue;
      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);
    }
  }

  for (auto &SyncPipeline : BestPipeline) {
    auto I = SyncPipeline.rbegin();
    auto E = SyncPipeline.rend();
    for (; I != E; ++I) {
      auto &GroupA = *I;
      for (auto J = std::next(I); J != E; ++J) {
        auto &GroupB = *J;
        GroupA.link(GroupB);
      }
    }
  }
}
int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int AddedCost = 0;
  bool MakePred = false;

  // The groups in the pipeline are stored in reverse order, so traversing
  // them from last to first visits them in program order. Once we pass the
  // group SU is being assigned to, SU should be linked as a predecessor of
  // the subsequent SchedGroups.
  auto GroupNo = (int)SyncPipeline.size() - 1;
  for (; GroupNo >= 0; GroupNo--) {
    if (SyncPipeline[GroupNo].getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    auto Group = &SyncPipeline[GroupNo];
    AddedCost += Group->link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }

  return AddedCost;
}
void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we added when testing the fit.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}
void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to the next non-trivial pipeline.
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}
void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored all
    // SyncPipelines.
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to the previous non-trivial pipeline.
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}
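
// Build the list of candidate (SchedGroupID, cost) pairs for CurrSU. With the
// cost heuristic enabled, each candidate group is scored by tentatively adding
// its edges (full groups are charged MissPenalty) and the list is sorted
// cheapest-first; otherwise candidates keep their reverse node order and a
// sentinel cost of -1.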
void PipelineSolver::populateReadyList(
    SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
    SmallVectorImpl<SchedGroup> &SyncPipeline) {
  assert(CurrSU.second.size() >= 1);
  auto I = CurrSU.second.rbegin();
  auto E = CurrSU.second.rend();
  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    SchedGroup *Match;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (UseCostHeur) {
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  if (UseCostHeur) {
    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;
              });
  }

  assert(ReadyList.size() == CurrSU.second.size());
}
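
// The exact solver is a depth-first search over every (instruction, group)
// assignment, with two prunes: a branch is abandoned as soon as its running
// cost reaches the best known cost, and the sorted ready list lets us give up
// on all remaining candidates once one is cost-infeasible. Each instruction
// may also be left out of the pipeline entirely at the price of MissPenalty.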
bool PipelineSolver::solveExact() {
  if (checkOptimal())
    return true;

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
    return false;

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // SchedGroup -> Cost pairs, prioritized lowest cost first.
  SmallVector<std::pair<int, int>, 4> ReadyList;
  populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);

  auto I = ReadyList.begin();
  auto E = ReadyList.end();
  for (; I != E; ++I) {
    // If we are trying SGs in least-cost order and the current SG is cost
    // infeasible, then all subsequent SGs will also be cost infeasible, so we
    // can prune.
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
      return false;

    int CandSGID = I->first;
    int AddedCost = 0;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    SchedGroup *Match;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (Match->isFull())
      continue;

    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                      << (int)Match->getMask() << " and ID " << CandSGID
                      << "\n");
    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;
    advancePosition();
    ++BranchesExplored;
    bool FinishedExploring = false;
    // If the cost after adding edges is greater than a known solution,
    // backtrack.
    if (CurrCost < BestCost || BestCost == -1) {
      if (solveExact()) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)
          return true;
      }
    }

    retreatPosition();
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)
      return true;
  }

  // Try the pipeline where the current instruction is omitted. Omitting a
  // problematic instruction may let all the other instructions fit nicely.
  CurrCost += MissPenalty;
  advancePosition();

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
    if (solveExact()) {
      FinishedExploring = BestCost != 0;
      if (!FinishedExploring)
        return true;
    }
  }

  retreatPosition();
  CurrCost -= MissPenalty;
  return FinishedExploring;
}
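
// The greedy solver makes a single pass over the conflicting instructions,
// placing each one in the non-full candidate group with the lowest immediate
// edge cost. It never backtracks, so it runs in polynomial time but may
// return a worse pipeline than the exact solver.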
bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    int BestNodeCost = -1;
    int TempCost;
    SchedGroup *BestGroup = nullptr;
    int BestGroupID = -1;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                      << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

    // The potential SchedGroups were added bottom-up, but the DAG is
    // traversed top-down, so parse over the groups from last to first. If the
    // greedy algorithm fails to do this, its solutions degrade noticeably in
    // more complex cases.
    auto I = CurrSU.second.rbegin();
    auto E = CurrSU.second.rend();
    for (; I != E; ++I) {
      std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
      int CandSGID = *I;
      SchedGroup *Match;
      for (auto &SG : SyncPipeline) {
        if (SG.getSGID() == CandSGID)
          Match = &SG;
      }

      LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                        << (int)Match->getMask() << "\n");

      if (Match->isFull())
        continue;

      TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      if (TempCost < BestNodeCost || BestNodeCost == -1) {
        BestGroup = Match;
        BestNodeCost = TempCost;
        BestGroupID = CandSGID;
      }
      removeEdges(AddedEdges);
      if (BestNodeCost == 0)
        break;
    }

    if (BestGroupID != -1) {
      BestGroup->add(*CurrSU.first);
      addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
      LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                        << (int)BestGroup->getMask() << "\n");
      BestCost += TempCost;
    } else
      BestCost += MissPenalty;

    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
  return false;
}
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}
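
// Entry point: run the greedy solver by default. When the exact solver is
// requested on the command line, or the problem size is within
// amdgpu-igrouplp-exact-solver-cutoff, the greedy result seeds BestCost and
// the exact search then tries to improve on it.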
void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  if (EnableExactSolver || BelowCutoff) {
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the greedy algorithm by default.
    solveGreedy();
  }

  makePipeline();
}
enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };

// Implement an IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;

public:
  // Add SchedGroups to SyncedSchedGroups to implement this strategy.
  virtual void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;

  // Returns true if this strategy should be applied to the given DAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};
class MFMASmallGemmOpt final : public IGLPStrategy {
public:
  void applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {}
};
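
// MFMASmallGemmOpt interleaves LDS traffic with matrix math: for each MFMA in
// the region it emits pipeline stages pairing a small DS SchedGroup with a
// single-MFMA SchedGroup, three such pairs per counted MFMA.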
void MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    // A DS group of size 2 ahead of each single-MFMA group.
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }
}
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}
class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;
  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs have no edges added between
  // them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Track instructions that can be mapped to multiple sched groups.
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Invert a SCHED_BARRIER's mask to identify the instruction SchedGroups
  // that should not be reordered across it.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  void initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  IGroupLPDAGMutation() = default;
};
unsigned SchedGroup::NumSchedGroups = 0;
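
// Classify MI against this group's mask: each SchedGroupMask bit is matched
// by the corresponding SIInstrInfo query (isVALU, isSALU, isMFMAorWMMA,
// isVMEM, isDS, ...), with the READ/WRITE variants refined via
// MI.mayLoad() / MI.mayStore().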
bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  // ... analogous checks for the remaining SchedGroupMask bits, ending with:

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                    << format_hex((int)SGMask, 10, true)
                    << (Result ? " could classify " : " unable to classify ")
                    << MI);

  return Result;
}
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;
    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible, otherwise it returns true.
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.push_back(std::pair(A, B));
    else
      ++MissedEdges;
  }

  return MissedEdges;
}

void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (P(A, B))
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}
bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::const_instr_iterator B = MI.getIterator();
  MachineBasicBlock::const_instr_iterator E = std::next(B);
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](const MachineInstr &MI) {
    return canAddMI(MI);
  });
}
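
// The initSchedGroup overloads walk the DAG's SUnits (bottom-up for the
// synced variants) and either add matching instructions to the group
// directly, or record the group's SGID as a candidate for each matching
// instruction in SyncedInstrs; the PipelineSolver later resolves instructions
// with multiple candidates.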
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {
    if (isFull())
      break;

    if (canAddSU(SU))
      add(SU);
  }
}
void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
    auto &SU = *RIter;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }

  // The barrier itself occupies a slot at the end of the group.
  add(InitSU);
  assert(MaxSize);
  (*MaxSize)++;
}
void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
    auto &SU = *I;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }
}
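
// apply() is the mutation entry point. It scans the region bottom-up for
// SCHED_BARRIER, SCHED_GROUP_BARRIER, and IGLP_OPT instructions, builds the
// corresponding SchedGroups, and finally lets the PipelineSolver add the
// ordering edges. Barriers and IGLP_OPT are treated as mutually exclusive.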
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool foundSB = false;
  bool foundIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      foundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      foundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      if (!foundSB && !foundIGLP)
        initIGLPOpt(*R);
      foundIGLP = true;
    }
  }

  if (foundSB || foundIGLP) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
    // The PipelineSolver performs the mutation by adding the edges it
    // determined.
    PS.solve();
  }
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetEdges(SchedBarrier, DAG);
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();
  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}
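
// A SCHED_BARRIER mask names the instruction classes that MAY be scheduled
// across the barrier; its complement is the SchedGroup that must not be
// reordered across it. Inversion also respects the implications between
// bits: if ALU may cross, so may VALU/SALU/MFMA, and if any of those
// sub-classes may cross, the umbrella ALU bit is dropped from the group as
// well. VMEM/DS and their READ/WRITE variants follow the same pattern.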
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert the mask and erase bits for types of instructions that are implied
  // to be allowed to cross the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &=
        ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
  // VALU, SALU, MFMA imply ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE imply VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE imply DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  return InvertedMask;
}
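
// A SCHED_GROUP_BARRIER carries three immediates: the SchedGroupMask, the
// number of instructions in the group, and a SyncID. For instance, the clang
// builtin __builtin_amdgcn_sched_group_barrier(8, 1, 0) requests one MFMA
// (mask bit 1u << 3) in sync pipeline 0.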
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added
  // due to the instruction having side effects.
  resetEdges(*RIter, DAG);
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}
void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (S->shouldApplyStrategy(DAG))
    S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
}
std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
  return std::make_unique<IGroupLPDAGMutation>();
}