32#define DEBUG_TYPE "igrouplp"
38 cl::desc(
"Whether to use the exponential time solver to fit "
39 "the instructions to the pipeline as closely as "
45 cl::desc(
"The maximum number of scheduling group conflicts "
46 "which we attempt to solve with the exponential time "
47 "exact solver. Problem sizes greater than this will"
48 "be solved by the less accurate greedy algorithm. Selecting "
49 "solver by size is superseded by manually selecting "
50 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
54 cl::desc(
"The amount of branches that we are willing to explore with"
55 "the exact algorithm before giving up."));
59 cl::desc(
"Whether to use the cost heuristic to make choices as we "
60 "traverse the search space using the exact solver. Defaulted "
61 "to on, and if turned off, we will use the node order -- "
62 "attempting to put the later nodes in the later sched groups. "
63 "Experimentally, results are mixed, so this should be set on a "
64 "case-by-case basis."));
68enum class SchedGroupMask {
82 ALL = ALU | VALU |
SALU |
MFMA |
VMEM | VMEM_READ | VMEM_WRITE |
DS |
83 DS_READ | DS_WRITE |
TRANS | LDSDMA,
92class InstructionRule {
98 std::optional<SmallVector<SUnit *, 4>> Cache;
108 bool NeedsCache =
false)
115 virtual ~InstructionRule() =
default;
128 SchedGroupMask SGMask;
131 std::optional<unsigned> MaxSize;
144 static unsigned NumSchedGroups;
161 bool canAddSU(
SUnit &SU)
const;
166 void link(
SUnit &SU,
bool MakePred =
false);
170 int link(
SUnit &SU,
bool MakePred,
171 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
180 void link(SchedGroup &OtherGroup);
183 bool isFull()
const {
return MaxSize && Collection.
size() >= *MaxSize; }
189 void addRule(std::shared_ptr<InstructionRule> NewRule) {
194 bool allowedByRules(
const SUnit *SU,
196 for (
auto &Rule : Rules) {
197 if (!Rule->apply(SU, Collection, SyncPipe))
204 void add(
SUnit &SU) {
206 <<
format_hex((
int)SGMask, 10,
true) <<
" adding "
212 void pop() { Collection.
pop_back(); }
215 void findCandidateSUnits(
T Begin,
T End,
216 SUnitsToCandidateSGsMap &SyncedInstrs);
221 void findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs);
223 int getSyncID() {
return SyncID; }
225 int getSGID() {
return SGID; }
227 SchedGroupMask
getMask() {
return SGMask; }
229 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
231 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG),
TII(
TII) {
232 SGID = NumSchedGroups++;
235 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
int SyncID,
237 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG),
TII(
TII) {
238 SGID = NumSchedGroups++;
242using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
254class PipelineSolver {
267 bool NeedsSolver =
false;
271 unsigned computeProblemSize();
282 int CurrConflInstNo = 0;
284 int CurrSyncGroupIdx = 0;
286 int BeginSyncGroupIdx = 0;
292 bool IsBottomUp =
true;
295 void advancePosition();
298 void retreatPosition();
307 template <
typename T>
308 void greedyFind(std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
313 template <
typename T>
320 template <
typename T>
void linkSchedGroups(
T I,
T E);
324 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
330 class EdgeSetBuilder {
336 bool Initialized =
false;
340 template <
bool ComputePreds>
354 : SU(SU), SyncPipeline(SyncPipeline), IsBottomUp(IsBottomUp) {}
360 int build(
int SGID, std::list<std::pair<SUnit *, SUnit *>> &NewEdges);
363 template <
typename T>
365 std::list<std::pair<SUnit *, SUnit *>> &NewEdges);
371 template <
typename T>
372 int linkSUnit(
SUnit *SU,
int SGID,
373 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E);
375 void removeEdges(
const std::list<std::pair<SUnit *, SUnit *>> &AddedEdges);
377 void convertSyncMapsToArrays();
389 : DAG(DAG), SyncedInstrs(SyncedInstrs),
390 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
392 for (
auto &PipelineInstrs : SyncedInstrs) {
393 if (!PipelineInstrs.second.
empty()) {
402 convertSyncMapsToArrays();
404 CurrPipeline = BestPipeline;
406 while (
static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.
size() &&
407 PipelineInstrs[BeginSyncGroupIdx].
empty())
410 if (
static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.
size())
415void PipelineSolver::reset() {
417 for (
auto &SyncPipeline : CurrPipeline) {
418 for (
auto &SG : SyncPipeline) {
420 SG.Collection.
clear();
424 if (SchedBarr != TempCollection.
end())
425 SG.Collection.push_back(*SchedBarr);
429 CurrSyncGroupIdx = BeginSyncGroupIdx;
434void PipelineSolver::convertSyncMapsToArrays() {
435 for (
auto &SyncPipe : SyncedSchedGroups) {
436 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
439 int PipelineIDx = SyncedInstrs.size() - 1;
440 PipelineInstrs.resize(SyncedInstrs.size());
441 for (
auto &SyncInstrMap : SyncedInstrs) {
442 for (
auto &SUsToCandSGs : SyncInstrMap.second) {
443 if (PipelineInstrs[PipelineIDx].empty()) {
444 PipelineInstrs[PipelineIDx].push_back(
445 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
448 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
451 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
452 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
454 PipelineInstrs[PipelineIDx].insert(
455 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
461template <
typename T>
void PipelineSolver::linkSchedGroups(
T I,
T E) {
462 for (;
I !=
E; ++
I) {
464 for (
auto J = std::next(
I); J !=
E; ++J) {
471void PipelineSolver::makePipeline() {
473 for (
auto &SyncPipeline : BestPipeline) {
475 for (
auto &SG : SyncPipeline) {
478 SUnit *SGBarr =
nullptr;
479 for (
auto &SU : SG.Collection) {
480 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
487 SG.link(*SGBarr,
false);
491 for (
auto &SyncPipeline : BestPipeline) {
492 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
493 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
498int PipelineSolver::linkSUnit(
499 SUnit *SU,
int SGID, std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
501 bool MakePred =
false;
504 if (
I->getSGID() == SGID) {
509 AddedCost += Group.link(*SU, MakePred, AddedEdges);
515template <
bool ComputePreds>
516void PipelineSolver::EdgeSetBuilder::computeReachable(
518 if (!Reachable.insert(Start).second)
523 while (!WorkList.
empty()) {
526 for (
const SDep &Dep : ComputePreds ? Current->
Preds : Current->
Succs) {
527 if (Reachable.insert(Dep.getSUnit()).second)
535 computeReachable<
true>(Preds, Start);
540 computeReachable<
false>(Succs, Start);
543int PipelineSolver::EdgeSetBuilder::build(
544 int SGID, std::list<std::pair<SUnit *, SUnit *>> &NewEdges) {
546 computePreds(InitialPreds, SU);
547 computeSuccs(Succs, SU);
552 return IsBottomUp ? buildImpl(SGID,
reverse(SyncPipeline), NewEdges)
560int PipelineSolver::EdgeSetBuilder::buildImpl(
562 std::list<std::pair<SUnit *, SUnit *>> &NewEdges) {
580 bool MakePred =
false;
581 for (SchedGroup &SG : SchedGroups) {
582 if (SG.getSGID() == SGID) {
587 for (
SUnit *
A : SG.Collection) {
588 if (
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
599 NewEdges.emplace_back(SU,
A);
608 NewEdges.emplace_back(
A, SU);
609 computePreds(Preds,
A);
616int PipelineSolver::addEdges(
618 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
628 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
rbegin(),
630 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.
begin(),
634void PipelineSolver::removeEdges(
635 const std::list<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
638 for (
auto &PredSuccPair : EdgesToRemove) {
639 SUnit *Pred = PredSuccPair.first;
640 SUnit *Succ = PredSuccPair.second;
643 return P.getSUnit() == Pred && P.isArtificial();
645 if (Match != Succ->
Preds.end())
650void PipelineSolver::advancePosition() {
653 if (
static_cast<size_t>(CurrConflInstNo) >=
654 PipelineInstrs[CurrSyncGroupIdx].
size()) {
658 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
659 PipelineInstrs[CurrSyncGroupIdx].empty())
664void PipelineSolver::retreatPosition() {
665 assert(CurrConflInstNo >= 0);
666 assert(CurrSyncGroupIdx >= 0);
668 if (CurrConflInstNo > 0) {
673 if (CurrConflInstNo == 0) {
676 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
681 while (PipelineInstrs[CurrSyncGroupIdx].empty())
684 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
688bool PipelineSolver::checkOptimal() {
689 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
690 if (BestCost == -1 || CurrCost < BestCost) {
691 BestPipeline = CurrPipeline;
698 bool DoneExploring =
false;
699 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
700 DoneExploring =
true;
702 return (DoneExploring || BestCost == 0);
706void PipelineSolver::populateReadyList(
708 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
709 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
710 assert(CurrSU.second.size() >= 1);
712 for (;
I !=
E; ++
I) {
713 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
715 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
716 return SG.getSGID() == CandSGID;
721 if (Match->isFull()) {
722 ReadyList.push_back(std::pair(*
I, MissPenalty));
726 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
727 ReadyList.push_back(std::pair(*
I, TempCost));
728 removeEdges(AddedEdges);
730 ReadyList.push_back(std::pair(*
I, -1));
736 assert(ReadyList.size() == CurrSU.second.size());
739bool PipelineSolver::solveExact() {
743 if (
static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
746 assert(
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
747 assert(
static_cast<size_t>(CurrConflInstNo) <
748 PipelineInstrs[CurrSyncGroupIdx].
size());
749 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
751 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
756 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.
rbegin(),
757 CurrSU.second.rend())
758 : populateReadyList(ReadyList, CurrSU.second.
begin(),
759 CurrSU.second.end());
761 auto *
I = ReadyList.
begin();
762 auto *
E = ReadyList.
end();
763 for (;
I !=
E; ++
I) {
767 if (BestCost != -1 && (CurrCost +
I->second > BestCost))
770 int CandSGID =
I->first;
772 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
773 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
775 for (
auto &SG : SyncPipeline) {
776 if (SG.getSGID() == CandSGID)
783 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
787 << (
int)Match->getMask() <<
"and ID " << CandSGID
789 Match->add(*CurrSU.first);
790 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
791 LLVM_DEBUG(
dbgs() <<
"Cost of Assignment: " << AddedCost <<
"\n");
792 CurrCost += AddedCost;
795 bool FinishedExploring =
false;
798 if (CurrCost < BestCost || BestCost == -1) {
800 FinishedExploring = BestCost != 0;
801 if (!FinishedExploring)
807 CurrCost -= AddedCost;
808 removeEdges(AddedEdges);
810 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
811 if (FinishedExploring)
818 CurrCost += MissPenalty;
821 LLVM_DEBUG(
dbgs() <<
"NOT Assigned (" << CurrSU.first->NodeNum <<
")\n");
823 bool FinishedExploring =
false;
824 if (CurrCost < BestCost || BestCost == -1) {
826 bool FinishedExploring = BestCost != 0;
827 if (!FinishedExploring)
833 CurrCost -= MissPenalty;
834 return FinishedExploring;
838void PipelineSolver::greedyFind(
839 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges,
T I,
T E) {
840 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
844 std::list<std::pair<SUnit *, SUnit *>> Edges;
847 std::optional<GroupInfo> Best;
849 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
851 <<
") in Pipeline # " << CurrSyncGroupIdx <<
"\n");
853 EdgeSetBuilder Builder(CurrSU.first, SyncPipeline, IsBottomUp);
859 for (;
I !=
E; ++
I) {
861 SchedGroup *Match =
llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
862 return SG.getSGID() == CandSGID;
866 LLVM_DEBUG(
dbgs() <<
"Trying SGID # " << CandSGID <<
" with Mask "
867 << (
int)Match->getMask() <<
"\n");
869 if (Match->isFull()) {
873 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
874 LLVM_DEBUG(
dbgs() <<
"SGID # " << CandSGID <<
" has conflicting rule\n");
878 std::list<std::pair<SUnit *, SUnit *>> TempEdges;
879 int TempCost = Builder.build(CandSGID, TempEdges);
882 if (!Best || TempCost < Best->Cost) {
883 Best = {Match, TempEdges, TempCost};
890 SchedGroup *SG = Best->SG;
891 std::list<std::pair<SUnit *, SUnit *>> &Edges = Best->Edges;
893 SG->add(*CurrSU.first);
894 if (AddedEdges.empty())
897 AddedEdges.splice(std::prev(AddedEdges.cend()), Edges);
899 for (
const std::pair<SUnit *, SUnit *> &
E : Edges) {
900 if (!SG->tryAddEdge(
E.first,
E.second))
904 LLVM_DEBUG(
dbgs() <<
"Best Group has ID: " << SG->getSGID() <<
" and Mask"
905 << (
int)SG->getMask() <<
"\n");
906 BestCost += Best->Cost;
908 BestCost += MissPenalty;
911bool PipelineSolver::solveGreedy() {
913 std::list<std::pair<SUnit *, SUnit *>> AddedEdges;
915 while (
static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
916 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
918 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
919 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
922 BestPipeline = CurrPipeline;
923 removeEdges(AddedEdges);
927unsigned PipelineSolver::computeProblemSize() {
928 unsigned ProblemSize = 0;
929 for (
auto &PipeConflicts : PipelineInstrs) {
930 ProblemSize += PipeConflicts.size();
936void PipelineSolver::solve() {
940 unsigned ProblemSize = computeProblemSize();
943 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
944 MissPenalty = (ProblemSize / 2) + 1;
947 if (EnableExactSolver || BelowCutoff) {
951 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
955 LLVM_DEBUG(
dbgs() <<
"Exact produced best cost of " << BestCost <<
"\n");
960 LLVM_DEBUG(
dbgs() <<
"Greedy produced best cost of " << BestCost <<
"\n");
977 virtual bool applyIGLPStrategy(
986 bool IsBottomUp =
true;
991 virtual ~IGLPStrategy() =
default;
994class MFMASmallGemmOpt final :
public IGLPStrategy {
997 bool applyIGLPStrategy(
1008 : IGLPStrategy(DAG,
TII) {
1013bool MFMASmallGemmOpt::applyIGLPStrategy(
1018 unsigned MFMACount = 0;
1020 if (
TII->isMFMAorWMMA(
I))
1023 const unsigned PipelineSyncID = 0;
1024 SchedGroup *SG =
nullptr;
1025 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
1026 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1027 SchedGroupMask::DS, 2, PipelineSyncID, DAG,
TII);
1028 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1030 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1031 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
1032 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1038class MFMAExpInterleaveOpt final :
public IGLPStrategy {
1041 static unsigned TransPipeCount;
1043 static unsigned MFMAPipeCount;
1045 static unsigned AddPipeCount;
1047 static unsigned MFMAEnablement;
1049 static unsigned ExpRequirement;
1051 static unsigned MFMAChains;
1056 static bool HasChainBetweenCvt;
1058 static std::optional<unsigned> FirstPipeDSR;
1067 class IsPipeExp final :
public InstructionRule {
1072 auto *DAG = SyncPipe[0].DAG;
1074 if (Cache->empty()) {
1075 auto I = DAG->SUnits.rbegin();
1076 auto E = DAG->SUnits.rend();
1077 for (;
I !=
E;
I++) {
1078 if (
TII->isMFMAorWMMA(*
I->getInstr()))
1079 Cache->push_back(&*
I);
1085 auto Reaches =
any_of(*Cache, [&SU, &DAG](
SUnit *TargetSU) {
1086 return DAG->IsReachable(TargetSU,
const_cast<SUnit *
>(SU));
1091 IsPipeExp(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1092 : InstructionRule(
TII, SGID, NeedsCache) {}
1097 class EnablesNthMFMA final :
public InstructionRule {
1104 bool FoundTrans =
false;
1105 unsigned Counter = 1;
1106 auto *DAG = SyncPipe[0].DAG;
1108 if (Cache->empty()) {
1109 auto I = DAG->SUnits.begin();
1110 auto E = DAG->SUnits.end();
1111 for (;
I !=
E;
I++) {
1112 if (FoundTrans &&
TII->isMFMAorWMMA(*
I->getInstr())) {
1114 Cache->push_back(&*
I);
1119 if (!FoundTrans &&
TII->isTRANS(
I->getInstr()->getOpcode()))
1126 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
1130 bool NeedsCache =
false)
1136 class EnablesNthMFMAInChain final :
public InstructionRule {
1144 auto *DAG = SyncPipe[0].DAG;
1146 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1149 if (Cache->empty()) {
1150 auto *TempSU = ChainSeed;
1155 for (
auto &Succ : TempSU->Succs) {
1156 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1157 TempSU = Succ.getSUnit();
1166 Cache->push_back(TempSU);
1172 return DAG->IsReachable((*Cache)[0],
const_cast<SUnit *
>(SU));
1175 EnablesNthMFMAInChain(
unsigned Number,
SUnit *ChainSeed,
1177 bool NeedsCache =
false)
1179 ChainSeed(ChainSeed) {}
1185 class LessThanNSuccs final :
public InstructionRule {
1188 bool HasIntermediary =
false;
1193 if (!SyncPipe.
size())
1197 return Succ.getKind() == SDep::Data;
1199 if (SuccSize >=
Size)
1202 if (HasIntermediary) {
1203 for (
auto Succ : SU->
Succs) {
1206 return SuccSucc.getKind() == SDep::Data;
1208 if (SuccSize >=
Size)
1216 bool HasIntermediary =
false,
bool NeedsCache =
false)
1217 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1218 HasIntermediary(HasIntermediary) {}
1225 class GreaterThanOrEqualToNSuccs final :
public InstructionRule {
1228 bool HasIntermediary =
false;
1233 if (!SyncPipe.
size())
1237 return Succ.getKind() == SDep::Data;
1239 if (SuccSize >=
Size)
1242 if (HasIntermediary) {
1243 for (
auto Succ : SU->
Succs) {
1246 return SuccSucc.getKind() == SDep::Data;
1248 if (SuccSize >=
Size)
1256 unsigned SGID,
bool HasIntermediary =
false,
1257 bool NeedsCache =
false)
1258 : InstructionRule(
TII, SGID, NeedsCache),
Size(
Size),
1259 HasIntermediary(HasIntermediary) {}
1263 class IsCvt final :
public InstructionRule {
1268 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1269 Opc == AMDGPU::V_CVT_I32_F32_e32;
1271 IsCvt(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1272 : InstructionRule(
TII, SGID, NeedsCache) {}
1276 class IsFMA final :
public InstructionRule {
1283 IsFMA(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1284 : InstructionRule(
TII, SGID, NeedsCache) {}
1288 class IsPipeAdd final :
public InstructionRule {
1294 IsPipeAdd(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
1295 : InstructionRule(
TII, SGID, NeedsCache) {}
1300 class IsSuccOfPrevNthGroup final :
public InstructionRule {
1302 unsigned Distance = 1;
1307 SchedGroup *OtherGroup =
nullptr;
1308 if (!SyncPipe.
size())
1311 for (
auto &PipeSG : SyncPipe) {
1312 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1313 OtherGroup = &PipeSG;
1318 if (!OtherGroup->Collection.size())
1321 for (
auto &OtherEle : OtherGroup->Collection) {
1322 for (
auto &Succ : OtherEle->Succs) {
1323 if (Succ.getSUnit() == SU && Succ.getKind() ==
SDep::Data)
1331 unsigned SGID,
bool NeedsCache =
false)
1332 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1337 class IsReachableFromPrevNthGroup final :
public InstructionRule {
1339 unsigned Distance = 1;
1344 SchedGroup *OtherGroup =
nullptr;
1345 if (!SyncPipe.
size())
1348 for (
auto &PipeSG : SyncPipe) {
1349 if ((
unsigned)PipeSG.getSGID() == SGID - Distance)
1350 OtherGroup = &PipeSG;
1355 if (!OtherGroup->Collection.size())
1358 auto *DAG = SyncPipe[0].DAG;
1360 for (
auto &OtherEle : OtherGroup->Collection)
1361 if (DAG->IsReachable(
const_cast<SUnit *
>(SU), OtherEle))
1366 IsReachableFromPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
1367 unsigned SGID,
bool NeedsCache =
false)
1368 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
1372 class OccursAtOrAfterNode final :
public InstructionRule {
1383 bool NeedsCache =
false)
1389 class IsExactMFMA final :
public InstructionRule {
1397 if (!SU || !
TII->isMFMAorWMMA(*ChainSeed->
getInstr()))
1400 if (Cache->empty()) {
1401 auto *TempSU = ChainSeed;
1406 for (
auto &Succ : TempSU->Succs) {
1407 if (
TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1408 TempSU = Succ.getSUnit();
1417 Cache->push_back(TempSU);
1423 return (*Cache)[0] == SU;
1427 unsigned SGID,
bool NeedsCache =
false)
1429 ChainSeed(ChainSeed) {}
1435 class OccursAfterExp final :
public InstructionRule {
1440 auto *DAG = SyncPipe[0].DAG;
1441 if (Cache->empty()) {
1442 for (
auto &SU : DAG->SUnits)
1444 Cache->push_back(&SU);
1451 return SU->
NodeNum > (*Cache)[0]->NodeNum;
1455 bool NeedsCache =
false)
1456 : InstructionRule(
TII, SGID, NeedsCache) {}
1460 bool applyIGLPStrategy(
1469 : IGLPStrategy(DAG,
TII) {
1474unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
1475unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
1476unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
1477unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
1478unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
1479unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
1480bool MFMAExpInterleaveOpt::HasCvt =
false;
1481bool MFMAExpInterleaveOpt::HasChainBetweenCvt =
false;
1482std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1491 auto isBitPack = [](
unsigned Opc) {
1492 return Opc == AMDGPU::V_PACK_B32_F16_e64 ||
Opc == AMDGPU::V_PERM_B32_e64;
1495 auto isCvt = [](
unsigned Opc) {
1496 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
Opc == AMDGPU::V_CVT_I32_F32_e32;
1499 auto isAdd = [](
unsigned Opc) {
return Opc == AMDGPU::V_ADD_F32_e32; };
1506 if (SU.
Succs.size() >= 7)
1508 for (
auto &Succ : SU.
Succs) {
1509 if (Succ.getSUnit()->Succs.size() >= 7)
1528 if (!(PackSUs.
size() && MFMAPipeCands.
size() && ExpPipeCands.
size()))
1533 std::optional<SUnit *> TempMFMA;
1534 std::optional<SUnit *> TempExp;
1536 for (
auto &PredSU : ExpPipeCands) {
1537 for (
auto &SuccSU : MFMAPipeCands) {
1550 if (!(TempExp && TempMFMA))
1553 HasChainBetweenCvt =
none_of((*TempExp)->Succs, [&isCvt](
SDep &Succ) {
1554 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1558 for (
auto &SuccSU : MFMAPipeCands) {
1559 if (MFMAPipeSUs.
size() &&
1560 any_of(MFMAPipeSUs, [&SuccSU](
SUnit *PotentialMatch) {
1561 return PotentialMatch->
NodeNum == SuccSU->NodeNum;
1565 for (
auto &PredSU : ExpPipeCands) {
1573 MFMAPipeCount = MFMAPipeSUs.
size();
1575 assert(TempExp && TempMFMA);
1576 assert(MFMAPipeCount > 0);
1578 std::optional<SUnit *> TempCvt;
1579 for (
auto &SuccSU : CvtSUs) {
1587 if (TempCvt.has_value()) {
1588 for (
auto &SuccSU : MFMAPipeSUs) {
1597 for (
auto &MFMAPipeSU : MFMAPipeSUs) {
1601 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1603 MFMAChainSeeds.push_back(MFMAPipeSU);
1611 for (
auto Pred : MFMAChainSeeds[0]->Preds) {
1612 if (
TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1613 Pred.getSUnit()->getInstr()->mayLoad())
1614 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1618 unsigned PackSuccCount =
1624 unsigned PackPredCount =
1626 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1627 return isBitPack(Opc);
1631 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1632 return isBitPack(Opc);
1635 if (PackPred == (*TempMFMA)->Preds.end())
1643 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1647 MFMAEnablement *= PackSuccCount;
1652 return DAG->
IsReachable(PackPred->getSUnit(), ExpBase);
1655 ExpRequirement *= PackPredCount;
1665 MFMAChainSeeds.clear();
1672bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1677 bool IsSmallKernelType =
1678 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1679 bool IsLargeKernelType =
1680 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1682 if (!(IsSmallKernelType || IsLargeKernelType))
1688 unsigned PipelineSyncID = 0;
1689 SchedGroup *SG =
nullptr;
1691 unsigned MFMAChain = 0;
1692 unsigned PositionInChain = 0;
1693 unsigned CurrMFMAForTransPosition = 0;
1695 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1696 &CurrMFMAForTransPosition]() {
1697 CurrMFMAForTransPosition += MFMAEnablement;
1698 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1699 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1702 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1703 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1704 return (TempMFMAForTrans / MFMAChains);
1707 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1708 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1709 return TempMFMAForTrans % MFMAChains;
1712 unsigned CurrMFMAPosition = 0;
1713 unsigned MFMAChainForMFMA = 0;
1714 unsigned PositionInChainForMFMA = 0;
1716 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1717 &PositionInChainForMFMA]() {
1719 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1720 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1724 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1726 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1727 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1728 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1729 bool UsesVALU = IsSmallKernelType;
1734 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1735 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1736 if (!IsPostRA && MFMAChains) {
1737 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1738 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1742 std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1743 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1744 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1747 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1748 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG,
TII);
1749 if (!IsPostRA && MFMAChains) {
1750 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1751 getNextTransPositionInChain(),
1752 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1754 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1755 SG->getSGID(),
true));
1756 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1757 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1761 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1762 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1763 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1765 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1769 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1770 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG,
TII);
1771 if (!IsPostRA && MFMAChains)
1772 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1773 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
true));
1775 SG->addRule(std::make_shared<EnablesNthMFMA>(1,
TII, SG->getSGID(),
true));
1776 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1777 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1778 HasChainBetweenCvt));
1779 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1781 incrementTransPosition();
1784 for (
unsigned I = 0;
I < ExpRequirement;
I++) {
1787 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1788 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1789 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1790 if (HasChainBetweenCvt)
1791 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1792 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1794 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1795 1 + (2 + UsesFMA) *
I,
TII, SG->getSGID()));
1796 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1801 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1802 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1803 if (!IsPostRA && MFMAChains) {
1804 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1805 getNextTransPositionInChain(),
1806 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
true));
1808 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1809 TII, SG->getSGID(),
true));
1810 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1811 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1815 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1816 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1817 if (!IsPostRA && MFMAChains)
1818 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1819 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1822 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1,
TII,
1823 SG->getSGID(),
true));
1824 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1825 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1826 HasChainBetweenCvt));
1827 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1832 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1833 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1834 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1835 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1836 8,
TII, SG->getSGID(), HasChainBetweenCvt));
1837 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1842 unsigned MFMARatio =
1843 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1846 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1848 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1849 ? TransPipeCount - (2 * ExpRequirement)
1851 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1853 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1854 ? MFMAPipeCount - (MFMAEnablement * 2)
1856 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1858 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1859 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1861 for (
unsigned I = 0;
I < LoopSize;
I++) {
1862 if (!(
I * ExpRatio % ExpRequirement))
1863 incrementTransPosition();
1866 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1867 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG,
TII);
1868 if (!IsPostRA && MFMAChains)
1869 SG->addRule(std::make_shared<IsExactMFMA>(
1870 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA],
TII,
1871 SG->getSGID(),
true));
1873 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1874 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1875 incrementMFMAPosition();
1878 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1879 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG,
TII);
1880 SG->addRule(std::make_shared<IsPipeAdd>(
TII, SG->getSGID()));
1881 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1884 if (UsesDSRead && !(
I % 4)) {
1885 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1886 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG,
TII);
1887 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR,
TII,
1889 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1893 for (
unsigned J = 0; J < ExpRatio; J++) {
1894 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (
I + 1);
1895 auto MaxMFMAOffset =
1896 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1900 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1901 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1902 SG->addRule(std::make_shared<IsCvt>(
TII, SG->getSGID()));
1903 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1904 auto DSROffset =
I / 4 + 1;
1905 auto MaxDSROffset = MaxMFMAOffset / 4;
1907 auto ExpOffset =
I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1908 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1909 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1911 if (HasChainBetweenCvt)
1912 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1913 CurrentOffset,
TII, SG->getSGID()));
1915 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset,
TII,
1917 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1922 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1923 SchedGroupMask::VALU, 1, PipelineSyncID, DAG,
TII);
1924 if (!IsPostRA && MFMAChains)
1925 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1926 getNextTransPositionInChain(),
1927 MFMAChainSeeds[getNextTransMFMAChain()],
TII, SG->getSGID(),
1930 SG->addRule(std::make_shared<EnablesNthMFMA>(
1931 (((
I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1932 TII, SG->getSGID(),
true));
1933 SG->addRule(std::make_shared<IsFMA>(
TII, SG->getSGID()));
1934 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1938 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1939 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1940 if (!IsPostRA && MFMAChains)
1941 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1942 PositionInChain, MFMAChainSeeds[MFMAChain],
TII, SG->getSGID(),
1945 SG->addRule(std::make_shared<EnablesNthMFMA>(
1946 (((
I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1947 TII, SG->getSGID(),
true));
1948 SG->addRule(std::make_shared<IsPipeExp>(
TII, SG->getSGID(),
true));
1949 SG->addRule(std::make_shared<LessThanNSuccs>(8,
TII, SG->getSGID(),
1950 HasChainBetweenCvt));
1951 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1956 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1957 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG,
TII);
1958 SG->addRule(std::make_shared<OccursAfterExp>(
TII, SG->getSGID(),
true));
1959 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1963class MFMAExpSimpleInterleaveOpt final :
public IGLPStrategy {
1965 bool applyIGLPStrategy(
1976 : IGLPStrategy(DAG,
TII) {
1981bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1986 unsigned MFMACount = 0;
1988 if (
TII->isMFMAorWMMA(
I))
1991 const unsigned PipelineSyncID = 0;
1992 for (
unsigned I = 0;
I < MFMACount * 3; ++
I) {
1993 SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1994 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG,
TII);
1995 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
1997 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1998 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
1999 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2005class MFMASmallGemmSingleWaveOpt final :
public IGLPStrategy {
2008 class EnablesInitialMFMA final :
public InstructionRule {
2012 if (!SyncPipe.
size())
2015 if (!Cache->size()) {
2016 for (
auto &Elt : SyncPipe[0].DAG->
SUnits) {
2017 if (
TII->isMFMAorWMMA(*Elt.getInstr())) {
2021 Cache->push_back(&Elt);
2026 auto *DAG = SyncPipe[0].DAG;
2027 for (
auto &Elt : *Cache) {
2035 bool NeedsCache =
false)
2036 : InstructionRule(
TII, SGID, NeedsCache) {}
2040 class IsPermForDSW final :
public InstructionRule {
2045 if (
MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
2048 bool FitsInGroup =
false;
2050 if (!Collection.
size()) {
2051 for (
auto &Succ : SU->
Succs) {
2052 SUnit *SuccUnit = Succ.getSUnit();
2055 Cache->push_back(SuccUnit);
2066 return ThisSucc.getSUnit() == Elt;
2071 IsPermForDSW(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
2072 : InstructionRule(
TII, SGID, NeedsCache) {}
2076 class IsSuccOfPrevGroup final :
public InstructionRule {
2080 SchedGroup *OtherGroup =
nullptr;
2081 for (
auto &PipeSG : SyncPipe) {
2082 if ((
unsigned)PipeSG.getSGID() == SGID - 1) {
2083 OtherGroup = &PipeSG;
2089 if (!OtherGroup->Collection.size())
2093 return any_of(OtherGroup->Collection, [&SU](
SUnit *Elt) {
2094 return any_of(Elt->Succs,
2095 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
2099 bool NeedsCache =
false)
2100 : InstructionRule(
TII, SGID, NeedsCache) {}
2104 class VMEMSize final :
public InstructionRule {
2109 if (
MI->getOpcode() == TargetOpcode::BUNDLE)
2111 if (!Collection.
size())
2116 auto TRI =
TII->getRegisterInfo();
2117 auto &MRI =
MI->getMF()->getRegInfo();
2118 for (
auto &Elt : Collection) {
2119 auto Op = Elt->getInstr()->getOperand(0);
2121 TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(MRI,
Op));
2125 if (NumBits < 128) {
2127 if (NumBits +
TRI.getRegSizeInBits(*
TRI.getRegClassForOperandReg(
2128 MRI,
MI->getOperand(0))) <=
2136 VMEMSize(
const SIInstrInfo *
TII,
unsigned SGID,
bool NeedsCache =
false)
2137 : InstructionRule(
TII, SGID, NeedsCache) {}
2142 class SharesPredWithPrevNthGroup final :
public InstructionRule {
2144 unsigned Distance = 1;
2149 SchedGroup *OtherGroup =
nullptr;
2150 if (!SyncPipe.
size())
2153 if (!Cache->size()) {
2155 for (
auto &PipeSG : SyncPipe) {
2156 if ((
unsigned)PipeSG.getSGID() == SGID - Distance) {
2157 OtherGroup = &PipeSG;
2163 if (!OtherGroup->Collection.size())
2166 for (
auto &OtherEle : OtherGroup->Collection) {
2167 for (
auto &Pred : OtherEle->Preds) {
2168 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2169 AMDGPU::V_PERM_B32_e64)
2170 Cache->push_back(Pred.getSUnit());
2179 auto *DAG = SyncPipe[0].DAG;
2186 SharesPredWithPrevNthGroup(
unsigned Distance,
const SIInstrInfo *
TII,
2187 unsigned SGID,
bool NeedsCache =
false)
2188 : InstructionRule(
TII, SGID, NeedsCache), Distance(Distance) {}
2192 bool applyIGLPStrategy(
2203 : IGLPStrategy(DAG,
TII) {
2208static unsigned DSWCount = 0;
2209static unsigned DSWWithPermCount = 0;
2210static unsigned DSWWithSharedVMEMCount = 0;
2212bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2213 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
2216 unsigned MFMACount = 0;
2217 unsigned DSRCount = 0;
2219 bool IsInitial =
Phase == AMDGPU::SchedulingPhase::Initial;
2221 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2222 DSWWithSharedVMEMCount == 0)) &&
2223 "DSWCounters should be zero in pre-RA scheduling!");
2225 for (
auto &SU : DAG->
SUnits) {
2226 auto *
I = SU.getInstr();
2227 if (
TII->isMFMAorWMMA(*
I))
2229 else if (
TII->isDS(*
I)) {
2232 else if (
I->mayStore() && IsInitial) {
2234 for (
auto Pred : SU.Preds) {
2235 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2236 AMDGPU::V_PERM_B32_e64) {
2246 DSWWithPermCount = DSWithPerms.
size();
2247 auto *
I = DSWithPerms.
begin();
2248 auto *
E = DSWithPerms.
end();
2256 DenseMap<MachineInstr *, SUnit *> VMEMLookup;
2258 for (;
I !=
E;
I++) {
2259 SUnit *Cand =
nullptr;
2260 bool MissedAny =
false;
2261 for (
auto &Pred : (*I)->Preds) {
2262 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2268 for (
auto &Succ : Pred.getSUnit()->Succs) {
2269 auto *
MI = Succ.getSUnit()->getInstr();
2270 if (!
TII->isVMEM(*
MI) || !
MI->mayLoad())
2273 if (MissedAny || !VMEMLookup.
size()) {
2275 VMEMLookup[
MI] = *
I;
2292 if (!MissedAny && Cand) {
2293 DSWWithSharedVMEMCount += 2;
2300 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2302 unsigned PipelineSyncID = 0;
2304 if (DSWWithPermCount) {
2305 for (
unsigned I = 0;
I < MFMACount;
I++) {
2306 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2307 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2308 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2310 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2311 SchedGroupMask::VALU, 2, PipelineSyncID, DAG,
TII);
2312 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2322 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2323 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG,
TII);
2324 SG->addRule(std::make_shared<EnablesInitialMFMA>(
TII, SG->getSGID(),
true));
2325 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2327 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2328 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2329 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2332 for (
unsigned I = 4;
I < DSRCount; ++
I) {
2333 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2334 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG,
TII);
2335 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2337 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2338 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2339 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2345 for (
unsigned I = DSWWithSharedVMEMCount;
I < DSWWithPermCount; ++
I) {
2346 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2347 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2348 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2349 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2351 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2352 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2353 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2354 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2356 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2357 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2358 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2359 1,
TII, SG->getSGID(),
true));
2360 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2361 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2363 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2364 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2365 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2367 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2368 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2369 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2370 3,
TII, SG->getSGID(),
true));
2371 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2372 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2374 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2375 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2376 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2382 for (
unsigned I = DSWWithPermCount;
I < DSWCount;
I++) {
2383 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2384 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2385 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2387 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2388 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2389 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2390 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2392 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2393 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2394 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2402 for (
unsigned I = 0;
I < DSWWithSharedVMEMCount; ++
I) {
2403 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2404 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2405 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2406 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2408 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2409 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2410 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2411 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2413 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2414 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2415 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2417 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2418 SchedGroupMask::VALU, 4, PipelineSyncID, DAG,
TII);
2419 SG->addRule(std::make_shared<IsPermForDSW>(
TII, SG->getSGID(),
true));
2420 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2422 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2423 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG,
TII);
2424 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(
TII, SG->getSGID()));
2425 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2427 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2428 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2429 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2431 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2432 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2433 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2434 2,
TII, SG->getSGID(),
true));
2435 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2436 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2438 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2439 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2440 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2442 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2443 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG,
TII);
2444 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2445 4,
TII, SG->getSGID(),
true));
2446 SG->addRule(std::make_shared<VMEMSize>(
TII, SG->getSGID()));
2447 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2449 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2450 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG,
TII);
2451 SG->findCandidateSUnits(SyncedInstrs[SG->getSyncID()]);
2457static std::unique_ptr<IGLPStrategy>
2459 const SIInstrInfo *
TII) {
2462 return std::make_unique<MFMASmallGemmOpt>(DAG,
TII);
2464 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG,
TII);
2466 return std::make_unique<MFMAExpInterleaveOpt>(DAG,
TII);
2468 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG,
TII);
2474class IGroupLPDAGMutation :
public ScheduleDAGMutation {
2476 const SIInstrInfo *
TII;
2483 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
2486 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
2489 void addSchedBarrierEdges(SUnit &SU);
2500 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask)
const;
2503 void initSchedGroupBarrierPipelineStage(
2504 std::vector<SUnit>::reverse_iterator RIter);
2506 bool initIGLPOpt(SUnit &SU);
2509 void apply(ScheduleDAGInstrs *DAGInstrs)
override;
2516 bool IsBottomUp =
true;
2521 IGroupLPDAGMutation() =
default;
2525unsigned SchedGroup::NumSchedGroups = 0;
2527bool SchedGroup::tryAddEdge(SUnit *
A, SUnit *
B) {
2531bool SchedGroup::canAddMI(
const MachineInstr &
MI)
const {
2533 if (
MI.isMetaInstruction())
2536 else if (
MI.isInlineAsm()) {
2538 auto &MRI =
MI.getParent()->getParent()->getRegInfo();
2539 bool SGPR_used =
false, SGPR_big_def =
false, VGPR_used =
false,
2540 VMFMA_used =
false, VReg32_used =
false,
MayLoad =
MI.mayLoad(),
2542 for (
const MachineOperand &Operand :
MI.operands())
2543 if (Operand.isReg()) {
2545 *
TRI.getRegClassForOperandReg(MRI, Operand);
2546 if (
TRI.hasVGPRs(&RegClass)) {
2548 if (Operand.isUse() &&
TRI.getRegSizeInBits(RegClass) == 32)
2554 if (
TRI.hasAGPRs(&RegClass) ||
TRI.getRegSizeInBits(RegClass) > 128)
2556 if (
TRI.hasSGPRs(&RegClass))
2558 if (
TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
2559 SGPR_big_def =
true;
2562 typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
2563 SGMask_t InlineAsmMask = 0;
2564 if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
2565 InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
2566 if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
2567 InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
2569 InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
2570 if (VGPR_used && MayLoad)
2571 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
2572 : SchedGroupMask::VMEM_READ);
2573 if (VGPR_used && MayStore)
2574 InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
2575 : SchedGroupMask::VMEM_WRITE);
2577 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
2578 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
2579 InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
2580 InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
2581 if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
2582 InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
2583 InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
2584 if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
2585 InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
2586 InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
2588 Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
2591 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2592 (
TII->isVALU(
MI,
true) ||
TII->isMFMAorWMMA(
MI) ||
2596 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2597 TII->isVALU(
MI,
true) && !
TII->isMFMAorWMMA(
MI) &&
2605 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2609 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2610 TII->isMFMAorWMMA(
MI))
2613 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2617 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2618 MI.mayLoad() &&
TII->isVMEM(
MI))
2621 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2622 MI.mayStore() &&
TII->isVMEM(
MI))
2625 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2629 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2630 MI.mayLoad() &&
TII->isDS(
MI))
2633 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2634 MI.mayStore() &&
TII->isDS(
MI))
2637 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2641 else if (((SGMask & SchedGroupMask::LDSDMA) != SchedGroupMask::NONE) &&
2646 dbgs() <<
"For SchedGroup with mask " <<
format_hex((
int)SGMask, 10,
true)
2647 << (Result ?
" could classify " :
" unable to classify ") <<
MI);
2652int SchedGroup::link(SUnit &SU,
bool MakePred,
2653 std::list<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2654 int MissedEdges = 0;
2655 for (
auto *
A : Collection) {
2657 if (
A ==
B ||
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2667 bool Added = tryAddEdge(
A,
B);
2669 AddedEdges.emplace_back(
A,
B);
2677void SchedGroup::link(SUnit &SU,
bool MakePred) {
2678 for (
auto *
A : Collection) {
2680 if (
A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2689void SchedGroup::link(SUnit &SU,
2690 function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>
P) {
2691 for (
auto *
A : Collection) {
2700void SchedGroup::link(SchedGroup &OtherGroup) {
2701 for (
auto *
B : OtherGroup.Collection)
2705bool SchedGroup::canAddSU(SUnit &SU)
const {
2707 if (
MI.getOpcode() != TargetOpcode::BUNDLE)
2708 return canAddMI(
MI);
2711 const MachineBasicBlock *
MBB =
MI.getParent();
2713 while (
E !=
MBB->
end() &&
E->isBundledWithPred())
2717 return std::all_of(
B,
E, [
this](MachineInstr &
MI) {
return canAddMI(
MI); });
2721void SchedGroup::findCandidateSUnits(
T Begin,
T End,
2722 SUnitsToCandidateSGsMap &SyncedInstrs) {
2725 SyncedInstrs[&SU].push_back(SGID);
2729void SchedGroup::findCandidateSUnits(SUnitsToCandidateSGsMap &SyncedInstrs) {
2730 findCandidateSUnits(DAG->
SUnits.rbegin(), DAG->
SUnits.rend(), SyncedInstrs);
2733void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2734 const TargetSchedModel *TSchedModel = DAGInstrs->
getSchedModel();
2735 if (!TSchedModel || DAGInstrs->
SUnits.empty())
2740 TII =
ST.getInstrInfo();
2741 DAG =
static_cast<ScheduleDAGMI *
>(DAGInstrs);
2742 SyncedSchedGroups.clear();
2743 SyncedInstrs.clear();
2744 bool FoundSB =
false;
2745 bool FoundIGLP =
false;
2746 bool ShouldApplyIGLP =
false;
2747 for (
auto R = DAG->
SUnits.rbegin(),
E = DAG->
SUnits.rend(); R !=
E; ++R) {
2748 unsigned Opc =
R->getInstr()->getOpcode();
2750 if (
Opc == AMDGPU::SCHED_BARRIER) {
2751 addSchedBarrierEdges(*R);
2753 }
else if (
Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2754 initSchedGroupBarrierPipelineStage(R);
2756 }
else if (
Opc == AMDGPU::IGLP_OPT) {
2757 if (!FoundSB && !FoundIGLP) {
2759 ShouldApplyIGLP = initIGLPOpt(*R);
2764 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2765 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2773void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2775 assert(
MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2776 LLVM_DEBUG(
dbgs() <<
"Building SchedGroup for SchedBarrier with Mask: "
2777 <<
MI.getOperand(0).getImm() <<
"\n");
2779 invertSchedBarrierMask((SchedGroupMask)
MI.getOperand(0).getImm());
2780 SchedGroup SG(InvertedMask, std::nullopt, DAG,
TII);
2782 for (SUnit &SU : DAG->
SUnits)
2783 if (SG.canAddSU(SU))
2789 (function_ref<
bool(
const SUnit *
A,
const SUnit *
B)>)[](
2790 const SUnit *
A,
const SUnit *
B) {
return A->NodeNum >
B->NodeNum; });
2794IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask)
const {
2797 SchedGroupMask InvertedMask = ~Mask;
2800 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2801 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask
::SALU &
2804 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2805 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2806 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2807 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2808 InvertedMask &= ~SchedGroupMask::ALU;
2811 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2812 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE &
2813 ~SchedGroupMask::LDSDMA;
2815 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2816 (InvertedMask & SchedGroupMask::VMEM_WRITE) ==
2817 SchedGroupMask::NONE ||
2818 (InvertedMask & SchedGroupMask::LDSDMA) == SchedGroupMask::NONE)
2819 InvertedMask &= ~SchedGroupMask
::VMEM;
2822 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2823 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE &
2824 ~SchedGroupMask::LDSDMA;
2826 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2827 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2828 InvertedMask &= ~SchedGroupMask
::DS;
2830 LLVM_DEBUG(
dbgs() <<
"After Inverting, SchedGroup Mask: " << (
int)InvertedMask
2833 return InvertedMask;
2836void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2837 std::vector<SUnit>::reverse_iterator RIter) {
2838 MachineInstr &SGB = *RIter->getInstr();
2845 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2848 SG.findCandidateSUnits(RIter, SG.DAG->
SUnits.rend(),
2849 SyncedInstrs[SG.getSyncID()]);
2852bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2855 auto S = createIGLPStrategy(StrategyID, DAG,
TII);
2856 if (!S->shouldApplyStrategy(DAG,
Phase))
2859 IsBottomUp = S->IsBottomUp;
2860 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups,
Phase);
2870std::unique_ptr<ScheduleDAGMutation>
2872 return std::make_unique<IGroupLPDAGMutation>(
Phase);
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
AMDGPU Rewrite AGPR Copy MFMA
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file defines the DenseMap class.
const HexagonInstrInfo * TII
static std::pair< Value *, APInt > getMask(Value *WideMask, unsigned Factor, ElementCount LeafValueEC)
Register const TargetRegisterInfo * TRI
Interface definition for SIInstrInfo.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Implements a dense probed hash-table based set.
const HexagonRegisterInfo & getRegisterInfo() const
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
@ Data
Regular data dependence (aka true-dependence).
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Scheduling unit. This is a node in the scheduling DAG.
unsigned NodeNum
Entry # of node in the node vector.
LLVM_ABI void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
std::vector< SUnit > SUnits
The scheduling units.
MachineFunction & MF
Machine function.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
An efficient, type-erasing, non-owning reference to a callable.
A range adaptor for a pair of iterators.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
IGLPStrategyID
Operand 0 immediate for IGLP_OPT pseudo instructions.
@ MFMASmallGemmSingleWaveOptID
@ MFMAExpSimpleInterleaveID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
LLVM_ABI void link(std::unique_ptr< LinkGraph > G, std::unique_ptr< JITLinkContext > Ctx)
Link the given graph.
This is an optimization pass for GlobalISel generic memory operations.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
@ LLVM_MARK_AS_BITMASK_ENUM
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
auto reverse(ContainerTy &&C)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
DWARFExpression::Operation Op
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
MCRegisterClass TargetRegisterClass
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Function object to check whether the second component of a container supported by std::get (like std:...