#define DEBUG_TYPE "igrouplp"

namespace {

static cl::opt<bool> EnableExactSolver(
    "amdgpu-igrouplp-exact-solver", cl::Hidden,
    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "
             "possible."),
    cl::init(false));

static cl::opt<unsigned> CutoffForExact(
    "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));

static cl::opt<uint64_t> MaxBranchesExplored(
    "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

static cl::opt<bool> UseCostHeur(
    "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
enum class SchedGroupMask {
  NONE = 0u, ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3,
  VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6, DS = 1u << 7,
  DS_READ = 1u << 8, DS_WRITE = 1u << 9, TRANS = 1u << 10,
  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
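// The mask values combine as ordinary bit flags, e.g. (illustrative only):
//   SchedGroupMask M = SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE;
//   bool AcceptsReads = (M & SchedGroupMask::DS_READ) != SchedGroupMask::NONE;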
// A rule that a SchedGroup can apply to filter candidate instructions.
class InstructionRule {
protected:
  const SIInstrInfo *TII;
  unsigned SGID;
  // A cache made available to the rule to store common information.
  std::optional<SmallVector<SUnit *, 4>> Cache;

public:
  InstructionRule(const SIInstrInfo *TII, unsigned SGID,
                  bool NeedsCache = false)
      : TII(TII), SGID(SGID) {
    if (NeedsCache)
      Cache = SmallVector<SUnit *, 4>();
  }

  virtual ~InstructionRule() = default;
};
// Classify instructions into groups to enable fine-tuned control over the
// scheduler.
class SchedGroup {
private:
  // Mask that defines which instruction types can be classified into this
  // SchedGroup.
  SchedGroupMask SGMask;

  // Maximum number of SUnits that can be added to this group.
  std::optional<unsigned> MaxSize;

  // SchedGroups will only synchronize with other SchedGroups that have the
  // same SyncID.
  int SyncID = 0;

  // SGID is used to map instructions to candidate SchedGroups.
  unsigned SGID;

  // The different rules each instruction in this SchedGroup must conform to.
  SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;

  // Count of the number of created SchedGroups, used to initialize SGID.
  static unsigned NumSchedGroups;

  // Try to add an artificial edge from SU A to SU B.
  bool tryAddEdge(SUnit *A, SUnit *B);

  // Use SGMask to determine whether we can classify MI as a member of this
  // SchedGroup object.
  bool canAddMI(const MachineInstr &MI) const;

public:
  // Collection of SUnits that are classified as members of this group.
  SmallVector<SUnit *, 32> Collection;

  ScheduleDAGInstrs *DAG;
  const SIInstrInfo *TII;
  // Returns true if SU can be added to this SchedGroup.
  bool canAddSU(SUnit &SU) const;

  // Add DAG dependencies between this SU and all SUnits in this SchedGroup.
  // If MakePred is true, SU will be a predecessor of the SUnits in this
  // SchedGroup, otherwise SU will be a successor.
  void link(SUnit &SU, bool MakePred = false);

  // Add DAG dependencies and track which edges are added, and the count of
  // missed edges.
  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Add DAG dependencies; use the predicate to determine whether SU should
  // be a predecessor or a successor of each element in the group.
  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);

  // Add DAG dependencies such that SUnits in this group shall be ordered
  // before SUnits in OtherGroup.
  void link(SchedGroup &OtherGroup);

  // Returns true if no more instructions may be added to this group.
  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
  // Append a rule to this SchedGroup.
  void addRule(std::shared_ptr<InstructionRule> NewRule) {
    Rules.push_back(NewRule);
  }

  // Returns true if the SU matches all rules.
  bool allowedByRules(const SUnit *SU,
                      SmallVectorImpl<SchedGroup> &SyncPipe) const {
    for (auto &Rule : Rules) {
      if (!Rule.get()->apply(SU, Collection, SyncPipe))
        return false;
    }
    return true;
  }

  // Add SU to the SchedGroup.
  void add(SUnit &SU) {
    LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
                      << format_hex((int)SGMask, 10, true) << " adding "
                      << *SU.getInstr());
    Collection.push_back(&SU);
  }

  // Remove the last element in the SchedGroup.
  void pop() { Collection.pop_back(); }
  // Identify and add all relevant SUs from the DAG to this SchedGroup.
  void initSchedGroup();

  // Add instructions to the SchedGroup bottom up starting from RIter.
  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }
  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
             int SyncID, ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;
  }
};
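// Typical use (a sketch mirroring the strategy code later in this file): a
// strategy appends a group to a sync pipeline, then seeds candidates from
// the DAG:
//   SchedGroup &SG = SyncedSchedGroups[0].emplace_back(
//       SchedGroupMask::MFMA, /*MaxSize=*/1, /*SyncID=*/0, DAG, TII);
//   SG.initSchedGroup(SyncedInstrs[SG.getSyncID()]);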
// Remove all existing edges from a SCHED_BARRIER, SCHED_GROUP_BARRIER, or
// IGLP_OPT.
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
  while (!SU.Preds.empty())
    for (auto &P : SU.Preds)
      SU.removePred(P);

  while (!SU.Succs.empty())
    for (auto &S : SU.Succs)
      for (auto &SP : S.getSUnit()->Preds)
        if (SP.getSUnit() == &SU)
          S.getSUnit()->removePred(SP);
}
using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
// Solves the assignment of conflicted instructions (instructions that match
// multiple SchedGroups) to SchedGroups, minimizing the number of edges that
// cannot be added.
class PipelineSolver {
  ScheduleDAGMI *DAG;

  // Instructions that can be assigned to multiple SchedGroups.
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
  // The current working pipeline.
  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
  // The pipeline with the best solution found so far.
  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;

  // Whether or not we actually have any SyncedInstrs to try to solve.
  bool NeedsSolver = false;

  // Compute an estimate of the size of the search tree -- the true size is
  // the product of each conflicted instruction's candidate count across all
  // sync pipelines.
  unsigned computeProblemSize();

  // The cost penalty of not assigning an SU to a SchedGroup.
  int MissPenalty = 0;

  // Costs in terms of the number of edges we are unable to add.
  int BestCost = -1;
  int CurrCost = 0;

  // How many branches the exact solver has explored.
  uint64_t BranchesExplored = 0;

  // Index of the conflicting instruction that is currently being fitted.
  int CurrConflInstNo = 0;
  // Index of the pipeline that is currently being fitted.
  int CurrSyncGroupIdx = 0;
  // The first non-trivial pipeline.
  int BeginSyncGroupIdx = 0;

  // The direction in which candidate SchedGroups are processed per SU.
  bool IsBottomUp = true;

  // Update indices to fit the next conflicting instruction.
  void advancePosition();
  // Recede indices to attempt to find a better fit for the previous
  // conflicting instruction.
  void retreatPosition();
  // The exponential time algorithm which finds the provably best fit.
  bool solveExact();
  // The polynomial time algorithm which attempts to find a good fit.
  bool solveGreedy();
  // Whether or not the current solution is optimal.
  bool checkOptimal();
  // Restore the solver to its initial state, keeping only barriers.
  void reset();
  // Add edges corresponding to the SchedGroups as assigned by the solver.
  void makePipeline();

  // Compute the cost of assigning the instruction to a SchedGroup, adding
  // edges greedily.
  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
                  T E);

  // Populate the ready list with candidate SchedGroups, ordered by cost.
  template <typename T>
  void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
                         T E);

  // Link the SchedGroups in the best found pipeline. Templated against the
  // SchedGroup iterator (either reverse or forward).
  template <typename T> void linkSchedGroups(T I, T E);

  // Add the edges from the SU to the other SchedGroups in the pipeline, and
  // return the number of edges missed.
  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Link the pipeline as if \p SU was in the SchedGroup with SGID \p SGID.
  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);

  // Remove the edges passed via \p AddedEdges.
  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  // Convert the passed-in maps to arrays for bidirectional iterators.
  void convertSyncMapsToArrays();
public:
  // Attempt to find the best pipeline, then apply it by adding the
  // corresponding edges to the DAG.
  void solve();

  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
                 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
                 ScheduleDAGMI *DAG, bool IsBottomUp = true)
      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {
        NeedsSolver = true;
        break;
      }
    }

    if (!NeedsSolver)
      return;

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)
      ++BeginSyncGroupIdx;

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
      return;
  }
};
void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {
      SmallVector<SUnit *, 32> TempCollection = SG.Collection;
      SG.Collection.clear();
      auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
        return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
      });
      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);
    }
  }

  CurrSyncGroupIdx = BeginSyncGroupIdx;
  CurrConflInstNo = 0;
  CurrCost = 0;
}
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
  }

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
        continue;
      }
      auto SortPosition = PipelineInstrs[PipelineIDx].begin();
      // Insert them in sorted order -- this allows for good parsing order in
      // the greedy algorithm.
      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
        ++SortPosition;
      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
    }
    --PipelineIDx;
  }
}
template <typename T>
void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {
    auto &GroupA = *I;
    for (auto J = std::next(I); J != E; ++J) {
      auto &GroupB = *J;
      GroupA.link(GroupB);
    }
  }
}
void PipelineSolver::makePipeline() {
  // Preserve the order of barriers for subsequent SchedGroupBarrier mutations.
  for (auto &SyncPipeline : BestPipeline) {
    LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
    for (auto &SG : SyncPipeline) {
      LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
                        << " has:\n");
      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
          SGBarr = SU;
        LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
      }
      // Command-line-requested IGroupLP doesn't have a SGBarr.
      if (!SGBarr)
        continue;
      resetEdges(*SGBarr, DAG);
      SG.link(*SGBarr, false);
    }
  }

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
  }
}
template <typename T>
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
    T I, T E) {
  bool MakePred = false;
  int AddedCost = 0;
  for (; I < E; ++I) {
    if (I->getSGID() == SGID) {
      MakePred = true;
      continue;
    }
    auto Group = *I;
    AddedCost += Group.link(*SU, MakePred, AddedEdges);
    assert(AddedCost >= 0);
  }
  return AddedCost;
}
int PipelineSolver::addEdges(
    SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {

  // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
  // instructions that are the ultimate successors in the resultant mutation,
  // so process the groups in reverse; top-down processes them forward.
  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                                SyncPipeline.rend())
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
                                SyncPipeline.end());
}
void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
  // Only remove the edges that we have added when testing the SchedGroups.
  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

    auto Match = llvm::find_if(
        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
      Succ->removePred(*Match);
    }
  }
}
void PipelineSolver::advancePosition() {
  ++CurrConflInstNo;

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {
    CurrConflInstNo = 0;
    ++CurrSyncGroupIdx;
    // Advance to the next non-trivial pipeline.
    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      ++CurrSyncGroupIdx;
  }
}
void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {
    --CurrConflInstNo;
    return;
  }

  if (CurrConflInstNo == 0) {
    // If we return to the starting position, we have explored the entire
    // tree.
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)
      return;

    --CurrSyncGroupIdx;
    // Go to the previous non-trivial pipeline.
    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
      --CurrSyncGroupIdx;

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
  }
}
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;
      BestCost = CurrCost;
      LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
    }
    assert(BestCost >= 0);
  }

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
}
template <typename T>
void PipelineSolver::populateReadyList(
    SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    auto Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match != SyncPipeline.end());

    if (UseCostHeur) {
      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));
        continue;
      }

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);
    } else
      ReadyList.push_back(std::pair(*I, -1));
  }

  if (UseCostHeur) {
    std::sort(ReadyList.begin(), ReadyList.end(),
              [](std::pair<int, int> A, std::pair<int, int> B) {
                return A.second < B.second;
              });
  }

  assert(ReadyList.size() == CurrSU.second.size());
}
bool PipelineSolver::solveExact() {
  if (checkOptimal())
    return true;

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
    return false;

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // SchedGroup -> Cost pairs.
  SmallVector<std::pair<int, int>, 4> ReadyList;
  // Prioritize the candidate SchedGroups in terms of lowest cost first.
  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());

  auto I = ReadyList.begin();
  auto E = ReadyList.end();
  for (; I != E; ++I) {
    // If we are trying SGs in least cost order, and the current SG is cost
    // infeasible, then all subsequent SGs will also be cost infeasible, so we
    // can prune.
    if (BestCost != -1 && (CurrCost + I->second > BestCost))
      return false;

    int CandSGID = I->first;
    int AddedCost = 0;
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
    SchedGroup *Match = nullptr;
    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)
        Match = &SG;
    }

    if (Match->isFull())
      continue;

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
      continue;

    LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
                      << (int)Match->getMask() << " and ID " << CandSGID
                      << "\n");
    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;
    advancePosition();
    ++BranchesExplored;
    bool FinishedExploring = false;
    // If the cost after adding edges is greater than a known solution,
    // backtrack.
    if (CurrCost < BestCost || BestCost == -1) {
      if (solveExact()) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)
          return true;
      }
    }

    retreatPosition();
    CurrCost -= AddedCost;
    removeEdges(AddedEdges);
    Match->pop();
    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)
      return true;
  }

  // Try the pipeline where the current instruction is omitted. Potentially,
  // if we omit a problematic instruction from the pipeline, all the other
  // instructions can nicely fit.
  CurrCost += MissPenalty;
  advancePosition();

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
    if (solveExact()) {
      FinishedExploring = BestCost != 0;
      if (!FinishedExploring)
        return true;
    }
  }

  retreatPosition();
  CurrCost -= MissPenalty;
  return FinishedExploring;
}
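// solveExact() is a branch-and-bound search: each conflicted SU tries its
// candidate groups in ready-list order, the running cost prunes subtrees that
// cannot beat BestCost, and the "not assigned" branch pays MissPenalty.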
template <typename T>
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;
  int TempCost;
  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  // Since we have added the potential SchedGroups from bottom up, but
  // traversed the DAG from top down, parse over the groups from last to
  // first. If we fail to do this for the greedy algorithm, the solution will
  // likely not be good in more complex cases.
  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    int CandSGID = *I;
    auto Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;
    });
    assert(Match != SyncPipeline.end());

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
      continue;
    }
    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
      continue;
    }
    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestGroup = &*Match;
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;
    }
    removeEdges(AddedEdges);
    if (BestNodeCost == 0)
      break;
  }

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;
  } else
    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
}
bool PipelineSolver::solveGreedy() {
  BestCost = 0;
  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
    IsBottomUp
        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
    advancePosition();
  }
  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
  return false;
}
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();
  }

  return ProblemSize;
}
void PipelineSolver::solve() {
  if (!NeedsSolver)
    return;

  unsigned ProblemSize = computeProblemSize();
  assert(ProblemSize > 0);

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  LLVM_DEBUG(DAG->dump());
  if (EnableExactSolver || BelowCutoff) {
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
    reset();
    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
    if (BestCost > 0) {
      LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
      solveExact();
      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
    }
  } else { // Use the greedy algorithm by default.
    LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
    solveGreedy();
  }

  makePipeline();
  LLVM_DEBUG(dbgs() << "After applying mutation\n");
  LLVM_DEBUG(DAG->dump());
}
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleave = 2
};

// Implement a IGLP scheduling strategy.
class IGLPStrategy {
protected:
  ScheduleDAGInstrs *DAG;

  const SIInstrInfo *TII;

public:
  /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
  virtual bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) = 0;

  // Returns true if this strategy should be applied to a ScheduleDAG.
  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                   AMDGPU::SchedulingPhase Phase) = 0;

  bool IsBottomUp = true;

  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : DAG(DAG), TII(TII) {}

  virtual ~IGLPStrategy() = default;
};
class MFMASmallGemmOpt final : public IGLPStrategy {
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = true;
  }
};

bool MFMASmallGemmOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  // Count the number of MFMA instructions.
  unsigned MFMACount = 0;
  for (const MachineInstr &I : *DAG)
    if (TII->isMFMAorWMMA(I))
      ++MFMACount;

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
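// The resulting pipeline is simply DS(2), MFMA(1) repeated MFMACount * 3
// times, interleaving two LDS accesses with every MFMA.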
class MFMAExpInterleaveOpt final : public IGLPStrategy {
private:
  // The count of TRANS SUs involved in the interleaved pipeline.
  static unsigned TransPipeCount;
  // The count of MFMA SUs involved in the interleaved pipeline.
  static unsigned MFMAPipeCount;
  // The count of Add SUs involved in the interleaved pipeline.
  static unsigned AddPipeCount;
  // The number of transitive MFMA successors of each TRANS SU.
  static unsigned MFMAEnablement;
  // The number of transitive TRANS predecessors of each MFMA SU.
  static unsigned ExpRequirement;
  // The count of independent "chains" of MFMA instructions in the pipeline.
  static unsigned MFMAChains;
  // The length of each independent "chain" of MFMA instructions.
  static unsigned MFMAChainLength;
  // Whether the pipeline has V_CVT instructions.
  static bool HasCvt;
  // Whether the TRANS results are fed to the bit-conversion ops through an
  // intermediate chain of instructions.
  static bool HasChainBetweenCvt;
  // The first occurring DS_READ which feeds an MFMA chain.
  static std::optional<unsigned> FirstPipeDSR;
  // The seed instructions (MFMAs with no MFMA predecessor) of each chain.
  static SmallVector<SUnit *, 4> MFMAChainSeeds;
  // Whether the instruction is a transitive predecessor of some MFMA/WMMA,
  // i.e. part of the EXP pipe.
  class IsPipeExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);
        }
        if (Cache->empty())
          return false;
      }

      auto Reaches = (std::any_of(
          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
          }));

      return Reaches;
    }
    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the instruction enables the Nth MFMA in the DAG, counting MFMAs
  // that occur after the first TRANS instruction.
  class EnablesNthMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      bool FoundTrans = false;
      unsigned Counter = 1;
      auto DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.begin();
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
            if (Counter == Number) {
              Cache->push_back(&*I);
              break;
            }
            ++Counter;
          }
          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
            FoundTrans = true;
        }
        if (Cache->empty())
          return false;
      }

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };
  // Whether the instruction enables the MFMA that is \p Number steps into the
  // chain starting at \p ChainSeed.
  class EnablesNthMFMAInChain final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      if (Cache->empty()) {
        auto TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found)
            return false;
        }

        Cache->push_back(TempSU);
      }
      // If we failed to find the instruction to be placed into the cache,
      // we would have already exited.
      assert(!Cache->empty());

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
    }

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          const SIInstrInfo *TII, unsigned SGID,
                          bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };
  // Whether the SU has fewer than \p Size immediate data successors. If
  // \p HasIntermediary is true, the rule also looks through the immediate
  // successors and checks their successors.
  class LessThanNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      auto SuccSize = std::count_if(
          SU->Succs.begin(), SU->Succs.end(),
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)
        return false;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return false;
        }
      }

      return true;
    }
    LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };
  // Whether the SU has at least \p Size immediate data successors. If
  // \p HasIntermediary is true, the rule also looks through the immediate
  // successors and checks their successors.
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {
  private:
    unsigned Size = 1;
    bool HasIntermediary = false;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;

      auto SuccSize = std::count_if(
          SU->Succs.begin(), SU->Succs.end(),
          [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
      if (SuccSize >= Size)
        return true;

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {
          auto SuccSize = std::count_if(
              Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
              [](const SDep &SuccSucc) {
                return SuccSucc.getKind() == SDep::Data;
              });
          if (SuccSize >= Size)
            return true;
        }
      }

      return false;
    }
    GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
                               unsigned SGID, bool HasIntermediary = false,
                               bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
  };
  // Whether the instruction is a relevant V_CVT instruction.
  class IsCvt final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto Opc = SU->getInstr()->getOpcode();
      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;
    }
    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the instruction is an FMA of the pipe.
  class IsFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
             SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
    }
    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the instruction is the V_ADD of the pipe.
  class IsPipeAdd final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
    }
    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the SU is an immediate (data dep) successor of an element in the
  // SchedGroup \p Distance steps before.
  class IsSuccOfPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
            return true;
        }
      }

      return false;
    }
    IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };
  // Whether the SU is a transitive successor of any element in the SchedGroup
  // \p Distance steps before.
  class IsReachableFromPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      auto DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
          return true;

      return false;
    }
    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };
  // Whether the SU occurs at or after the SUnit with node number \p Number.
  class OccursAtOrAfterNode final : public InstructionRule {
  private:
    unsigned Number = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      return SU->NodeNum >= Number;
    }
    OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
                        bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
  };
  // Whether the SU is exactly the MFMA that is \p Number steps into the chain
  // starting at \p ChainSeed.
  class IsExactMFMA final : public InstructionRule {
  private:
    unsigned Number = 1;
    SUnit *ChainSeed;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
        return false;

      if (Cache->empty()) {
        auto TempSU = ChainSeed;
        auto Depth = Number;
        while (Depth > 0) {
          --Depth;
          bool Found = false;
          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();
              Found = true;
              break;
            }
          }
          if (!Found)
            return false;
        }
        Cache->push_back(TempSU);
      }
      // If we failed to find the instruction to be placed into the cache,
      // we would have already exited.
      assert(!Cache->empty());

      return (*Cache)[0] == SU;
    }

    IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Number(Number),
          ChainSeed(ChainSeed) {}
  };
  // Whether the SU occurs after the first TRANS instruction, which implies
  // the instruction cannot be a predecessor of the EXP pipe.
  class OccursAfterExp final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {

      auto DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
          if (TII->isTRANS(SU.getInstr()->getOpcode())) {
            Cache->push_back(&SU);
            break;
          }
        if (Cache->empty())
          return false;
      }

      return SU->NodeNum > (*Cache)[0]->NodeNum;
    }

    OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override;

  bool analyzeDAG(const SIInstrInfo *TII);

  MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
SmallVector<SUnit *, 4> MFMAExpInterleaveOpt::MFMAChainSeeds;
bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
  SmallVector<SUnit *, 10> ExpPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeCands;
  SmallVector<SUnit *, 10> MFMAPipeSUs;
  SmallVector<SUnit *, 10> PackSUs;
  SmallVector<SUnit *, 10> CvtSUs;

  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
  };

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
  };

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  AddPipeCount = 0;
  for (SUnit &SU : DAG->SUnits) {
    auto Opc = SU.getInstr()->getOpcode();
    if (TII->isTRANS(Opc)) {
      // Avoid counting a potential bonus V_EXP which all the MFMA depend on.
      if (SU.Succs.size() >= 7)
        continue;
      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)
          continue;
      }
      ExpPipeCands.push_back(&SU);
    }

    if (TII->isMFMAorWMMA(*SU.getInstr()))
      MFMAPipeCands.push_back(&SU);

    if (isBitPack(Opc))
      PackSUs.push_back(&SU);

    if (isCvt(Opc))
      CvtSUs.push_back(&SU);

    if (isAdd(Opc))
      ++AddPipeCount;
  }

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
    return false;

  TransPipeCount = 0;

  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;
  // Count the number of EXPs that reach an MFMA.
  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        if (!TempExp) {
          TempExp = PredSU;
          TempMFMA = SuccSU;
        }
        MFMAPipeSUs.push_back(SuccSU);
        ++TransPipeCount;
        break;
      }
    }
  }

  if (!(TempExp && TempMFMA))
    return false;

  HasChainBetweenCvt =
      std::find_if((*TempExp)->Succs.begin(), (*TempExp)->Succs.end(),
                   [&isCvt](SDep &Succ) {
                     return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
                   }) == (*TempExp)->Succs.end();

  // Count the number of MFMAs that are reached by an EXP.
  for (auto &SuccSU : MFMAPipeCands) {
    if (MFMAPipeSUs.size() &&
        std::find_if(MFMAPipeSUs.begin(), MFMAPipeSUs.end(),
                     [&SuccSU](SUnit *PotentialMatch) {
                       return PotentialMatch->NodeNum == SuccSU->NodeNum;
                     }) != MFMAPipeSUs.end())
      continue;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {
        MFMAPipeSUs.push_back(SuccSU);
        break;
      }
    }
  }

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);

  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {
      TempCvt = SuccSU;
      break;
    }
  }

  HasCvt = false;
  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {
        HasCvt = true;
        break;
      }
    }
  }

  MFMAChains = 0;
  for (auto &MFMAPipeSU : MFMAPipeSUs) {
    if (llvm::is_contained(MFMAChainSeeds, MFMAPipeSU))
      continue;
    if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
                     [&TII](SDep &Succ) {
                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
                     })) {
      MFMAChainSeeds.push_back(MFMAPipeSU);
      ++MFMAChains;
    }
  }

  if (!MFMAChains)
    return false;

  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;
  }

  MFMAChainLength = MFMAPipeCount / MFMAChains;

  // The number of bit-pack operations that depend on a single V_EXP.
  unsigned PackSuccCount = std::count_if(
      PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
        return DAG->IsReachable(VPack, *TempExp);
      });

  // The number of bit-pack operations an MFMA depends on.
  unsigned PackPredCount =
      std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                    [&isBitPack](SDep &Pred) {
                      auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                      return isBitPack(Opc);
                    });

  auto PackPred =
      std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
                   [&isBitPack](SDep &Pred) {
                     auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
                     return isBitPack(Opc);
                   });

  if (PackPred == (*TempMFMA)->Preds.end())
    return false;

  // How many MFMAs depend on a single bit-pack operation.
  MFMAEnablement =
      std::count_if(PackPred->getSUnit()->Succs.begin(),
                    PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
                      return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
                    });

  // The number of MFMAs that depend on a single V_EXP.
  MFMAEnablement *= PackSuccCount;

  // The number of V_EXPs required to resolve all the packs of an MFMA.
  ExpRequirement =
      std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
                    [this, &PackPred](SUnit *ExpBase) {
                      return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
                    });

  ExpRequirement *= PackPredCount;
  return true;
}
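// Worked example of the derived quantities (hypothetical DAG): if each V_EXP
// feeds 2 bit-pack ops (PackSuccCount = 2) and each pack feeds 2 MFMAs, then
// MFMAEnablement = 2 * 2 = 4; if each MFMA consumes 1 pack
// (PackPredCount = 1) and that pack is reachable from 4 V_EXPs, then
// ExpRequirement = 4 * 1 = 4, matching the "large kernel" signature below.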
bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                                               AMDGPU::SchedulingPhase Phase) {
  const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Phase != AMDGPU::SchedulingPhase::PostRA)
    MFMAChainSeeds.clear();
  if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
    return false;

  return true;
}
bool MFMAExpInterleaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  bool IsSmallKernelType =
      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
  bool IsLargeKernelType =
      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;

  if (!(IsSmallKernelType || IsLargeKernelType))
    return false;

  unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;

  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;
  };

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);
  };

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;
  };

  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    ++CurrMFMAPosition;
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
  };

  bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);

  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;

  // PHASE 1: "Prefetch"
  if (UsesFMA) {
    // First Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    } else
      SG->addRule(
          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    // Second Round FMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
    } else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  if (UsesDSRead) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                      SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // First Round EXP
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
  else
    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  incrementTransPosition();

  // First Round CVT, Third Round FMA, Second Round EXP; interleaved
  for (unsigned I = 0; I < ExpRequirement; I++) {
    // First Round CVT
    if (UsesCvt) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      else
        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // Third Round FMA
    if (UsesFMA) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains) {
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
            true));
      } else
        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                     TII, SG->getSGID(),
                                                     true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // Second Round EXP
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
          true));
    else
      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // The "bonus" EXP which enables all MFMA
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  // PHASE 2: Main Interleave Loop

  // The number of MFMAs per iteration.
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
  // The number of EXPs per iteration.
  unsigned ExpRatio =
      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
  // The remaining EXPs.
  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)
                              : 0;
  unsigned ExpLoopCount = RemainingExp / ExpRatio;
  // In-loop MFMAs.
  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)
                            : 0;
  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
  unsigned VALUOps =
      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);

  for (unsigned I = 0; I < LoopSize; I++) {
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    // Round N MFMA
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));
    else
      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

    if (UsesVALU) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
                                                        SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }

    // CVT, EXP, FMA interleaving
    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

      // Round N + 1 CVT
      if (UsesCvt) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;
        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
                             ExpOffset;
        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));
        else
          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
                                                             SG->getSGID()));
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 3 FMA
      if (UsesFMA) {
        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        if (!IsPostRA && MFMAChains)
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
              getNextTransPositionInChain(),
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
              true));
        else
          SG->addRule(std::make_shared<EnablesNthMFMA>(
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
              TII, SG->getSGID(), true));
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
      }

      // Round N + 2 EXP
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
            true));
      else
        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }
  }

  // PHASE 3: Remaining MFMAs
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  return true;
}
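// The generated pipeline thus has three phases: a "prefetch" preamble that
// seeds the first FMA/EXP rounds, a steady-state loop interleaving
// MFMA / VALU / DS_READ / CVT / FMA / EXP groups, and a drain of the
// remaining MFMAs that occur after the final EXP.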
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
private:
  // Whether the DS_READ is a predecessor of the first four MFMAs in the
  // region.
  class EnablesInitialMFMA final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      if (!SyncPipe.size())
        return false;
      int MFMAsFound = 0;
      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
            ++MFMAsFound;
            if (MFMAsFound > 4)
              break;
            Cache->push_back(&Elt);
          }
        }
      }

      assert(Cache->size());
      auto DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {
        if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
          return true;
      }
      return false;
    }

    EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE.
  class IsPermForDSW final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
        return false;

      bool FitsInGroup = false;
      // Does the VALU have a DS_WRITE successor?
      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();
          if (TII->isDS(*SuccUnit->getInstr()) &&
              SuccUnit->getInstr()->mayStore()) {
            Cache->push_back(SuccUnit);
            FitsInGroup = true;
          }
        }
        return FitsInGroup;
      }

      // Does the VALU have a DS_WRITE successor that is the same as other
      // VALU already in the group? The V_PERMs all share one DS_WRITE succ.
      return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
        return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
          return ThisSucc.getSUnit() == Elt;
        });
      });
    }

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the SU is a successor of any element in the previous SchedGroup.
  class IsSuccOfPrevGroup final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;
        }
      }

      if (!OtherGroup)
        return false;
      if (!OtherGroup->Collection.size())
        return true;

      // Does the previous VALU have this DS_WRITE as a successor?
      return (std::any_of(OtherGroup->Collection.begin(),
                          OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
                            return std::any_of(
                                Elt->Succs.begin(), Elt->Succs.end(),
                                [&SU](SDep &Succ) {
                                  return Succ.getSUnit() == SU;
                                });
                          }));
    }
    IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  // Whether the combined load width of the group is 128 bits.
  class VMEMSize final : public InstructionRule {
  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      auto MI = SU->getInstr();
      if (MI->getOpcode() == TargetOpcode::BUNDLE)
        return false;
      if (!Collection.size())
        return true;

      int NumBits = 0;

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
        auto Size =
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
        NumBits += Size;
      }

      if (NumBits < 128) {
        assert(TII->isVMEM(*MI) && MI->mayLoad());
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=
            128)
          return true;
      }

      return false;
    }

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
  };
  /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
  /// that is \p Distance steps away.
  class SharesPredWithPrevNthGroup final : public InstructionRule {
  private:
    unsigned Distance = 1;

  public:
    bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
               SmallVectorImpl<SchedGroup> &SyncPipe) override {
      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())
        return false;

      if (!Cache->size()) {
        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;
          }
        }

        if (!OtherGroup)
          return false;
        if (!OtherGroup->Collection.size())
          return true;

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());
          }
        }

        // If the other group has no PERM preds, then this group won't share
        // any.
        if (!Cache->size())
          return false;
      }

      auto DAG = SyncPipe[0].DAG;
      // Does the previous DS_WRITE share a V_PERM predecessor with this
      // VMEM_READ?
      return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
        return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
      });
    }
    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
  };
public:
  bool applyIGLPStrategy(
      DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
      DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
      AMDGPU::SchedulingPhase Phase) override;

  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
                           AMDGPU::SchedulingPhase Phase) override {
    return true;
  }

  MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
      : IGLPStrategy(DAG, TII) {
    IsBottomUp = false;
  }
};

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
    DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
    AMDGPU::SchedulingPhase Phase) {
  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");
  SmallVector<SUnit *, 6> DSWithPerms;
  for (auto &SU : DAG->SUnits) {
    auto I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))
      ++MFMACount;
    else if (TII->isDS(*I)) {
      if (I->mayLoad())
        ++DSRCount;
      else if (I->mayStore() && IsInitial) {
        ++DSWCount;
        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {
            DSWithPerms.push_back(&SU);
            break;
          }
        }
      }
    }
  }

  if (IsInitial) {
    DSWWithPermCount = DSWithPerms.size();
    auto I = DSWithPerms.begin();
    auto E = DSWithPerms.end();

    // Get the count of DS_WRITES with V_PERM predecessors which have loop
    // carried dependencies (WAR) on the same VMEM_READs. We consider partial
    // overlap as a miss -- in other words, for a given DS_W, we only consider
    // another DS_W as matching if there is a corresponding (in terms of the
    // VMEM_READ opcode) VMEM_READ WAR dep for every V_PERM pred of this DS_W.
    DenseMap<MachineInstr *, SUnit *> VMEMLookup;
    SmallVector<SUnit *, 6> Counted;
    for (; I != E; I++) {
      SUnit *Cand = nullptr;
      bool MissedAny = false;
      for (auto &Pred : (*I)->Preds) {
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
          continue;

        if (Cand && llvm::is_contained(Counted, Cand))
          break;

        for (auto &Succ : Pred.getSUnit()->Succs) {
          auto MI = Succ.getSUnit()->getInstr();
          if (!TII->isVMEM(*MI) || !MI->mayLoad())
            continue;

          if (MissedAny || !VMEMLookup.size()) {
            MissedAny = true;
            VMEMLookup[MI] = *I;
            continue;
          }

          if (!VMEMLookup.contains(MI)) {
            MissedAny = true;
            VMEMLookup[MI] = *I;
            continue;
          }

          Cand = VMEMLookup[MI];
          if (llvm::is_contained(Counted, Cand)) {
            MissedAny = true;
            break;
          }
        }
      }
      if (!MissedAny && Cand) {
        DSWWithSharedVMEMCount += 2;
        Counted.push_back(Cand);
        Counted.push_back(*I);
      }
    }
  }

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
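  // Note: DSWWithSharedVMEMCount advances by 2 because a match counts both
  // the candidate DS_WRITE and the DS_WRITE it shares its VMEM_READ WAR deps
  // with, which is why it can never exceed DSWWithPermCount.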
  SchedGroup *SG = nullptr;
  unsigned PipelineSyncID = 0;
  // For kernels with V_PERM, there are enough VALU to mix in between MFMAs.
  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    }
  }

  PipelineSyncID = 1;
  // Phase 1: Break up DS_READ and MFMA clusters.
  // First DS_READ to compute SGEMM elements.
  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSRCount - 4; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2a: Loop carried dependency with V_PERM; the VMEM_READs are private
  // to one DS_WRITE.
  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2b: Loop carried dependency without V_PERM.
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  // Phase 2c: Loop carried dependency with V_PERM; the VMEM_READs are
  // ultimately used by two DS_WRITEs.
  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
  }

  return true;
}
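// Phase 2 thus has three shapes: 2a for DS_WRITEs whose V_PERMs feed a
// private VMEM_READ set, 2b for DS_WRITEs with no V_PERM predecessor, and 2c
// for pairs of DS_WRITEs whose VMEM_READs are shared, which doubles the
// VALU/DS_WRITE prologue before the VMEM_READ/MFMA interleave.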
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {
  switch (ID) {
  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  case MFMAExpInterleave:
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
  }

  llvm_unreachable("Unknown IGLPStrategyID");
}
class IGroupLPDAGMutation : public ScheduleDAGMutation {
private:
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  // Organize lists of SchedGroups by their SyncID. SchedGroups /
  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
  // between them.
  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  // Used to track instructions that can be mapped to multiple sched groups.
  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  // Add DAG edges that enforce SCHED_BARRIER ordering.
  void addSchedBarrierEdges(SUnit &SU);

  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that
  // should not be reordered across the SCHED_BARRIER.
  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  // Create SchedGroups for a SCHED_GROUP_BARRIER.
  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  bool initIGLPOpt(SUnit &SU);

public:
  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  // The direction in which the PipelineSolver processes the candidate
  // SchedGroups of each conflicted instruction.
  bool IsBottomUp = true;

  // The scheduling phase this mutation is being applied to.
  AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;

  IGroupLPDAGMutation() = default;
  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
};

unsigned SchedGroup::NumSchedGroups = 0;
bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
  if (A != B && DAG->canAddEdge(B, A)) {
    DAG->addEdge(B, SDep(A, SDep::Artificial));
    return true;
  }
  return false;
}

bool SchedGroup::canAddMI(const MachineInstr &MI) const {
  bool Result = false;
  if (MI.isMetaInstruction())
    Result = false;

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
            TII->isTRANS(MI)))
    Result = true;

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
           TII->isSALU(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() &&
           (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
           TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))
    Result = true;

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
           TII->isTRANS(MI))
    Result = true;

  LLVM_DEBUG(
      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);

  return Result;
}
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    if (DAG->IsReachable(B, A))
      continue;

    // tryAddEdge returns false if there is a dependency that makes adding
    // the A->B edge impossible, otherwise it returns true.
    bool Added = tryAddEdge(A, B);
    if (Added)
      AddedEdges.emplace_back(A, B);
    else
      ++MissedEdges;
  }

  return MissedEdges;
}
void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
      continue;
    if (MakePred)
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}
void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {
    SUnit *B = &SU;
    if (P(A, B))
      std::swap(A, B);

    tryAddEdge(A, B);
  }
}
void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)
    link(*B);
}
bool SchedGroup::canAddSU(SUnit &SU) const {
  MachineInstr &MI = *SU.getInstr();
  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  // Special case for bundled MIs.
  const MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
  while (E != MBB->end() && E->isBundledWithPred())
    ++E;

  // Return true if all of the bundled MIs can be added to this group.
  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
}
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {
    if (isFull())
      break;

    if (canAddSU(SU))
      add(SU);
  }
}
void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
    auto &SU = *RIter;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }

  add(InitSU);
  assert(MaxSize);
  (*MaxSize)++;
}
void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();
  auto E = DAG->SUnits.rend();
  for (; I != E; ++I) {
    auto &SU = *I;
    if (isFull())
      break;

    if (canAddSU(SU))
      SyncedInstrs[&SU].push_back(SGID);
  }
}
void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
  if (!TSchedModel || DAGInstrs->SUnits.empty())
    return;

  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
  SyncedSchedGroups.clear();
  SyncedInstrs.clear();
  bool FoundSB = false;
  bool FoundIGLP = false;
  bool ShouldApplyIGLP = false;
  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
    unsigned Opc = R->getInstr()->getOpcode();
    // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
    if (Opc == AMDGPU::SCHED_BARRIER) {
      addSchedBarrierEdges(*R);
      FoundSB = true;
    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
      initSchedGroupBarrierPipelineStage(R);
      FoundSB = true;
    } else if (Opc == AMDGPU::IGLP_OPT) {
      resetEdges(*R, DAG);
      if (!FoundSB && !FoundIGLP) {
        FoundIGLP = true;
        ShouldApplyIGLP = initIGLPOpt(*R);
      }
    }
  }

  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
    // PipelineSolver performs the mutation by adding the edges it determined
    // as the best.
    PS.solve();
    return;
  }
}
void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
  MachineInstr &MI = *SchedBarrier.getInstr();
  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
  // Remove all existing edges from the SCHED_BARRIER that were added due to
  // the instruction having side effects.
  resetEdges(SchedBarrier, DAG);
  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
                    << MI.getOperand(0).getImm() << "\n");
  auto InvertedMask =
      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
  SG.initSchedGroup();

  // Preserve original instruction ordering relative to the SCHED_BARRIER.
  SG.link(
      SchedBarrier,
      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
}
SchedGroupMask
IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
  // Invert the mask and erase bits for types of instructions that are implied
  // to be allowed past the SCHED_BARRIER.
  SchedGroupMask InvertedMask = ~Mask;

  // ALU implies VALU, SALU, MFMA, TRANS.
  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
  // VALU, SALU, MFMA, TRANS imply ALU.
  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::ALU;

  // VMEM implies VMEM_READ, VMEM_WRITE.
  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
  // VMEM_READ, VMEM_WRITE imply VMEM.
  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::VMEM;

  // DS implies DS_READ, DS_WRITE.
  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
  // DS_READ, DS_WRITE imply DS.
  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
    InvertedMask &= ~SchedGroupMask::DS;

  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
                    << "\n");

  return InvertedMask;
}
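// Worked example (mask bits per the SchedGroupMask definition above): a
// SCHED_BARRIER with mask 0x2 allows only VALU across it. Inverting gives
// everything-but-VALU, and since VALU may cross, the combined ALU bit is
// also cleared so the pinned group cannot capture VALU instructions through
// the umbrella ALU classification.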
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
    std::vector<SUnit>::reverse_iterator RIter) {
  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added
  // due to the instruction having side effects.
  resetEdges(*RIter, DAG);
  MachineInstr &SGB = *RIter->getInstr();
  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
  int32_t SGMask = SGB.getOperand(0).getImm();
  int32_t Size = SGB.getOperand(1).getImm();
  int32_t SyncID = SGB.getOperand(2).getImm();

  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
                                                    Size, SyncID, DAG, TII);

  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
}
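// For reference, the operands decoded above come straight from the intrinsic.
// Illustrative kernel IR (not part of this file) requesting "2 DS reads then
// 1 MFMA" in sync pipeline 0:
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) ; DS_READ
//   call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)   ; MFMA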
bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
  IGLPStrategyID StrategyID =
      (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
  auto S = createIGLPStrategy(StrategyID, DAG, TII);
  if (!S->shouldApplyStrategy(DAG, Phase))
    return false;

  IsBottomUp = S->IsBottomUp;
  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
}
} // namespace

/// \p Phase specifies whether or not this is a reentry into the
/// IGroupLPDAGMutation.
std::unique_ptr<ScheduleDAGMutation>
llvm::createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
  return std::make_unique<IGroupLPDAGMutation>(Phase);
}
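// Typical registration (a sketch; the actual call sites live in the AMDGPU
// scheduler setup code, not in this file):
//   DAG->addMutation(
//       createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));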