#define DEBUG_TYPE "igrouplp"

    cl::desc("Whether to use the exponential time solver to fit "
             "the instructions to the pipeline as closely as "

    cl::desc("The maximum number of scheduling group conflicts "
             "which we attempt to solve with the exponential time "
             "exact solver. Problem sizes greater than this will "
             "be solved by the less accurate greedy algorithm. Selecting "
             "solver by size is superseded by manually selecting "
             "the solver (e.g. by amdgpu-igrouplp-exact-solver)"));

    cl::desc("The number of branches that we are willing to explore with "
             "the exact algorithm before giving up."));

    cl::desc("Whether to use the cost heuristic to make choices as we "
             "traverse the search space using the exact solver. Defaulted "
             "to on, and if turned off, we will use the node order -- "
             "attempting to put the later nodes in the later sched groups. "
             "Experimentally, results are mixed, so this should be set on a "
             "case-by-case basis."));
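// The options above control the PipelineSolver defined below: whether the
// exponential-time exact solver is forced on, the problem-size cutoff under
// which it is used automatically, the branch budget for the exact search, and
// whether the cost heuristic guides the order in which branches are explored.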
 
enum class SchedGroupMask {

  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
        DS_READ | DS_WRITE | TRANS,
 
class InstructionRule {

  std::optional<SmallVector<SUnit *, 4>> Cache;

                  bool NeedsCache = false)

  virtual ~InstructionRule() = default;

  SchedGroupMask SGMask;

  std::optional<unsigned> MaxSize;

  static unsigned NumSchedGroups;

  bool canAddSU(SUnit &SU) const;

  void link(SUnit &SU, bool MakePred = false);

  int link(SUnit &SU, bool MakePred,
           std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  void link(SchedGroup &OtherGroup);

  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
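  // A SchedGroup collects SUnits that match SGMask, up to MaxSize
  // instructions; isFull() gates further additions once that capacity is hit.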
 
  void addRule(std::shared_ptr<InstructionRule> NewRule) {

  bool allowedByRules(const SUnit *SU,

    for (auto &Rule : Rules) {
      if (!Rule->apply(SU, Collection, SyncPipe))

  void add(SUnit &SU) {

                      << format_hex((int)SGMask, 10, true) << " adding "

  void pop() { Collection.pop_back(); }

  void initSchedGroup();

  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                      SUnitsToCandidateSGsMap &SyncedInstrs);

  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);

  int getSyncID() { return SyncID; }

  int getSGID() { return SGID; }

  SchedGroupMask getMask() { return SGMask; }
 
  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
      : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;

  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
      : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
    SGID = NumSchedGroups++;

using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
 
class PipelineSolver {

  bool NeedsSolver = false;

  unsigned computeProblemSize();

  int CurrConflInstNo = 0;

  int CurrSyncGroupIdx = 0;

  int BeginSyncGroupIdx = 0;

  bool IsBottomUp = true;
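  // Position tracking for the solver: CurrSyncGroupIdx/CurrConflInstNo walk
  // the per-sync-ID conflict lists, and IsBottomUp selects whether candidate
  // SchedGroups are tried from the bottom or the top of each pipeline.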
 
  void advancePosition();

  void retreatPosition();

  template <typename T>
  void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,

  template <typename T>

  template <typename T> void linkSchedGroups(T I, T E);
 
               std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  template <typename T>
  int linkSUnit(SUnit *SU, int SGID,
                std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);

  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);

  void convertSyncMapsToArrays();

      : DAG(DAG), SyncedInstrs(SyncedInstrs),
        SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {

    for (auto &PipelineInstrs : SyncedInstrs) {
      if (PipelineInstrs.second.size() > 0) {

    convertSyncMapsToArrays();

    CurrPipeline = BestPipeline;

    while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[BeginSyncGroupIdx].size() == 0)

    if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
 
void PipelineSolver::reset() {

  for (auto &SyncPipeline : CurrPipeline) {
    for (auto &SG : SyncPipeline) {

      SG.Collection.clear();

      if (SchedBarr != TempCollection.end())
        SG.Collection.push_back(*SchedBarr);

  CurrSyncGroupIdx = BeginSyncGroupIdx;
 
void PipelineSolver::convertSyncMapsToArrays() {
  for (auto &SyncPipe : SyncedSchedGroups) {
    BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);

  int PipelineIDx = SyncedInstrs.size() - 1;
  PipelineInstrs.resize(SyncedInstrs.size());
  for (auto &SyncInstrMap : SyncedInstrs) {
    for (auto &SUsToCandSGs : SyncInstrMap.second) {
      if (PipelineInstrs[PipelineIDx].size() == 0) {
        PipelineInstrs[PipelineIDx].push_back(
            std::pair(SUsToCandSGs.first, SUsToCandSGs.second));

      auto *SortPosition = PipelineInstrs[PipelineIDx].begin();

      while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
             SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)

      PipelineInstrs[PipelineIDx].insert(
          SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
 
template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
  for (; I != E; ++I) {

    for (auto J = std::next(I); J != E; ++J) {

void PipelineSolver::makePipeline() {

  for (auto &SyncPipeline : BestPipeline) {
    for (auto &SG : SyncPipeline) {

      SUnit *SGBarr = nullptr;
      for (auto &SU : SG.Collection) {
        if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

      SG.link(*SGBarr, false);

  for (auto &SyncPipeline : BestPipeline) {
    IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
               : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
 
int PipelineSolver::linkSUnit(
    SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,

  bool MakePred = false;

    if (I->getSGID() == SGID) {

    AddedCost += Group.link(*SU, MakePred, AddedEdges);

int PipelineSolver::addEdges(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {

  return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
                    : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
 
void PipelineSolver::removeEdges(
    const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {

  for (auto &PredSuccPair : EdgesToRemove) {
    SUnit *Pred = PredSuccPair.first;
    SUnit *Succ = PredSuccPair.second;

        Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
    if (Match != Succ->Preds.end()) {
      assert(Match->isArtificial());
 
void PipelineSolver::advancePosition() {

  if (static_cast<size_t>(CurrConflInstNo) >=
      PipelineInstrs[CurrSyncGroupIdx].size()) {

    while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
           PipelineInstrs[CurrSyncGroupIdx].size() == 0)

void PipelineSolver::retreatPosition() {
  assert(CurrConflInstNo >= 0);
  assert(CurrSyncGroupIdx >= 0);

  if (CurrConflInstNo > 0) {

  if (CurrConflInstNo == 0) {
    if (CurrSyncGroupIdx == BeginSyncGroupIdx)

    while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)

    CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
 
bool PipelineSolver::checkOptimal() {
  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
    if (BestCost == -1 || CurrCost < BestCost) {
      BestPipeline = CurrPipeline;

  bool DoneExploring = false;
  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
    DoneExploring = true;

  return (DoneExploring || BestCost == 0);
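// checkOptimal() records the current assignment as the best pipeline seen so
// far whenever every conflicting instruction has been placed at a lower cost.
// It returns true when the search can stop: either a zero-cost (perfect) fit
// was found, or the MaxBranchesExplored budget has been exhausted.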
 
void PipelineSolver::populateReadyList(

  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
  assert(CurrSU.second.size() >= 1);

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;

      if (Match->isFull()) {
        ReadyList.push_back(std::pair(*I, MissPenalty));

      int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
      ReadyList.push_back(std::pair(*I, TempCost));
      removeEdges(AddedEdges);

      ReadyList.push_back(std::pair(*I, -1));

  assert(ReadyList.size() == CurrSU.second.size());
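// Each ReadyList entry pairs a candidate SchedGroup ID with the cost of
// assigning the current SUnit to it: full groups get the MissPenalty, and a
// cost of -1 means "not evaluated" (the path taken when the cost heuristic is
// disabled), leaving candidates in plain node order.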
 
bool PipelineSolver::solveExact() {

  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())

  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
  assert(static_cast<size_t>(CurrConflInstNo) <
         PipelineInstrs[CurrSyncGroupIdx].size());
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];

                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
                                 CurrSU.second.rend())
             : populateReadyList(ReadyList, CurrSU.second.begin(),
                                 CurrSU.second.end());
 
  auto *I = ReadyList.begin();
  auto *E = ReadyList.end();
  for (; I != E; ++I) {

    if (BestCost != -1 && (CurrCost + I->second > BestCost))

    int CandSGID = I->first;

    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
    auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];

    for (auto &SG : SyncPipeline) {
      if (SG.getSGID() == CandSGID)

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline))

                      << (int)Match->getMask() << " and ID " << CandSGID

    Match->add(*CurrSU.first);
    AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
    CurrCost += AddedCost;

    bool FinishedExploring = false;

    if (CurrCost < BestCost || BestCost == -1) {
        FinishedExploring = BestCost != 0;
        if (!FinishedExploring)

    CurrCost -= AddedCost;
    removeEdges(AddedEdges);

    CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
    if (FinishedExploring)

  CurrCost += MissPenalty;

  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");

  bool FinishedExploring = false;
  if (CurrCost < BestCost || BestCost == -1) {
      bool FinishedExploring = BestCost != 0;
      if (!FinishedExploring)

  CurrCost -= MissPenalty;
  return FinishedExploring;
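// solveExact() is the exhaustive branch-and-bound search: for the current
// conflicting SUnit it tries every candidate SchedGroup, plus the "leave it
// unassigned" branch that costs MissPenalty, and prunes any branch whose
// running cost already exceeds the best complete pipeline found so far.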
 
void PipelineSolver::greedyFind(
    std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
  int BestNodeCost = -1;

  SchedGroup *BestGroup = nullptr;
  int BestGroupID = -1;
  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];

                    << ") in Pipeline # " << CurrSyncGroupIdx << "\n");

  for (; I != E; ++I) {
    std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

    SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
      return SG.getSGID() == CandSGID;

    LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
                      << (int)Match->getMask() << "\n");

    if (Match->isFull()) {

    if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
      LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");

    TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);

    if (TempCost < BestNodeCost || BestNodeCost == -1) {
      BestNodeCost = TempCost;
      BestGroupID = CandSGID;

    removeEdges(AddedEdges);
    if (BestNodeCost == 0)

  if (BestGroupID != -1) {
    BestGroup->add(*CurrSU.first);
    addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
    LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
                      << (int)BestGroup->getMask() << "\n");
    BestCost += TempCost;

    BestCost += MissPenalty;

  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
 
bool PipelineSolver::solveGreedy() {

  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;

  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
    SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];

        ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
        : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());

  BestPipeline = CurrPipeline;
  removeEdges(AddedEdges);
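// solveGreedy() makes a single pass over the conflicts, assigning each SUnit
// to the candidate group with the lowest immediate edge cost; it never
// backtracks, so it is fast but may miss the globally optimal assignment that
// solveExact() would find.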
 
unsigned PipelineSolver::computeProblemSize() {
  unsigned ProblemSize = 0;
  for (auto &PipeConflicts : PipelineInstrs) {
    ProblemSize += PipeConflicts.size();

void PipelineSolver::solve() {

  unsigned ProblemSize = computeProblemSize();

  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
  MissPenalty = (ProblemSize / 2) + 1;

  if (EnableExactSolver || BelowCutoff) {

    LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");

      LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
 
enum IGLPStrategyID : int {
  MFMASmallGemmOptID = 0,
  MFMASmallGemmSingleWaveOptID = 1,
  MFMAExpInterleaveID = 2,
  MFMAExpSimpleInterleaveID = 3

  virtual bool applyIGLPStrategy(

  bool IsBottomUp = true;

  virtual ~IGLPStrategy() = default;
 
class MFMASmallGemmOpt final : public IGLPStrategy {

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMASmallGemmOpt::applyIGLPStrategy(

  unsigned MFMACount = 0;

    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  SchedGroup *SG = nullptr;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
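// MFMASmallGemmOpt builds one synced pipeline of repeating {2 x DS, 1 x MFMA}
// SchedGroups (MFMACount * 3 iterations), interleaving LDS traffic with MFMAs.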
 
class MFMAExpInterleaveOpt final : public IGLPStrategy {

  static unsigned TransPipeCount;

  static unsigned MFMAPipeCount;

  static unsigned AddPipeCount;

  static unsigned MFMAEnablement;

  static unsigned ExpRequirement;

  static unsigned MFMAChains;

  static unsigned MFMAChainLength;

  static bool HasChainBetweenCvt;

  static std::optional<unsigned> FirstPipeDSR;
 
  class IsPipeExp final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.rbegin();
        auto E = DAG->SUnits.rend();
        for (; I != E; I++) {
          if (TII->isMFMAorWMMA(*I->getInstr()))
            Cache->push_back(&*I);

      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));

    IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class EnablesNthMFMA final : public InstructionRule {

      bool FoundTrans = false;
      unsigned Counter = 1;
      auto *DAG = SyncPipe[0].DAG;

      if (Cache->empty()) {
        auto I = DAG->SUnits.begin();
        auto E = DAG->SUnits.end();
        for (; I != E; I++) {
          if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
              Cache->push_back(&*I);

          if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));

                   bool NeedsCache = false)
 
  class EnablesNthMFMAInChain final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;

          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

        Cache->push_back(TempSU);

      return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));

    EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
                          bool NeedsCache = false)
          ChainSeed(ChainSeed) {}
 
  class LessThanNSuccs final : public InstructionRule {

    bool HasIntermediary = false;

      if (!SyncPipe.size())

        return Succ.getKind() == SDep::Data;

      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {

                return SuccSucc.getKind() == SDep::Data;

          if (SuccSize >= Size)

                   bool HasIntermediary = false, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
 
  class GreaterThanOrEqualToNSuccs final : public InstructionRule {

    bool HasIntermediary = false;

      if (!SyncPipe.size())

        return Succ.getKind() == SDep::Data;

      if (SuccSize >= Size)

      if (HasIntermediary) {
        for (auto Succ : SU->Succs) {

                return SuccSucc.getKind() == SDep::Data;

          if (SuccSize >= Size)

                               unsigned SGID, bool HasIntermediary = false,
                               bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Size(Size),
          HasIntermediary(HasIntermediary) {}
 
  class IsCvt final : public InstructionRule {

      return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
             Opc == AMDGPU::V_CVT_I32_F32_e32;

    IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsFMA final : public InstructionRule {

    IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  class IsPipeAdd final : public InstructionRule {

    IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class IsSuccOfPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      for (auto &OtherEle : OtherGroup->Collection) {
        for (auto &Succ : OtherEle->Succs) {
          if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)

                         unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
 
  class IsReachableFromPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - Distance)
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      auto *DAG = SyncPipe[0].DAG;

      for (auto &OtherEle : OtherGroup->Collection)
        if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))

    IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                                unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
 
  class OccursAtOrAfterNode final : public InstructionRule {

                        bool NeedsCache = false)

  class IsExactMFMA final : public InstructionRule {

      if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))

      if (Cache->empty()) {
        auto *TempSU = ChainSeed;

          for (auto &Succ : TempSU->Succs) {
            if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
              TempSU = Succ.getSUnit();

        Cache->push_back(TempSU);

      return (*Cache)[0] == SU;

                unsigned SGID, bool NeedsCache = false)
          ChainSeed(ChainSeed) {}
 
  class OccursAfterExp final : public InstructionRule {

      auto *DAG = SyncPipe[0].DAG;
      if (Cache->empty()) {
        for (auto &SU : DAG->SUnits)
            Cache->push_back(&SU);

      return SU->NodeNum > (*Cache)[0]->NodeNum;

                   bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {
 
unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
bool MFMAExpInterleaveOpt::HasCvt = false;
bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
 
  auto isBitPack = [](unsigned Opc) {
    return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;

  auto isCvt = [](unsigned Opc) {
    return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;

  auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };

  for (SUnit &SU : DAG->SUnits) {

      if (SU.Succs.size() >= 7)

      for (auto &Succ : SU.Succs) {
        if (Succ.getSUnit()->Succs.size() >= 7)

  if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
 
  std::optional<SUnit *> TempMFMA;
  std::optional<SUnit *> TempExp;

  for (auto &PredSU : ExpPipeCands) {
    for (auto &SuccSU : MFMAPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {

  if (!(TempExp && TempMFMA))

  HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
    return isCvt(Succ.getSUnit()->getInstr()->getOpcode());

  for (auto &SuccSU : MFMAPipeCands) {
    if (MFMAPipeSUs.size() &&
        any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
          return PotentialMatch->NodeNum == SuccSU->NodeNum;

    for (auto &PredSU : ExpPipeCands) {
      if (DAG->IsReachable(SuccSU, PredSU)) {

  MFMAPipeCount = MFMAPipeSUs.size();

  assert(TempExp && TempMFMA);
  assert(MFMAPipeCount > 0);
 
  std::optional<SUnit *> TempCvt;
  for (auto &SuccSU : CvtSUs) {
    if (DAG->IsReachable(SuccSU, *TempExp)) {

  if (TempCvt.has_value()) {
    for (auto &SuccSU : MFMAPipeSUs) {
      if (DAG->IsReachable(SuccSU, *TempCvt)) {

  for (auto &MFMAPipeSU : MFMAPipeSUs) {

          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());

      MFMAChainSeeds.push_back(MFMAPipeSU);

  for (auto Pred : MFMAChainSeeds[0]->Preds) {
    if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
        Pred.getSUnit()->getInstr()->mayLoad())
      FirstPipeDSR = Pred.getSUnit()->NodeNum;

  MFMAChainLength = MFMAPipeCount / MFMAChains;
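  // The analysis above partitions the MFMA pipe into independent chains:
  // MFMAChainSeeds holds the MFMAs that start a chain, MFMAChains counts
  // them, and MFMAChainLength is the resulting per-chain depth used later to
  // decide which MFMA each TRANS/FMA group must "enable".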
 
  unsigned PackSuccCount =
        return DAG->IsReachable(VPack, *TempExp);

  unsigned PackPredCount =
        auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
        return isBitPack(Opc);

    auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
    return isBitPack(Opc);

  if (PackPred == (*TempMFMA)->Preds.end())

        return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());

  MFMAEnablement *= PackSuccCount;

        return DAG->IsReachable(PackPred->getSUnit(), ExpBase);

  ExpRequirement *= PackPredCount;

    MFMAChainSeeds.clear();
 1526bool MFMAExpInterleaveOpt::applyIGLPStrategy(
 
 1531  bool IsSmallKernelType =
 
 1532      MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
 
 1533  bool IsLargeKernelType =
 
 1534      MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
 
 1536  if (!(IsSmallKernelType || IsLargeKernelType))
 
 1542  unsigned PipelineSyncID = 0;
 
 1543  SchedGroup *SG = 
nullptr;
 
  unsigned MFMAChain = 0;
  unsigned PositionInChain = 0;
  unsigned CurrMFMAForTransPosition = 0;

  auto incrementTransPosition = [&MFMAChain, &PositionInChain,
                                 &CurrMFMAForTransPosition]() {
    CurrMFMAForTransPosition += MFMAEnablement;
    PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
    MFMAChain = CurrMFMAForTransPosition % MFMAChains;

  auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return (TempMFMAForTrans / MFMAChains);

  auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
    auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
    return TempMFMAForTrans % MFMAChains;

  unsigned CurrMFMAPosition = 0;
  unsigned MFMAChainForMFMA = 0;
  unsigned PositionInChainForMFMA = 0;

  auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
                                &PositionInChainForMFMA]() {
    MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
    PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;

  assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
 
  bool UsesFMA = IsSmallKernelType || !IsPostRA;
  bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
  bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
  bool UsesVALU = IsSmallKernelType;
 
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

          std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains) {
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          getNextTransPositionInChain(),
          MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));

      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
  if (!IsPostRA && MFMAChains)
    SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
        PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));

    SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                               HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
  incrementTransPosition();

  for (unsigned I = 0; I < ExpRequirement; I++) {

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
      if (HasChainBetweenCvt)
        SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));

        SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
            1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains) {
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            getNextTransPositionInChain(),
            MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));

        SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
                                                     TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
          PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

      SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
                                                   SG->getSGID(), true));
    SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                 HasChainBetweenCvt));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
  SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
      8, TII, SG->getSGID(), HasChainBetweenCvt));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
  unsigned MFMARatio =
      MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;

      MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;

  unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
                              ? TransPipeCount - (2 * ExpRequirement)

  unsigned ExpLoopCount = RemainingExp / ExpRatio;

  unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
                            ? MFMAPipeCount - (MFMAEnablement * 2)

  unsigned MFMALoopCount = MFMAInLoop / MFMARatio;

      AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
  unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
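  // The steady-state loop is sized so that neither pipe runs dry: MFMARatio
  // and ExpRatio express how many MFMAs are issued per batch of exp (TRANS)
  // instructions, and LoopSize takes the smaller of the two remaining-work
  // counts so the loop never schedules more MFMAs or exps than the DAG has.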
 
  for (unsigned I = 0; I < LoopSize; I++) {
    if (!(I * ExpRatio % ExpRequirement))
      incrementTransPosition();

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
    if (!IsPostRA && MFMAChains)
      SG->addRule(std::make_shared<IsExactMFMA>(
          PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
          SG->getSGID(), true));

      SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
    incrementMFMAPosition();

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    if (UsesDSRead && !(I % 4)) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
      SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    for (unsigned J = 0; J < ExpRatio; J++) {
      auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
      auto MaxMFMAOffset =
          (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;

        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
        auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
        auto DSROffset = I / 4 + 1;
        auto MaxDSROffset = MaxMFMAOffset / 4;

        auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
        auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
                             std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +

        if (HasChainBetweenCvt)
          SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
              CurrentOffset, TII, SG->getSGID()));

          SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,

        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

        SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
            SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
        if (!IsPostRA && MFMAChains)
          SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
              getNextTransPositionInChain(),
              MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),

          SG->addRule(std::make_shared<EnablesNthMFMA>(
              (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
              TII, SG->getSGID(), true));
        SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
        SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
      if (!IsPostRA && MFMAChains)
        SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
            PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),

        SG->addRule(std::make_shared<EnablesNthMFMA>(
            (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
            TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
      SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
                                                   HasChainBetweenCvt));
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {

  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(

  unsigned MFMACount = 0;

    if (TII->isMFMAorWMMA(I))

  const unsigned PipelineSyncID = 0;
  for (unsigned I = 0; I < MFMACount * 3; ++I) {
    SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
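// MFMAExpSimpleInterleaveOpt is the minimal form of the interleave: it simply
// alternates {1 x TRANS, 1 x MFMA} groups, with no chain or offset rules.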
 
class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {

  class EnablesInitialMFMA final : public InstructionRule {

      if (!SyncPipe.size())

      if (!Cache->size()) {
        for (auto &Elt : SyncPipe[0].DAG->SUnits) {
          if (TII->isMFMAorWMMA(*Elt.getInstr())) {
            Cache->push_back(&Elt);

      auto *DAG = SyncPipe[0].DAG;
      for (auto &Elt : *Cache) {

                       bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class IsPermForDSW final : public InstructionRule {

      if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)

      bool FitsInGroup = false;

      if (!Collection.size()) {
        for (auto &Succ : SU->Succs) {
          SUnit *SuccUnit = Succ.getSUnit();

            Cache->push_back(SuccUnit);

          return ThisSucc.getSUnit() == Elt;

    IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class IsSuccOfPrevGroup final : public InstructionRule {

      SchedGroup *OtherGroup = nullptr;
      for (auto &PipeSG : SyncPipe) {
        if ((unsigned)PipeSG.getSGID() == SGID - 1) {
          OtherGroup = &PipeSG;

      if (!OtherGroup->Collection.size())

      return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
        return any_of(Elt->Succs,
                      [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });

                      bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class VMEMSize final : public InstructionRule {

      if (MI->getOpcode() == TargetOpcode::BUNDLE)

      if (!Collection.size())

      auto TRI = TII->getRegisterInfo();
      auto &MRI = MI->getParent()->getParent()->getRegInfo();
      for (auto &Elt : Collection) {
        auto Op = Elt->getInstr()->getOperand(0);
            TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));

      if (NumBits < 128) {
        if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
                          MRI, MI->getOperand(0))) <=

    VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache) {}
 
  class SharesPredWithPrevNthGroup final : public InstructionRule {

    unsigned Distance = 1;

      SchedGroup *OtherGroup = nullptr;
      if (!SyncPipe.size())

      if (!Cache->size()) {

        for (auto &PipeSG : SyncPipe) {
          if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
            OtherGroup = &PipeSG;

        if (!OtherGroup->Collection.size())

        for (auto &OtherEle : OtherGroup->Collection) {
          for (auto &Pred : OtherEle->Preds) {
            if (Pred.getSUnit()->getInstr()->getOpcode() ==
                AMDGPU::V_PERM_B32_e64)
              Cache->push_back(Pred.getSUnit());

      auto *DAG = SyncPipe[0].DAG;

    SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
                               unsigned SGID, bool NeedsCache = false)
        : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
 
  bool applyIGLPStrategy(

      : IGLPStrategy(DAG, TII) {

static unsigned DSWCount = 0;
static unsigned DSWWithPermCount = 0;
static unsigned DSWWithSharedVMEMCount = 0;
 
bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
    DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,

  unsigned MFMACount = 0;
  unsigned DSRCount = 0;

  bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;

  assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
                         DSWWithSharedVMEMCount == 0)) &&
         "DSWCounters should be zero in pre-RA scheduling!");

  for (auto &SU : DAG->SUnits) {
    auto *I = SU.getInstr();
    if (TII->isMFMAorWMMA(*I))

    else if (TII->isDS(*I)) {

      else if (I->mayStore() && IsInitial) {
        for (auto Pred : SU.Preds) {
          if (Pred.getSUnit()->getInstr()->getOpcode() ==
              AMDGPU::V_PERM_B32_e64) {

    DSWWithPermCount = DSWithPerms.size();
    auto *I = DSWithPerms.begin();
    auto *E = DSWithPerms.end();

    DenseMap<MachineInstr *, SUnit *> VMEMLookup;

    for (; I != E; I++) {
      SUnit *Cand = nullptr;
      bool MissedAny = false;
      for (auto &Pred : (*I)->Preds) {
        if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)

        for (auto &Succ : Pred.getSUnit()->Succs) {
          auto *MI = Succ.getSUnit()->getInstr();
          if (!TII->isVMEM(*MI) || !MI->mayLoad())

          if (MissedAny || !VMEMLookup.size()) {
            VMEMLookup[MI] = *I;

      if (!MissedAny && Cand) {
        DSWWithSharedVMEMCount += 2;

  assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
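  // At this point the DS writes have been classified: DSWCount is the total,
  // DSWWithPermCount counts those fed by a V_PERM, and DSWWithSharedVMEMCount
  // counts (in pairs) those whose V_PERM predecessors share VMEM-load inputs
  // with another DS write.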
 
  unsigned PipelineSyncID = 0;

  if (DSWWithPermCount) {
    for (unsigned I = 0; I < MFMACount; I++) {
      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

      SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
          SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
      SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
  SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
      SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

  for (unsigned I = 0; I < DSRCount - 4; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
  for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        1, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        3, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
  for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
  for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        2, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
    SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
        4, TII, SG->getSGID(), true));
    SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);

    SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
        SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
    SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
 
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
                   const SIInstrInfo *TII) {

  case MFMASmallGemmOptID:
    return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
  case MFMASmallGemmSingleWaveOptID:
    return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
  case MFMAExpInterleaveID:
    return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
  case MFMAExpSimpleInterleaveID:
    return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
 
class IGroupLPDAGMutation : public ScheduleDAGMutation {

  const SIInstrInfo *TII;

  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;

  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;

  void addSchedBarrierEdges(SUnit &SU);

  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;

  void initSchedGroupBarrierPipelineStage(
      std::vector<SUnit>::reverse_iterator RIter);

  bool initIGLPOpt(SUnit &SU);

  void apply(ScheduleDAGInstrs *DAGInstrs) override;

  bool IsBottomUp = true;

  IGroupLPDAGMutation() = default;
 
unsigned SchedGroup::NumSchedGroups = 0;

bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {

bool SchedGroup::canAddMI(const MachineInstr &MI) const {

  if (MI.isMetaInstruction())

  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
           TII->isMFMAorWMMA(MI))

  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isVMEM(MI))

  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isVMEM(MI))

  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&

  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
           MI.mayLoad() && TII->isDS(MI))

  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
           MI.mayStore() && TII->isDS(MI))

  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&

      dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
             << (Result ? " could classify " : " unable to classify ") << MI);
 
int SchedGroup::link(SUnit &SU, bool MakePred,
                     std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
  int MissedEdges = 0;
  for (auto *A : Collection) {

    if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

    bool Added = tryAddEdge(A, B);

      AddedEdges.emplace_back(A, B);
 
void SchedGroup::link(SUnit &SU, bool MakePred) {
  for (auto *A : Collection) {

    if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)

void SchedGroup::link(SUnit &SU,
                      function_ref<bool(const SUnit *A, const SUnit *B)> P) {
  for (auto *A : Collection) {

void SchedGroup::link(SchedGroup &OtherGroup) {
  for (auto *B : OtherGroup.Collection)

bool SchedGroup::canAddSU(SUnit &SU) const {

  if (MI.getOpcode() != TargetOpcode::BUNDLE)
    return canAddMI(MI);

  const MachineBasicBlock *MBB = MI.getParent();

  while (E != MBB->end() && E->isBundledWithPred())

  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
 
void SchedGroup::initSchedGroup() {
  for (auto &SU : DAG->SUnits) {

void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
                                SUnitsToCandidateSGsMap &SyncedInstrs) {
  SUnit &InitSU = *RIter;
  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {

      SyncedInstrs[&SU].push_back(SGID);

void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
  auto I = DAG->SUnits.rbegin();

  for (; I != E; ++I) {

      SyncedInstrs[&SU].push_back(SGID);
 
 2557void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
 2558  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
 2559  if (!TSchedModel || DAGInstrs->SUnits.empty())
 2564  TII = ST.getInstrInfo();
 2565  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
 2566  SyncedSchedGroups.clear();
 2567  SyncedInstrs.clear();
 2568  bool FoundSB = false;
 2569  bool FoundIGLP = false;
 2570  bool ShouldApplyIGLP = false;
 2571  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
 2572    unsigned Opc = R->getInstr()->getOpcode();
 2574    if (Opc == AMDGPU::SCHED_BARRIER) {
 2575      addSchedBarrierEdges(*R);
 2577    } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
 2578      initSchedGroupBarrierPipelineStage(R);
 2580    } else if (Opc == AMDGPU::IGLP_OPT) {
 2581      if (!FoundSB && !FoundIGLP) {
 2583        ShouldApplyIGLP = initIGLPOpt(*R);
 2588  if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
 2589    PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
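For orientation, the SCHED_BARRIER, SCHED_GROUP_BARRIER, and IGLP_OPT pseudos this loop keys on originate from source-level builtins. A hedged kernel-side sketch follows; the concrete mask values assume the SchedGroupMask bit layout declared earlier in this file, and the sizes and sync IDs are arbitrary example values:

// Illustrative HIP/OpenCL C++ for an AMDGPU target; not part of this pass.
void pipelineHint() {
  // Ask for 1 MFMA followed by 2 DS reads within sync group 0
  // (0x008 = MFMA, 0x100 = DS_READ, assuming the enum layout above).
  __builtin_amdgcn_sched_group_barrier(0x008, /*Size=*/1, /*SyncID=*/0);
  __builtin_amdgcn_sched_group_barrier(0x100, /*Size=*/2, /*SyncID=*/0);
  // A plain sched_barrier with mask 0 forbids any instruction from being
  // scheduled across this point.
  __builtin_amdgcn_sched_barrier(0);
}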
 
 2597void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
 2599  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
 2602  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
 2603                    << MI.getOperand(0).getImm() << "\n");
 2605      invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
 2606  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
 2607  SG.initSchedGroup();
 2612      (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
 2613          const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
 
 2617IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
 2620  SchedGroupMask InvertedMask = ~Mask;
 2623  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
 2624    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
 2625                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
 2627  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
 2628           (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
 2629           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
 2630           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
 2631    InvertedMask &= ~SchedGroupMask::ALU;
 2634  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
 2635    InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
 2637  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
 2638           (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
 2639    InvertedMask &= ~SchedGroupMask::VMEM;
 2642  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
 2643    InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
 2645  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
 2646           (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
 2647    InvertedMask &= ~SchedGroupMask::DS;
 2649  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
 2652  return InvertedMask;
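A short worked example of the inversion above, assuming the bit layout from the SchedGroupMask enum earlier in this file:

// sched_barrier mask = VALU only (0x002): only VALU may cross the barrier.
//   InvertedMask = ~0x002   -> every category except VALU
//   VALU bit is now clear   -> the umbrella ALU bit is cleared as well, so a
//                              generic-ALU match cannot pull VALU
//                              instructions back into the group.
// The resulting SchedGroup collects everything that must NOT cross the
// barrier; addSchedBarrierEdges() then pins those nodes to the appropriate
// side of the SCHED_BARRIER with artificial edges.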
 
 2655void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
 2656    std::vector<SUnit>::reverse_iterator RIter) {
 2659  MachineInstr &SGB = *RIter->getInstr();
 2665  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
 2668  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
 
 2671bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
 2672  IGLPStrategyID StrategyID =
 2674  auto S = createIGLPStrategy(StrategyID, DAG, TII);
 2675  if (!S->shouldApplyStrategy(DAG, Phase))
 2678  IsBottomUp = S->IsBottomUp;
 2679  return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
 
 2689std::unique_ptr<ScheduleDAGMutation>
 2690createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
 2691  return std::make_unique<IGroupLPDAGMutation>(Phase);
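Typical use of this entry point, as a hedged sketch: the AMDGPU schedulers register the mutation on their ScheduleDAGMI before scheduling a region. The SchedulingPhase enumerator name below is an assumption for illustration:

// Sketch only; shows how a scheduler would hook the mutation in.
void addIGroupLPMutation(ScheduleDAGMI &DAG) {
  DAG.addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
}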
 
 