LLVM 20.0.0git
AMDGPUIGroupLP.cpp
Go to the documentation of this file.
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
20#include "SIInstrInfo.h"
23#include "llvm/ADT/DenseMap.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "igrouplp"
30
31namespace {
32
33static cl::opt<bool> EnableExactSolver(
34 "amdgpu-igrouplp-exact-solver", cl::Hidden,
35 cl::desc("Whether to use the exponential time solver to fit "
36 "the instructions to the pipeline as closely as "
37 "possible."),
38 cl::init(false));
39
40static cl::opt<unsigned> CutoffForExact(
41 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
42 cl::desc("The maximum number of scheduling group conflicts "
43 "which we attempt to solve with the exponential time "
44 "exact solver. Problem sizes greater than this will"
45 "be solved by the less accurate greedy algorithm. Selecting "
46 "solver by size is superseded by manually selecting "
47 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
48
49static cl::opt<uint64_t> MaxBranchesExplored(
50 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
51 cl::desc("The amount of branches that we are willing to explore with"
52 "the exact algorithm before giving up."));
53
54static cl::opt<bool> UseCostHeur(
55 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
56 cl::desc("Whether to use the cost heuristic to make choices as we "
57 "traverse the search space using the exact solver. Defaulted "
58 "to on, and if turned off, we will use the node order -- "
59 "attempting to put the later nodes in the later sched groups. "
60 "Experimentally, results are mixed, so this should be set on a "
61 "case-by-case basis."));
62
63// Components of the mask that determines which instruction types may be may be
64// classified into a SchedGroup.
65enum class SchedGroupMask {
66 NONE = 0u,
67 ALU = 1u << 0,
68 VALU = 1u << 1,
69 SALU = 1u << 2,
70 MFMA = 1u << 3,
71 VMEM = 1u << 4,
72 VMEM_READ = 1u << 5,
73 VMEM_WRITE = 1u << 6,
74 DS = 1u << 7,
75 DS_READ = 1u << 8,
76 DS_WRITE = 1u << 9,
77 TRANS = 1u << 10,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE | TRANS,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81};
82
83class SchedGroup;
84
85// InstructionRule class is used to enact a filter which determines whether or
86// not an SU maps to a given SchedGroup. It contains complementary data
87// structures (e.g Cache) to help those filters.
88class InstructionRule {
89protected:
90 const SIInstrInfo *TII;
91 unsigned SGID;
92 // A cache made available to the Filter to store SUnits for subsequent
93 // invocations of the Filter
94 std::optional<SmallVector<SUnit *, 4>> Cache;
95
96public:
97 virtual bool
98 apply(const SUnit *, const ArrayRef<SUnit *>,
100 return true;
101 };
102
103 InstructionRule(const SIInstrInfo *TII, unsigned SGID,
104 bool NeedsCache = false)
105 : TII(TII), SGID(SGID) {
106 if (NeedsCache) {
107 Cache = SmallVector<SUnit *, 4>();
108 }
109 }
110
111 virtual ~InstructionRule() = default;
112};
113
114using SUnitsToCandidateSGsMap = DenseMap<SUnit *, SmallVector<int, 4>>;
115
116// Classify instructions into groups to enable fine tuned control over the
117// scheduler. These groups may be more specific than current SchedModel
118// instruction classes.
119class SchedGroup {
120private:
121 // Mask that defines which instruction types can be classified into this
122 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
123 // and SCHED_GROUP_BARRIER.
124 SchedGroupMask SGMask;
125
126 // Maximum number of SUnits that can be added to this group.
127 std::optional<unsigned> MaxSize;
128
129 // SchedGroups will only synchronize with other SchedGroups that have the same
130 // SyncID.
131 int SyncID = 0;
132
133 // SGID is used to map instructions to candidate SchedGroups
134 unsigned SGID;
135
136 // The different rules each instruction in this SchedGroup must conform to
138
139 // Count of the number of created SchedGroups, used to initialize SGID.
140 static unsigned NumSchedGroups;
141
142 // Try to add and edge from SU A to SU B.
143 bool tryAddEdge(SUnit *A, SUnit *B);
144
145 // Use SGMask to determine whether we can classify MI as a member of this
146 // SchedGroup object.
147 bool canAddMI(const MachineInstr &MI) const;
148
149public:
150 // Collection of SUnits that are classified as members of this group.
151 SmallVector<SUnit *, 32> Collection;
152
154 const SIInstrInfo *TII;
155
156 // Returns true if SU can be added to this SchedGroup.
157 bool canAddSU(SUnit &SU) const;
158
159 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
160 // MakePred is true, SU will be a predecessor of the SUnits in this
161 // SchedGroup, otherwise SU will be a successor.
162 void link(SUnit &SU, bool MakePred = false);
163
164 // Add DAG dependencies and track which edges are added, and the count of
165 // missed edges
166 int link(SUnit &SU, bool MakePred,
167 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
168
169 // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
170 // Use the predicate to determine whether SU should be a predecessor (P =
171 // true) or a successor (P = false) of this SchedGroup.
172 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
173
174 // Add DAG dependencies such that SUnits in this group shall be ordered
175 // before SUnits in OtherGroup.
176 void link(SchedGroup &OtherGroup);
177
178 // Returns true if no more instructions may be added to this group.
179 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
180
181 // Append a constraint that SUs must meet in order to fit into this
182 // SchedGroup. Since many rules involve the relationship between a SchedGroup
183 // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
184 // time (rather than SchedGroup init time.)
185 void addRule(std::shared_ptr<InstructionRule> NewRule) {
186 Rules.push_back(NewRule);
187 }
188
189 // Returns true if the SU matches all rules
190 bool allowedByRules(const SUnit *SU,
191 SmallVectorImpl<SchedGroup> &SyncPipe) const {
192 for (auto &Rule : Rules) {
193 if (!Rule->apply(SU, Collection, SyncPipe))
194 return false;
195 }
196 return true;
197 }
198
199 // Add SU to the SchedGroup.
200 void add(SUnit &SU) {
201 LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
202 << format_hex((int)SGMask, 10, true) << " adding "
203 << *SU.getInstr());
204 Collection.push_back(&SU);
205 }
206
207 // Remove last element in the SchedGroup
208 void pop() { Collection.pop_back(); }
209
210 // Identify and add all relevant SUs from the DAG to this SchedGroup.
211 void initSchedGroup();
212
213 // Add instructions to the SchedGroup bottom up starting from RIter.
214 // PipelineInstrs is a set of instructions that should not be added to the
215 // SchedGroup even when the other conditions for adding it are satisfied.
216 // RIter will be added to the SchedGroup as well, and dependencies will be
217 // added so that RIter will always be scheduled at the end of the group.
218 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
219 SUnitsToCandidateSGsMap &SyncedInstrs);
220
221 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
222
223 int getSyncID() { return SyncID; }
224
225 int getSGID() { return SGID; }
226
227 SchedGroupMask getMask() { return SGMask; }
228
229 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
230 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
231 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
232 SGID = NumSchedGroups++;
233 }
234
235 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
236 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
237 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
238 SGID = NumSchedGroups++;
239 }
240};
241
242// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
243static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
244 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
245 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
246 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
247
248 while (!SU.Preds.empty())
249 for (auto &P : SU.Preds)
250 SU.removePred(P);
251
252 while (!SU.Succs.empty())
253 for (auto &S : SU.Succs)
254 for (auto &SP : S.getSUnit()->Preds)
255 if (SP.getSUnit() == &SU)
256 S.getSUnit()->removePred(SP);
257}
258
259using SUToCandSGsPair = std::pair<SUnit *, SmallVector<int, 4>>;
260using SUsToCandSGsVec = SmallVector<SUToCandSGsPair, 4>;
261
262// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
263// in non-trivial cases. For example, if the requested pipeline is
264// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
265// in the DAG, then we will have an instruction that can not be trivially
266// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
267// to find a good solution to the pipeline -- a greedy algorithm and an exact
268// algorithm. The exact algorithm has an exponential time complexity and should
269// only be used for small sized problems or medium sized problems where an exact
270// solution is highly desired.
271class PipelineSolver {
272 ScheduleDAGMI *DAG;
273
274 // Instructions that can be assigned to multiple SchedGroups
276 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
278 // The current working pipeline
280 // The pipeline that has the best solution found so far
282
283 // Whether or not we actually have any SyncedInstrs to try to solve.
284 bool NeedsSolver = false;
285
286 // Compute an estimate of the size of search tree -- the true size is
287 // the product of each conflictedInst.Matches.size() across all SyncPipelines
288 unsigned computeProblemSize();
289
290 // The cost penalty of not assigning a SU to a SchedGroup
291 int MissPenalty = 0;
292
293 // Costs in terms of the number of edges we are unable to add
294 int BestCost = -1;
295 int CurrCost = 0;
296
297 // Index pointing to the conflicting instruction that is currently being
298 // fitted
299 int CurrConflInstNo = 0;
300 // Index to the pipeline that is currently being fitted
301 int CurrSyncGroupIdx = 0;
302 // The first non trivial pipeline
303 int BeginSyncGroupIdx = 0;
304
305 // How many branches we have explored
306 uint64_t BranchesExplored = 0;
307
308 // The direction in which we process the candidate SchedGroups per SU
309 bool IsBottomUp = true;
310
311 // Update indices to fit next conflicting instruction
312 void advancePosition();
313 // Recede indices to attempt to find better fit for previous conflicting
314 // instruction
315 void retreatPosition();
316
317 // The exponential time algorithm which finds the provably best fit
318 bool solveExact();
319 // The polynomial time algorithm which attempts to find a good fit
320 bool solveGreedy();
321 // Find the best SchedGroup for the current SU using the heuristic given all
322 // current information. One step in the greedy algorithm. Templated against
323 // the SchedGroup iterator (either reverse or forward).
324 template <typename T>
325 void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
326 T E);
327 // Whether or not the current solution is optimal
328 bool checkOptimal();
329 // Populate the ready list, prioiritizing fewest missed edges first
330 // Templated against the SchedGroup iterator (either reverse or forward).
331 template <typename T>
332 void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
333 T E);
334 // Add edges corresponding to the SchedGroups as assigned by solver
335 void makePipeline();
336 // Link the SchedGroups in the best found pipeline.
337 // Tmplated against the SchedGroup iterator (either reverse or forward).
338 template <typename T> void linkSchedGroups(T I, T E);
339 // Add the edges from the SU to the other SchedGroups in pipeline, and
340 // return the number of edges missed.
341 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
342 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
343 /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
344 /// returns the cost (in terms of missed pipeline edges), and tracks the edges
345 /// added in \p AddedEdges
346 template <typename T>
347 int linkSUnit(SUnit *SU, int SGID,
348 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
349 /// Remove the edges passed via \p AddedEdges
350 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
351 // Convert the passed in maps to arrays for bidirectional iterators
352 void convertSyncMapsToArrays();
353
354 void reset();
355
356public:
357 // Invoke the solver to map instructions to instruction groups. Heuristic &&
358 // command-line-option determines to use exact or greedy algorithm.
359 void solve();
360
361 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
363 ScheduleDAGMI *DAG, bool IsBottomUp = true)
364 : DAG(DAG), SyncedInstrs(SyncedInstrs),
365 SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
366
367 for (auto &PipelineInstrs : SyncedInstrs) {
368 if (PipelineInstrs.second.size() > 0) {
369 NeedsSolver = true;
370 break;
371 }
372 }
373
374 if (!NeedsSolver)
375 return;
376
377 convertSyncMapsToArrays();
378
379 CurrPipeline = BestPipeline;
380
381 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
382 PipelineInstrs[BeginSyncGroupIdx].size() == 0)
383 ++BeginSyncGroupIdx;
384
385 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
386 return;
387 }
388};
389
390void PipelineSolver::reset() {
391
392 for (auto &SyncPipeline : CurrPipeline) {
393 for (auto &SG : SyncPipeline) {
394 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
395 SG.Collection.clear();
396 auto *SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
397 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
398 });
399 if (SchedBarr != TempCollection.end())
400 SG.Collection.push_back(*SchedBarr);
401 }
402 }
403
404 CurrSyncGroupIdx = BeginSyncGroupIdx;
405 CurrConflInstNo = 0;
406 CurrCost = 0;
407}
408
409void PipelineSolver::convertSyncMapsToArrays() {
410 for (auto &SyncPipe : SyncedSchedGroups) {
411 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
412 }
413
414 int PipelineIDx = SyncedInstrs.size() - 1;
415 PipelineInstrs.resize(SyncedInstrs.size());
416 for (auto &SyncInstrMap : SyncedInstrs) {
417 for (auto &SUsToCandSGs : SyncInstrMap.second) {
418 if (PipelineInstrs[PipelineIDx].size() == 0) {
419 PipelineInstrs[PipelineIDx].push_back(
420 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
421 continue;
422 }
423 auto *SortPosition = PipelineInstrs[PipelineIDx].begin();
424 // Insert them in sorted order -- this allows for good parsing order in
425 // the greedy algorithm
426 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
427 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
428 ++SortPosition;
429 PipelineInstrs[PipelineIDx].insert(
430 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
431 }
432 --PipelineIDx;
433 }
434}
435
436template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
437 for (; I != E; ++I) {
438 auto &GroupA = *I;
439 for (auto J = std::next(I); J != E; ++J) {
440 auto &GroupB = *J;
441 GroupA.link(GroupB);
442 }
443 }
444}
445
446void PipelineSolver::makePipeline() {
447 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
448 for (auto &SyncPipeline : BestPipeline) {
449 LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
450 for (auto &SG : SyncPipeline) {
451 LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
452 << " has: \n");
453 SUnit *SGBarr = nullptr;
454 for (auto &SU : SG.Collection) {
455 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
456 SGBarr = SU;
457 LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
458 }
459 // Command line requested IGroupLP doesn't have SGBarr
460 if (!SGBarr)
461 continue;
462 resetEdges(*SGBarr, DAG);
463 SG.link(*SGBarr, false);
464 }
465 }
466
467 for (auto &SyncPipeline : BestPipeline) {
468 IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
469 : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
470 }
471}
472
473template <typename T>
474int PipelineSolver::linkSUnit(
475 SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
476 T I, T E) {
477 bool MakePred = false;
478 int AddedCost = 0;
479 for (; I < E; ++I) {
480 if (I->getSGID() == SGID) {
481 MakePred = true;
482 continue;
483 }
484 auto Group = *I;
485 AddedCost += Group.link(*SU, MakePred, AddedEdges);
486 assert(AddedCost >= 0);
487 }
488 return AddedCost;
489}
490
491int PipelineSolver::addEdges(
492 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
493 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
494
495 // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
496 // instructions that are the ultimate successors in the resultant mutation.
497 // Therefore, in such a configuration, the SchedGroups occurring before the
498 // candidate SGID are successors of the candidate SchedGroup, thus the current
499 // SU should be linked as a predecessor to SUs in those SchedGroups. The
500 // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
501 // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
502 // IsBottomUp (in reverse).
503 return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
504 SyncPipeline.rend())
505 : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
506 SyncPipeline.end());
507}
508
509void PipelineSolver::removeEdges(
510 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
511 // Only remove the edges that we have added when testing
512 // the fit.
513 for (auto &PredSuccPair : EdgesToRemove) {
514 SUnit *Pred = PredSuccPair.first;
515 SUnit *Succ = PredSuccPair.second;
516
517 auto *Match = llvm::find_if(
518 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
519 if (Match != Succ->Preds.end()) {
520 assert(Match->isArtificial());
521 Succ->removePred(*Match);
522 }
523 }
524}
525
526void PipelineSolver::advancePosition() {
527 ++CurrConflInstNo;
528
529 if (static_cast<size_t>(CurrConflInstNo) >=
530 PipelineInstrs[CurrSyncGroupIdx].size()) {
531 CurrConflInstNo = 0;
532 ++CurrSyncGroupIdx;
533 // Advance to next non-trivial pipeline
534 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
535 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
536 ++CurrSyncGroupIdx;
537 }
538}
539
540void PipelineSolver::retreatPosition() {
541 assert(CurrConflInstNo >= 0);
542 assert(CurrSyncGroupIdx >= 0);
543
544 if (CurrConflInstNo > 0) {
545 --CurrConflInstNo;
546 return;
547 }
548
549 if (CurrConflInstNo == 0) {
550 // If we return to the starting position, we have explored
551 // the entire tree
552 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
553 return;
554
555 --CurrSyncGroupIdx;
556 // Go to previous non-trivial pipeline
557 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
558 --CurrSyncGroupIdx;
559
560 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
561 }
562}
563
564bool PipelineSolver::checkOptimal() {
565 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
566 if (BestCost == -1 || CurrCost < BestCost) {
567 BestPipeline = CurrPipeline;
568 BestCost = CurrCost;
569 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
570 }
571 assert(BestCost >= 0);
572 }
573
574 bool DoneExploring = false;
575 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
576 DoneExploring = true;
577
578 return (DoneExploring || BestCost == 0);
579}
580
581template <typename T>
582void PipelineSolver::populateReadyList(
583 SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
584 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
585 auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
586 assert(CurrSU.second.size() >= 1);
587
588 for (; I != E; ++I) {
589 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
590 int CandSGID = *I;
591 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
592 return SG.getSGID() == CandSGID;
593 });
594 assert(Match);
595
596 if (UseCostHeur) {
597 if (Match->isFull()) {
598 ReadyList.push_back(std::pair(*I, MissPenalty));
599 continue;
600 }
601
602 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
603 ReadyList.push_back(std::pair(*I, TempCost));
604 removeEdges(AddedEdges);
605 } else
606 ReadyList.push_back(std::pair(*I, -1));
607 }
608
609 if (UseCostHeur) {
610 std::sort(ReadyList.begin(), ReadyList.end(),
611 [](std::pair<int, int> A, std::pair<int, int> B) {
612 return A.second < B.second;
613 });
614 }
615
616 assert(ReadyList.size() == CurrSU.second.size());
617}
618
619bool PipelineSolver::solveExact() {
620 if (checkOptimal())
621 return true;
622
623 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
624 return false;
625
626 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
627 assert(static_cast<size_t>(CurrConflInstNo) <
628 PipelineInstrs[CurrSyncGroupIdx].size());
629 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
630 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
631 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
632
633 // SchedGroup -> Cost pairs
635 // Prioritize the candidate sched groups in terms of lowest cost first
636 IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
637 CurrSU.second.rend())
638 : populateReadyList(ReadyList, CurrSU.second.begin(),
639 CurrSU.second.end());
640
641 auto *I = ReadyList.begin();
642 auto *E = ReadyList.end();
643 for (; I != E; ++I) {
644 // If we are trying SGs in least cost order, and the current SG is cost
645 // infeasible, then all subsequent SGs will also be cost infeasible, so we
646 // can prune.
647 if (BestCost != -1 && (CurrCost + I->second > BestCost))
648 return false;
649
650 int CandSGID = I->first;
651 int AddedCost = 0;
652 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
653 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
654 SchedGroup *Match;
655 for (auto &SG : SyncPipeline) {
656 if (SG.getSGID() == CandSGID)
657 Match = &SG;
658 }
659
660 if (Match->isFull())
661 continue;
662
663 if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
664 continue;
665
666 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
667 << (int)Match->getMask() << "and ID " << CandSGID
668 << "\n");
669 Match->add(*CurrSU.first);
670 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
671 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
672 CurrCost += AddedCost;
673 advancePosition();
674 ++BranchesExplored;
675 bool FinishedExploring = false;
676 // If the Cost after adding edges is greater than a known solution,
677 // backtrack
678 if (CurrCost < BestCost || BestCost == -1) {
679 if (solveExact()) {
680 FinishedExploring = BestCost != 0;
681 if (!FinishedExploring)
682 return true;
683 }
684 }
685
686 retreatPosition();
687 CurrCost -= AddedCost;
688 removeEdges(AddedEdges);
689 Match->pop();
690 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
691 if (FinishedExploring)
692 return true;
693 }
694
695 // Try the pipeline where the current instruction is omitted
696 // Potentially if we omit a problematic instruction from the pipeline,
697 // all the other instructions can nicely fit.
698 CurrCost += MissPenalty;
699 advancePosition();
700
701 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
702
703 bool FinishedExploring = false;
704 if (CurrCost < BestCost || BestCost == -1) {
705 if (solveExact()) {
706 bool FinishedExploring = BestCost != 0;
707 if (!FinishedExploring)
708 return true;
709 }
710 }
711
712 retreatPosition();
713 CurrCost -= MissPenalty;
714 return FinishedExploring;
715}
716
717template <typename T>
718void PipelineSolver::greedyFind(
719 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
720 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
721 int BestNodeCost = -1;
722 int TempCost;
723 SchedGroup *BestGroup = nullptr;
724 int BestGroupID = -1;
725 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
726 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
727 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
728
729 // Since we have added the potential SchedGroups from bottom up, but
730 // traversed the DAG from top down, parse over the groups from last to
731 // first. If we fail to do this for the greedy algorithm, the solution will
732 // likely not be good in more complex cases.
733 for (; I != E; ++I) {
734 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
735 int CandSGID = *I;
736 SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
737 return SG.getSGID() == CandSGID;
738 });
739 assert(Match);
740
741 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
742 << (int)Match->getMask() << "\n");
743
744 if (Match->isFull()) {
745 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
746 continue;
747 }
748 if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
749 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
750 continue;
751 }
752 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
753 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
754 if (TempCost < BestNodeCost || BestNodeCost == -1) {
755 BestGroup = Match;
756 BestNodeCost = TempCost;
757 BestGroupID = CandSGID;
758 }
759 removeEdges(AddedEdges);
760 if (BestNodeCost == 0)
761 break;
762 }
763
764 if (BestGroupID != -1) {
765 BestGroup->add(*CurrSU.first);
766 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
767 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
768 << (int)BestGroup->getMask() << "\n");
769 BestCost += TempCost;
770 } else
771 BestCost += MissPenalty;
772
773 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
774}
775
776bool PipelineSolver::solveGreedy() {
777 BestCost = 0;
778 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
779
780 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
781 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
782 IsBottomUp
783 ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
784 : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
785 advancePosition();
786 }
787 BestPipeline = CurrPipeline;
788 removeEdges(AddedEdges);
789 return false;
790}
791
792unsigned PipelineSolver::computeProblemSize() {
793 unsigned ProblemSize = 0;
794 for (auto &PipeConflicts : PipelineInstrs) {
795 ProblemSize += PipeConflicts.size();
796 }
797
798 return ProblemSize;
799}
800
801void PipelineSolver::solve() {
802 if (!NeedsSolver)
803 return;
804
805 unsigned ProblemSize = computeProblemSize();
806 assert(ProblemSize > 0);
807
808 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
809 MissPenalty = (ProblemSize / 2) + 1;
810
811 LLVM_DEBUG(DAG->dump());
812 if (EnableExactSolver || BelowCutoff) {
813 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
814 solveGreedy();
815 reset();
816 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
817 if (BestCost > 0) {
818 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
819 solveExact();
820 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
821 }
822 } else { // Use the Greedy Algorithm by default
823 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
824 solveGreedy();
825 }
826
827 makePipeline();
828 LLVM_DEBUG(dbgs() << "After applying mutation\n");
829 LLVM_DEBUG(DAG->dump());
830}
831
832enum IGLPStrategyID : int {
833 MFMASmallGemmOptID = 0,
834 MFMASmallGemmSingleWaveOptID = 1,
835 MFMAExpInterleaveID = 2,
836 MFMAExpSimpleInterleaveID = 3
837};
838
839// Implement a IGLP scheduling strategy.
840class IGLPStrategy {
841protected:
843
844 const SIInstrInfo *TII;
845
846public:
847 /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
848 virtual bool applyIGLPStrategy(
850 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
852
853 // Returns true if this strategy should be applied to a ScheduleDAG.
854 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
856
857 bool IsBottomUp = true;
858
859 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
860 : DAG(DAG), TII(TII) {}
861
862 virtual ~IGLPStrategy() = default;
863};
864
865class MFMASmallGemmOpt final : public IGLPStrategy {
866private:
867public:
868 bool applyIGLPStrategy(
870 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
872
873 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
875 return true;
876 }
877
878 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
879 : IGLPStrategy(DAG, TII) {
880 IsBottomUp = true;
881 }
882};
883
884bool MFMASmallGemmOpt::applyIGLPStrategy(
886 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
888 // Count the number of MFMA instructions.
889 unsigned MFMACount = 0;
890 for (const MachineInstr &I : *DAG)
891 if (TII->isMFMAorWMMA(I))
892 ++MFMACount;
893
894 const unsigned PipelineSyncID = 0;
895 SchedGroup *SG = nullptr;
896 for (unsigned I = 0; I < MFMACount * 3; ++I) {
897 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
898 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
899 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
900
901 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
902 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
903 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
904 }
905
906 return true;
907}
908
909class MFMAExpInterleaveOpt final : public IGLPStrategy {
910private:
911 // The count of TRANS SUs involved in the interleaved pipeline
912 static unsigned TransPipeCount;
913 // The count of MFMA SUs involved in the interleaved pipeline
914 static unsigned MFMAPipeCount;
915 // The count of Add SUs involved in the interleaved pipeline
916 static unsigned AddPipeCount;
917 // The number of transitive MFMA successors for each TRANS SU
918 static unsigned MFMAEnablement;
919 // The number of transitive TRANS predecessors for each MFMA SU
920 static unsigned ExpRequirement;
921 // The count of independent "chains" of MFMA instructions in the pipeline
922 static unsigned MFMAChains;
923 // The length of each independent "chain" of MFMA instructions
924 static unsigned MFMAChainLength;
925 // Whether or not the pipeline has V_CVT instructions
926 static bool HasCvt;
927 // Whether or not there are instructions between the TRANS instruction and
928 // V_CVT
929 static bool HasChainBetweenCvt;
930 // The first occuring DS_READ which feeds an MFMA chain
931 static std::optional<unsigned> FirstPipeDSR;
932 // The MFMAPipe SUs with no MFMA predecessors
933 SmallVector<SUnit *, 4> MFMAChainSeeds;
934 // Compute the heuristics for the pipeline, returning whether or not the DAG
935 // is well formatted for the mutation
936 bool analyzeDAG(const SIInstrInfo *TII);
937
938 /// Whether or not the instruction is a transitive predecessor of an MFMA
939 /// instruction
940 class IsPipeExp final : public InstructionRule {
941 public:
942 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
943 SmallVectorImpl<SchedGroup> &SyncPipe) override {
944
945 auto *DAG = SyncPipe[0].DAG;
946
947 if (Cache->empty()) {
948 auto I = DAG->SUnits.rbegin();
949 auto E = DAG->SUnits.rend();
950 for (; I != E; I++) {
951 if (TII->isMFMAorWMMA(*I->getInstr()))
952 Cache->push_back(&*I);
953 }
954 if (Cache->empty())
955 return false;
956 }
957
958 auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
959 return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
960 });
961
962 return Reaches;
963 }
964 IsPipeExp(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
965 : InstructionRule(TII, SGID, NeedsCache) {}
966 };
967
968 /// Whether or not the instruction is a transitive predecessor of the
969 /// \p Number th MFMA of the MFMAs occuring after a TRANS instruction
970 class EnablesNthMFMA final : public InstructionRule {
971 private:
972 unsigned Number = 1;
973
974 public:
975 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
976 SmallVectorImpl<SchedGroup> &SyncPipe) override {
977 bool FoundTrans = false;
978 unsigned Counter = 1;
979 auto *DAG = SyncPipe[0].DAG;
980
981 if (Cache->empty()) {
983
984 auto I = DAG->SUnits.begin();
985 auto E = DAG->SUnits.end();
986 for (; I != E; I++) {
987 if (FoundTrans && TII->isMFMAorWMMA(*I->getInstr())) {
988 if (Counter == Number) {
989 Cache->push_back(&*I);
990 break;
991 }
992 ++Counter;
993 }
994 if (!FoundTrans && TII->isTRANS(I->getInstr()->getOpcode()))
995 FoundTrans = true;
996 }
997 if (Cache->empty())
998 return false;
999 }
1000
1001 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1002 }
1003
1004 EnablesNthMFMA(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1005 bool NeedsCache = false)
1006 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1007 };
1008
1009 /// Whether or not the instruction enables the exact MFMA that is the \p
1010 /// Number th MFMA in the chain starting with \p ChainSeed
1011 class EnablesNthMFMAInChain final : public InstructionRule {
1012 private:
1013 unsigned Number = 1;
1014 SUnit *ChainSeed;
1015
1016 public:
1017 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1018 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1019 auto *DAG = SyncPipe[0].DAG;
1020
1021 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1022 return false;
1023
1024 if (Cache->empty()) {
1025 auto *TempSU = ChainSeed;
1026 auto Depth = Number;
1027 while (Depth > 0) {
1028 --Depth;
1029 bool Found = false;
1030 for (auto &Succ : TempSU->Succs) {
1031 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1032 TempSU = Succ.getSUnit();
1033 Found = true;
1034 break;
1035 }
1036 }
1037 if (!Found)
1038 return false;
1039 }
1040
1041 Cache->push_back(TempSU);
1042 }
1043 // If we failed to find the instruction to be placed into the cache, we
1044 // would have already exited.
1045 assert(!Cache->empty());
1046
1047 return DAG->IsReachable((*Cache)[0], const_cast<SUnit *>(SU));
1048 }
1049
1050 EnablesNthMFMAInChain(unsigned Number, SUnit *ChainSeed,
1051 const SIInstrInfo *TII, unsigned SGID,
1052 bool NeedsCache = false)
1053 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1054 ChainSeed(ChainSeed) {}
1055 };
1056
1057 /// Whether or not the instruction has less than \p Size immediate successors.
1058 /// If \p HasIntermediary is true, this tests also whether all successors of
1059 /// the SUnit have less than \p Size successors.
1060 class LessThanNSuccs final : public InstructionRule {
1061 private:
1062 unsigned Size = 1;
1063 bool HasIntermediary = false;
1064
1065 public:
1066 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1067 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1068 if (!SyncPipe.size())
1069 return false;
1070
1071 auto SuccSize = std::count_if(
1072 SU->Succs.begin(), SU->Succs.end(),
1073 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1074 if (SuccSize >= Size)
1075 return false;
1076
1077 if (HasIntermediary) {
1078 for (auto Succ : SU->Succs) {
1079 auto SuccSize = std::count_if(
1080 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1081 [](const SDep &SuccSucc) {
1082 return SuccSucc.getKind() == SDep::Data;
1083 });
1084 if (SuccSize >= Size)
1085 return false;
1086 }
1087 }
1088
1089 return true;
1090 }
1091 LessThanNSuccs(unsigned Size, const SIInstrInfo *TII, unsigned SGID,
1092 bool HasIntermediary = false, bool NeedsCache = false)
1093 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1094 HasIntermediary(HasIntermediary) {}
1095 };
1096
1097 /// Whether or not the instruction has greater than or equal to \p Size
1098 /// immediate successors. If \p HasIntermediary is true, this tests also
1099 /// whether all successors of the SUnit have greater than or equal to \p Size
1100 /// successors.
1101 class GreaterThanOrEqualToNSuccs final : public InstructionRule {
1102 private:
1103 unsigned Size = 1;
1104 bool HasIntermediary = false;
1105
1106 public:
1107 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1108 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1109 if (!SyncPipe.size())
1110 return false;
1111
1112 auto SuccSize = std::count_if(
1113 SU->Succs.begin(), SU->Succs.end(),
1114 [](const SDep &Succ) { return Succ.getKind() == SDep::Data; });
1115 if (SuccSize >= Size)
1116 return true;
1117
1118 if (HasIntermediary) {
1119 for (auto Succ : SU->Succs) {
1120 auto SuccSize = std::count_if(
1121 Succ.getSUnit()->Succs.begin(), Succ.getSUnit()->Succs.end(),
1122 [](const SDep &SuccSucc) {
1123 return SuccSucc.getKind() == SDep::Data;
1124 });
1125 if (SuccSize >= Size)
1126 return true;
1127 }
1128 }
1129
1130 return false;
1131 }
1132 GreaterThanOrEqualToNSuccs(unsigned Size, const SIInstrInfo *TII,
1133 unsigned SGID, bool HasIntermediary = false,
1134 bool NeedsCache = false)
1135 : InstructionRule(TII, SGID, NeedsCache), Size(Size),
1136 HasIntermediary(HasIntermediary) {}
1137 };
1138
1139 // Whether or not the instruction is a relevant V_CVT instruction.
1140 class IsCvt final : public InstructionRule {
1141 public:
1142 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1143 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1144 auto Opc = SU->getInstr()->getOpcode();
1145 return Opc == AMDGPU::V_CVT_F16_F32_e32 ||
1146 Opc == AMDGPU::V_CVT_I32_F32_e32;
1147 }
1148 IsCvt(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1149 : InstructionRule(TII, SGID, NeedsCache) {}
1150 };
1151
1152 // Whether or not the instruction is FMA_F32.
1153 class IsFMA final : public InstructionRule {
1154 public:
1155 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1156 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1157 return SU->getInstr()->getOpcode() == AMDGPU::V_FMA_F32_e64 ||
1158 SU->getInstr()->getOpcode() == AMDGPU::V_PK_FMA_F32;
1159 }
1160 IsFMA(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1161 : InstructionRule(TII, SGID, NeedsCache) {}
1162 };
1163
1164 // Whether or not the instruction is a V_ADD_F32 instruction.
1165 class IsPipeAdd final : public InstructionRule {
1166 public:
1167 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1168 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1169 return SU->getInstr()->getOpcode() == AMDGPU::V_ADD_F32_e32;
1170 }
1171 IsPipeAdd(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1172 : InstructionRule(TII, SGID, NeedsCache) {}
1173 };
1174
1175 /// Whether or not the instruction is an immediate RAW successor
1176 /// of the SchedGroup \p Distance steps before.
1177 class IsSuccOfPrevNthGroup final : public InstructionRule {
1178 private:
1179 unsigned Distance = 1;
1180
1181 public:
1182 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1183 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1184 SchedGroup *OtherGroup = nullptr;
1185 if (!SyncPipe.size())
1186 return false;
1187
1188 for (auto &PipeSG : SyncPipe) {
1189 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1190 OtherGroup = &PipeSG;
1191 }
1192
1193 if (!OtherGroup)
1194 return false;
1195 if (!OtherGroup->Collection.size())
1196 return true;
1197
1198 for (auto &OtherEle : OtherGroup->Collection) {
1199 for (auto &Succ : OtherEle->Succs) {
1200 if (Succ.getSUnit() == SU && Succ.getKind() == SDep::Data)
1201 return true;
1202 }
1203 }
1204
1205 return false;
1206 }
1207 IsSuccOfPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1208 unsigned SGID, bool NeedsCache = false)
1209 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1210 };
1211
1212 /// Whether or not the instruction is a transitive successor of any
1213 /// instruction the the SchedGroup \p Distance steps before.
1214 class IsReachableFromPrevNthGroup final : public InstructionRule {
1215 private:
1216 unsigned Distance = 1;
1217
1218 public:
1219 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1220 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1221 SchedGroup *OtherGroup = nullptr;
1222 if (!SyncPipe.size())
1223 return false;
1224
1225 for (auto &PipeSG : SyncPipe) {
1226 if ((unsigned)PipeSG.getSGID() == SGID - Distance)
1227 OtherGroup = &PipeSG;
1228 }
1229
1230 if (!OtherGroup)
1231 return false;
1232 if (!OtherGroup->Collection.size())
1233 return true;
1234
1235 auto *DAG = SyncPipe[0].DAG;
1236
1237 for (auto &OtherEle : OtherGroup->Collection)
1238 if (DAG->IsReachable(const_cast<SUnit *>(SU), OtherEle))
1239 return true;
1240
1241 return false;
1242 }
1243 IsReachableFromPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
1244 unsigned SGID, bool NeedsCache = false)
1245 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
1246 };
1247
1248 /// Whether or not the instruction occurs after the SU with NodeNUm \p Number
1249 class OccursAtOrAfterNode final : public InstructionRule {
1250 private:
1251 unsigned Number = 1;
1252
1253 public:
1254 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1255 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1256
1257 return SU->NodeNum >= Number;
1258 }
1259 OccursAtOrAfterNode(unsigned Number, const SIInstrInfo *TII, unsigned SGID,
1260 bool NeedsCache = false)
1261 : InstructionRule(TII, SGID, NeedsCache), Number(Number) {}
1262 };
1263
1264 /// Whether or not the SU is exactly the \p Number th MFMA in the chain
1265 /// starting with \p ChainSeed
1266 class IsExactMFMA final : public InstructionRule {
1267 private:
1268 unsigned Number = 1;
1269 SUnit *ChainSeed;
1270
1271 public:
1272 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1273 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1274 if (!SU || !TII->isMFMAorWMMA(*ChainSeed->getInstr()))
1275 return false;
1276
1277 if (Cache->empty()) {
1278 auto *TempSU = ChainSeed;
1279 auto Depth = Number;
1280 while (Depth > 0) {
1281 --Depth;
1282 bool Found = false;
1283 for (auto &Succ : TempSU->Succs) {
1284 if (TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr())) {
1285 TempSU = Succ.getSUnit();
1286 Found = true;
1287 break;
1288 }
1289 }
1290 if (!Found) {
1291 return false;
1292 }
1293 }
1294 Cache->push_back(TempSU);
1295 }
1296 // If we failed to find the instruction to be placed into the cache, we
1297 // would have already exited.
1298 assert(!Cache->empty());
1299
1300 return (*Cache)[0] == SU;
1301 }
1302
1303 IsExactMFMA(unsigned Number, SUnit *ChainSeed, const SIInstrInfo *TII,
1304 unsigned SGID, bool NeedsCache = false)
1305 : InstructionRule(TII, SGID, NeedsCache), Number(Number),
1306 ChainSeed(ChainSeed) {}
1307 };
1308
1309 // Whether the instruction occurs after the first TRANS instruction. This
1310 // implies the instruction can not be a predecessor of the first TRANS
1311 // insruction
1312 class OccursAfterExp final : public InstructionRule {
1313 public:
1314 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1315 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1316
1317 SmallVector<SUnit *, 12> Worklist;
1318 auto *DAG = SyncPipe[0].DAG;
1319 if (Cache->empty()) {
1320 for (auto &SU : DAG->SUnits)
1321 if (TII->isTRANS(SU.getInstr()->getOpcode())) {
1322 Cache->push_back(&SU);
1323 break;
1324 }
1325 if (Cache->empty())
1326 return false;
1327 }
1328
1329 return SU->NodeNum > (*Cache)[0]->NodeNum;
1330 }
1331
1332 OccursAfterExp(const SIInstrInfo *TII, unsigned SGID,
1333 bool NeedsCache = false)
1334 : InstructionRule(TII, SGID, NeedsCache) {}
1335 };
1336
1337public:
1338 bool applyIGLPStrategy(
1340 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1342
1343 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1345
1346 MFMAExpInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1347 : IGLPStrategy(DAG, TII) {
1348 IsBottomUp = false;
1349 }
1350};
1351
1352unsigned MFMAExpInterleaveOpt::TransPipeCount = 0;
1353unsigned MFMAExpInterleaveOpt::MFMAPipeCount = 0;
1354unsigned MFMAExpInterleaveOpt::AddPipeCount = 0;
1355unsigned MFMAExpInterleaveOpt::MFMAEnablement = 0;
1356unsigned MFMAExpInterleaveOpt::ExpRequirement = 0;
1357unsigned MFMAExpInterleaveOpt::MFMAChains = 0;
1358unsigned MFMAExpInterleaveOpt::MFMAChainLength = 0;
1359bool MFMAExpInterleaveOpt::HasCvt = false;
1360bool MFMAExpInterleaveOpt::HasChainBetweenCvt = false;
1361std::optional<unsigned> MFMAExpInterleaveOpt::FirstPipeDSR = std::nullopt;
1362
1363bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
1364 SmallVector<SUnit *, 10> ExpPipeCands;
1365 SmallVector<SUnit *, 10> MFMAPipeCands;
1366 SmallVector<SUnit *, 10> MFMAPipeSUs;
1369
1370 auto isBitPack = [](unsigned Opc) {
1371 return Opc == AMDGPU::V_PACK_B32_F16_e64 || Opc == AMDGPU::V_PERM_B32_e64;
1372 };
1373
1374 auto isCvt = [](unsigned Opc) {
1375 return Opc == AMDGPU::V_CVT_F16_F32_e32 || Opc == AMDGPU::V_CVT_I32_F32_e32;
1376 };
1377
1378 auto isAdd = [](unsigned Opc) { return Opc == AMDGPU::V_ADD_F32_e32; };
1379
1380 AddPipeCount = 0;
1381 for (SUnit &SU : DAG->SUnits) {
1382 auto Opc = SU.getInstr()->getOpcode();
1383 if (TII->isTRANS(Opc)) {
1384 // Avoid counting a potential bonus V_EXP which all the MFMA depend on
1385 if (SU.Succs.size() >= 7)
1386 continue;
1387 for (auto &Succ : SU.Succs) {
1388 if (Succ.getSUnit()->Succs.size() >= 7)
1389 continue;
1390 }
1391 ExpPipeCands.push_back(&SU);
1392 }
1393
1394 if (TII->isMFMAorWMMA(*SU.getInstr()))
1395 MFMAPipeCands.push_back(&SU);
1396
1397 if (isBitPack(Opc))
1398 PackSUs.push_back(&SU);
1399
1400 if (isCvt(Opc))
1401 CvtSUs.push_back(&SU);
1402
1403 if (isAdd(Opc))
1404 ++AddPipeCount;
1405 }
1406
1407 if (!(PackSUs.size() && MFMAPipeCands.size() && ExpPipeCands.size()))
1408 return false;
1409
1410 TransPipeCount = 0;
1411
1412 std::optional<SUnit *> TempMFMA;
1413 std::optional<SUnit *> TempExp;
1414 // Count the number of EXPs that reach an MFMA
1415 for (auto &PredSU : ExpPipeCands) {
1416 for (auto &SuccSU : MFMAPipeCands) {
1417 if (DAG->IsReachable(SuccSU, PredSU)) {
1418 if (!TempExp) {
1419 TempExp = PredSU;
1420 TempMFMA = SuccSU;
1421 }
1422 MFMAPipeSUs.push_back(SuccSU);
1423 ++TransPipeCount;
1424 break;
1425 }
1426 }
1427 }
1428
1429 if (!(TempExp && TempMFMA))
1430 return false;
1431
1432 HasChainBetweenCvt = none_of((*TempExp)->Succs, [&isCvt](SDep &Succ) {
1433 return isCvt(Succ.getSUnit()->getInstr()->getOpcode());
1434 });
1435
1436 // Count the number of MFMAs that are reached by an EXP
1437 for (auto &SuccSU : MFMAPipeCands) {
1438 if (MFMAPipeSUs.size() &&
1439 any_of(MFMAPipeSUs, [&SuccSU](SUnit *PotentialMatch) {
1440 return PotentialMatch->NodeNum == SuccSU->NodeNum;
1441 }))
1442 continue;
1443
1444 for (auto &PredSU : ExpPipeCands) {
1445 if (DAG->IsReachable(SuccSU, PredSU)) {
1446 MFMAPipeSUs.push_back(SuccSU);
1447 break;
1448 }
1449 }
1450 }
1451
1452 MFMAPipeCount = MFMAPipeSUs.size();
1453
1454 assert(TempExp && TempMFMA);
1455 assert(MFMAPipeCount > 0);
1456
1457 std::optional<SUnit *> TempCvt;
1458 for (auto &SuccSU : CvtSUs) {
1459 if (DAG->IsReachable(SuccSU, *TempExp)) {
1460 TempCvt = SuccSU;
1461 break;
1462 }
1463 }
1464
1465 HasCvt = false;
1466 if (TempCvt.has_value()) {
1467 for (auto &SuccSU : MFMAPipeSUs) {
1468 if (DAG->IsReachable(SuccSU, *TempCvt)) {
1469 HasCvt = true;
1470 break;
1471 }
1472 }
1473 }
1474
1475 MFMAChains = 0;
1476 for (auto &MFMAPipeSU : MFMAPipeSUs) {
1477 if (is_contained(MFMAChainSeeds, MFMAPipeSU))
1478 continue;
1479 if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
1480 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1481 })) {
1482 MFMAChainSeeds.push_back(MFMAPipeSU);
1483 ++MFMAChains;
1484 }
1485 }
1486
1487 if (!MFMAChains)
1488 return false;
1489
1490 for (auto Pred : MFMAChainSeeds[0]->Preds) {
1491 if (TII->isDS(Pred.getSUnit()->getInstr()->getOpcode()) &&
1492 Pred.getSUnit()->getInstr()->mayLoad())
1493 FirstPipeDSR = Pred.getSUnit()->NodeNum;
1494 }
1495
1496 MFMAChainLength = MFMAPipeCount / MFMAChains;
1497
1498 // The number of bit pack operations that depend on a single V_EXP
1499 unsigned PackSuccCount = std::count_if(
1500 PackSUs.begin(), PackSUs.end(), [this, &TempExp](SUnit *VPack) {
1501 return DAG->IsReachable(VPack, *TempExp);
1502 });
1503
1504 // The number of bit pack operations an MFMA depends on
1505 unsigned PackPredCount =
1506 std::count_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1507 [&isBitPack](SDep &Pred) {
1508 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1509 return isBitPack(Opc);
1510 });
1511
1512 auto *PackPred =
1513 std::find_if((*TempMFMA)->Preds.begin(), (*TempMFMA)->Preds.end(),
1514 [&isBitPack](SDep &Pred) {
1515 auto Opc = Pred.getSUnit()->getInstr()->getOpcode();
1516 return isBitPack(Opc);
1517 });
1518
1519 if (PackPred == (*TempMFMA)->Preds.end())
1520 return false;
1521
1522 MFMAEnablement = 0;
1523 ExpRequirement = 0;
1524 // How many MFMAs depend on a single bit pack operation
1525 MFMAEnablement =
1526 std::count_if(PackPred->getSUnit()->Succs.begin(),
1527 PackPred->getSUnit()->Succs.end(), [&TII](SDep &Succ) {
1528 return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
1529 });
1530
1531 // The number of MFMAs that depend on a single V_EXP
1532 MFMAEnablement *= PackSuccCount;
1533
1534 // The number of V_EXPs required to resolve all dependencies for an MFMA
1535 ExpRequirement =
1536 std::count_if(ExpPipeCands.begin(), ExpPipeCands.end(),
1537 [this, &PackPred](SUnit *ExpBase) {
1538 return DAG->IsReachable(PackPred->getSUnit(), ExpBase);
1539 });
1540
1541 ExpRequirement *= PackPredCount;
1542 return true;
1543}
1544
1545bool MFMAExpInterleaveOpt::shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1547 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1548 const SIInstrInfo *TII = ST.getInstrInfo();
1549
1550 if (Phase != AMDGPU::SchedulingPhase::PostRA)
1551 MFMAChainSeeds.clear();
1552 if (Phase != AMDGPU::SchedulingPhase::PostRA && !analyzeDAG(TII))
1553 return false;
1554
1555 return true;
1556}
1557
1558bool MFMAExpInterleaveOpt::applyIGLPStrategy(
1560 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1562
1563 bool IsSmallKernelType =
1564 MFMAEnablement == 2 && ExpRequirement == 4 && TransPipeCount == 32;
1565 bool IsLargeKernelType =
1566 MFMAEnablement == 4 && ExpRequirement == 4 && TransPipeCount == 64;
1567
1568 if (!(IsSmallKernelType || IsLargeKernelType))
1569 return false;
1570
1571 const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
1572 const SIInstrInfo *TII = ST.getInstrInfo();
1573
1574 unsigned PipelineSyncID = 0;
1575 SchedGroup *SG = nullptr;
1576
1577 unsigned MFMAChain = 0;
1578 unsigned PositionInChain = 0;
1579 unsigned CurrMFMAForTransPosition = 0;
1580
1581 auto incrementTransPosition = [&MFMAChain, &PositionInChain,
1582 &CurrMFMAForTransPosition]() {
1583 CurrMFMAForTransPosition += MFMAEnablement;
1584 PositionInChain = (CurrMFMAForTransPosition / MFMAChains);
1585 MFMAChain = CurrMFMAForTransPosition % MFMAChains;
1586 };
1587
1588 auto getNextTransPositionInChain = [&CurrMFMAForTransPosition]() {
1589 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1590 return (TempMFMAForTrans / MFMAChains);
1591 };
1592
1593 auto getNextTransMFMAChain = [&CurrMFMAForTransPosition]() {
1594 auto TempMFMAForTrans = CurrMFMAForTransPosition + MFMAEnablement;
1595 return TempMFMAForTrans % MFMAChains;
1596 };
1597
1598 unsigned CurrMFMAPosition = 0;
1599 unsigned MFMAChainForMFMA = 0;
1600 unsigned PositionInChainForMFMA = 0;
1601
1602 auto incrementMFMAPosition = [&CurrMFMAPosition, &MFMAChainForMFMA,
1603 &PositionInChainForMFMA]() {
1604 ++CurrMFMAPosition;
1605 MFMAChainForMFMA = CurrMFMAPosition % MFMAChains;
1606 PositionInChainForMFMA = CurrMFMAPosition / MFMAChains;
1607 };
1608
1609 bool IsPostRA = Phase == AMDGPU::SchedulingPhase::PostRA;
1610 assert(IsPostRA || MFMAChainSeeds.size() == MFMAChains);
1611
1612 bool UsesFMA = IsSmallKernelType || !IsPostRA;
1613 bool UsesDSRead = IsLargeKernelType && !IsPostRA && FirstPipeDSR;
1614 bool UsesCvt = HasCvt && (IsSmallKernelType || !IsPostRA);
1615 bool UsesVALU = IsSmallKernelType;
1616
1617 // PHASE 1: "Prefetch"
1618 if (UsesFMA) {
1619 // First Round FMA
1620 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1621 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1622 if (!IsPostRA && MFMAChains) {
1623 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1624 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1625 true));
1626 } else
1627 SG->addRule(
1628 std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1629 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1630 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1631
1632 // Second Round FMA
1633 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1634 SchedGroupMask::VALU, ExpRequirement, PipelineSyncID, DAG, TII);
1635 if (!IsPostRA && MFMAChains) {
1636 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1637 getNextTransPositionInChain(),
1638 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1639 } else
1640 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1641 SG->getSGID(), true));
1642 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1643 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1644 }
1645
1646 if (UsesDSRead) {
1647 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1648 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1649 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1650 SG->getSGID()));
1651 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1652 }
1653
1654 // First Round EXP
1655 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1656 SchedGroupMask::TRANS, ExpRequirement, PipelineSyncID, DAG, TII);
1657 if (!IsPostRA && MFMAChains)
1658 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1659 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(), true));
1660 else
1661 SG->addRule(std::make_shared<EnablesNthMFMA>(1, TII, SG->getSGID(), true));
1662 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1663 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1664 HasChainBetweenCvt));
1665 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1666
1667 incrementTransPosition();
1668
1669 // First Round CVT, Third Round FMA, Second Round EXP; interleaved
1670 for (unsigned I = 0; I < ExpRequirement; I++) {
1671 // First Round CVT
1672 if (UsesCvt) {
1673 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1674 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1675 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1676 if (HasChainBetweenCvt)
1677 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1678 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1679 else
1680 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(
1681 1 + (2 + UsesFMA) * I, TII, SG->getSGID()));
1682 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1683 }
1684
1685 // Third Round FMA
1686 if (UsesFMA) {
1687 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1688 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1689 if (!IsPostRA && MFMAChains) {
1690 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1691 getNextTransPositionInChain(),
1692 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(), true));
1693 } else
1694 SG->addRule(std::make_shared<EnablesNthMFMA>(2 * MFMAEnablement + 1,
1695 TII, SG->getSGID(), true));
1696 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1697 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1698 }
1699
1700 // Second Round EXP
1701 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1702 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1703 if (!IsPostRA && MFMAChains)
1704 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1705 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1706 true));
1707 else
1708 SG->addRule(std::make_shared<EnablesNthMFMA>(MFMAEnablement + 1, TII,
1709 SG->getSGID(), true));
1710 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1711 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1712 HasChainBetweenCvt));
1713 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1714 }
1715
1716 // The "extra" EXP which enables all MFMA
1717 // TODO: UsesExtraExp
1718 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1719 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1720 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1721 SG->addRule(std::make_shared<GreaterThanOrEqualToNSuccs>(
1722 8, TII, SG->getSGID(), HasChainBetweenCvt));
1723 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1724
1725 // PHASE 2: Main Interleave Loop
1726
1727 // The number of MFMAs per iteration
1728 unsigned MFMARatio =
1729 MFMAEnablement > ExpRequirement ? MFMAEnablement / ExpRequirement : 1;
1730 // The number of Exps per iteration
1731 unsigned ExpRatio =
1732 MFMAEnablement > ExpRequirement ? 1 : ExpRequirement / MFMAEnablement;
1733 // The reamaining Exps
1734 unsigned RemainingExp = TransPipeCount > (2 * ExpRequirement)
1735 ? TransPipeCount - (2 * ExpRequirement)
1736 : 0;
1737 unsigned ExpLoopCount = RemainingExp / ExpRatio;
1738 // In loop MFMAs
1739 unsigned MFMAInLoop = MFMAPipeCount > (MFMAEnablement * 2)
1740 ? MFMAPipeCount - (MFMAEnablement * 2)
1741 : 0;
1742 unsigned MFMALoopCount = MFMAInLoop / MFMARatio;
1743 unsigned VALUOps =
1744 AddPipeCount < MFMAPipeCount ? 1 : AddPipeCount / MFMAPipeCount;
1745 unsigned LoopSize = std::min(ExpLoopCount, MFMALoopCount);
1746
1747 for (unsigned I = 0; I < LoopSize; I++) {
1748 if (!(I * ExpRatio % ExpRequirement))
1749 incrementTransPosition();
1750
1751 // Round N MFMA
1752 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1753 SchedGroupMask::MFMA, MFMARatio, PipelineSyncID, DAG, TII);
1754 if (!IsPostRA && MFMAChains)
1755 SG->addRule(std::make_shared<IsExactMFMA>(
1756 PositionInChainForMFMA, MFMAChainSeeds[MFMAChainForMFMA], TII,
1757 SG->getSGID(), true));
1758 else
1759 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1760 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1761 incrementMFMAPosition();
1762
1763 if (UsesVALU) {
1764 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1765 SchedGroupMask::VALU, VALUOps, PipelineSyncID, DAG, TII);
1766 SG->addRule(std::make_shared<IsPipeAdd>(TII, SG->getSGID()));
1767 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1768 }
1769
1770 if (UsesDSRead && !(I % 4)) {
1771 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1772 SchedGroupMask::DS_READ, 2, PipelineSyncID, DAG, TII);
1773 SG->addRule(std::make_shared<OccursAtOrAfterNode>(*FirstPipeDSR, TII,
1774 SG->getSGID()));
1775 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1776 }
1777
1778 // CVT, EXP, FMA Interleaving
1779 for (unsigned J = 0; J < ExpRatio; J++) {
1780 auto MFMAOffset = (1 + UsesVALU) * MFMARatio * (I + 1);
1781 auto MaxMFMAOffset =
1782 (1 + UsesVALU) * ExpRequirement * MFMARatio / ExpRatio;
1783
1784 // Round N + 1 CVT
1785 if (UsesCvt) {
1786 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1787 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1788 SG->addRule(std::make_shared<IsCvt>(TII, SG->getSGID()));
1789 auto BaseDiff = (2 + UsesFMA) * (ExpRequirement - 1) + 1;
1790 auto DSROffset = I / 4 + 1;
1791 auto MaxDSROffset = MaxMFMAOffset / 4;
1792 // TODO: UsesExtraExp
1793 auto ExpOffset = I * ExpRatio + J >= ExpRequirement ? 0 : 1;
1794 auto CurrentOffset = UsesDSRead * std::min(MaxDSROffset, DSROffset) +
1795 std::min(MaxMFMAOffset, MFMAOffset) + BaseDiff +
1796 ExpOffset;
1797 if (HasChainBetweenCvt)
1798 SG->addRule(std::make_shared<IsReachableFromPrevNthGroup>(
1799 CurrentOffset, TII, SG->getSGID()));
1800 else
1801 SG->addRule(std::make_shared<IsSuccOfPrevNthGroup>(CurrentOffset, TII,
1802 SG->getSGID()));
1803 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1804 }
1805
1806 // Round N + 3 FMA
1807 if (UsesFMA) {
1808 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1809 SchedGroupMask::VALU, 1, PipelineSyncID, DAG, TII);
1810 if (!IsPostRA && MFMAChains)
1811 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1812 getNextTransPositionInChain(),
1813 MFMAChainSeeds[getNextTransMFMAChain()], TII, SG->getSGID(),
1814 true));
1815 else
1816 SG->addRule(std::make_shared<EnablesNthMFMA>(
1817 (((I * ExpRatio + J) / ExpRequirement) + 3) * MFMAEnablement + 1,
1818 TII, SG->getSGID(), true));
1819 SG->addRule(std::make_shared<IsFMA>(TII, SG->getSGID()));
1820 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1821 }
1822
1823 // Round N + 2 Exp
1824 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1825 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1826 if (!IsPostRA && MFMAChains)
1827 SG->addRule(std::make_shared<EnablesNthMFMAInChain>(
1828 PositionInChain, MFMAChainSeeds[MFMAChain], TII, SG->getSGID(),
1829 true));
1830 else
1831 SG->addRule(std::make_shared<EnablesNthMFMA>(
1832 (((I * ExpRatio + J) / ExpRequirement) + 2) * MFMAEnablement + 1,
1833 TII, SG->getSGID(), true));
1834 SG->addRule(std::make_shared<IsPipeExp>(TII, SG->getSGID(), true));
1835 SG->addRule(std::make_shared<LessThanNSuccs>(8, TII, SG->getSGID(),
1836 HasChainBetweenCvt));
1837 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1838 }
1839 }
1840
1841 // PHASE 3: Remaining MFMAs
1842 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1843 SchedGroupMask::MFMA, MFMAEnablement * 2, PipelineSyncID, DAG, TII);
1844 SG->addRule(std::make_shared<OccursAfterExp>(TII, SG->getSGID(), true));
1845 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1846 return true;
1847}
1848
1849class MFMAExpSimpleInterleaveOpt final : public IGLPStrategy {
1850public:
1851 bool applyIGLPStrategy(
1853 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1855
1856 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
1857 AMDGPU::SchedulingPhase Phase) override {
1858 return true;
1859 }
1860
1861 MFMAExpSimpleInterleaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
1862 : IGLPStrategy(DAG, TII) {
1863 IsBottomUp = true;
1864 }
1865};
1866
1867bool MFMAExpSimpleInterleaveOpt::applyIGLPStrategy(
1869 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
1871 // Count the number of MFMA instructions.
1872 unsigned MFMACount = 0;
1873 for (const MachineInstr &I : *DAG)
1874 if (TII->isMFMAorWMMA(I))
1875 ++MFMACount;
1876
1877 const unsigned PipelineSyncID = 0;
1878 for (unsigned I = 0; I < MFMACount * 3; ++I) {
1879 SchedGroup *SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1880 SchedGroupMask::TRANS, 1, PipelineSyncID, DAG, TII);
1881 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1882
1883 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
1884 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
1885 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
1886 }
1887
1888 return true;
1889}
1890
1891class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
1892private:
1893 // Whether the DS_READ is a predecessor of first four MFMA in region
1894 class EnablesInitialMFMA final : public InstructionRule {
1895 public:
1896 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1897 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1898 if (!SyncPipe.size())
1899 return false;
1900 int MFMAsFound = 0;
1901 if (!Cache->size()) {
1902 for (auto &Elt : SyncPipe[0].DAG->SUnits) {
1903 if (TII->isMFMAorWMMA(*Elt.getInstr())) {
1904 ++MFMAsFound;
1905 if (MFMAsFound > 4)
1906 break;
1907 Cache->push_back(&Elt);
1908 }
1909 }
1910 }
1911
1912 assert(Cache->size());
1913 auto *DAG = SyncPipe[0].DAG;
1914 for (auto &Elt : *Cache) {
1915 if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
1916 return true;
1917 }
1918 return false;
1919 }
1920
1921 EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
1922 bool NeedsCache = false)
1923 : InstructionRule(TII, SGID, NeedsCache) {}
1924 };
1925
1926 // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
1927 class IsPermForDSW final : public InstructionRule {
1928 public:
1929 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1930 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1931 auto *MI = SU->getInstr();
1932 if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
1933 return false;
1934
1935 bool FitsInGroup = false;
1936 // Does the VALU have a DS_WRITE successor
1937 if (!Collection.size()) {
1938 for (auto &Succ : SU->Succs) {
1939 SUnit *SuccUnit = Succ.getSUnit();
1940 if (TII->isDS(*SuccUnit->getInstr()) &&
1941 SuccUnit->getInstr()->mayStore()) {
1942 Cache->push_back(SuccUnit);
1943 FitsInGroup = true;
1944 }
1945 }
1946 return FitsInGroup;
1947 }
1948
1949 assert(Cache->size());
1950
1951 // Does the VALU have a DS_WRITE successor that is the same as other
1952 // VALU already in the group. The V_PERMs will all share 1 DS_W succ
1953 return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
1954 return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
1955 return ThisSucc.getSUnit() == Elt;
1956 });
1957 });
1958 }
1959
1960 IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
1961 : InstructionRule(TII, SGID, NeedsCache) {}
1962 };
1963
1964 // Whether the SU is a successor of any element in previous SchedGroup
1965 class IsSuccOfPrevGroup final : public InstructionRule {
1966 public:
1967 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1968 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1969 SchedGroup *OtherGroup = nullptr;
1970 for (auto &PipeSG : SyncPipe) {
1971 if ((unsigned)PipeSG.getSGID() == SGID - 1) {
1972 OtherGroup = &PipeSG;
1973 }
1974 }
1975
1976 if (!OtherGroup)
1977 return false;
1978 if (!OtherGroup->Collection.size())
1979 return true;
1980
1981 // Does the previous VALU have this DS_Write as a successor
1982 return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
1983 return any_of(Elt->Succs,
1984 [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
1985 });
1986 }
1987 IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
1988 bool NeedsCache = false)
1989 : InstructionRule(TII, SGID, NeedsCache) {}
1990 };
1991
1992 // Whether the combined load width of group is 128 bits
1993 class VMEMSize final : public InstructionRule {
1994 public:
1995 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
1996 SmallVectorImpl<SchedGroup> &SyncPipe) override {
1997 auto *MI = SU->getInstr();
1998 if (MI->getOpcode() == TargetOpcode::BUNDLE)
1999 return false;
2000 if (!Collection.size())
2001 return true;
2002
2003 int NumBits = 0;
2004
2005 auto TRI = TII->getRegisterInfo();
2006 auto &MRI = MI->getParent()->getParent()->getRegInfo();
2007 for (auto &Elt : Collection) {
2008 auto Op = Elt->getInstr()->getOperand(0);
2009 auto Size =
2010 TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
2011 NumBits += Size;
2012 }
2013
2014 if (NumBits < 128) {
2015 assert(TII->isVMEM(*MI) && MI->mayLoad());
2016 if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
2017 MRI, MI->getOperand(0))) <=
2018 128)
2019 return true;
2020 }
2021
2022 return false;
2023 }
2024
2025 VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
2026 : InstructionRule(TII, SGID, NeedsCache) {}
2027 };
2028
2029 /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
2030 /// that is \p Distance steps away
2031 class SharesPredWithPrevNthGroup final : public InstructionRule {
2032 private:
2033 unsigned Distance = 1;
2034
2035 public:
2036 bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
2037 SmallVectorImpl<SchedGroup> &SyncPipe) override {
2038 SchedGroup *OtherGroup = nullptr;
2039 if (!SyncPipe.size())
2040 return false;
2041
2042 if (!Cache->size()) {
2043
2044 for (auto &PipeSG : SyncPipe) {
2045 if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
2046 OtherGroup = &PipeSG;
2047 }
2048 }
2049
2050 if (!OtherGroup)
2051 return false;
2052 if (!OtherGroup->Collection.size())
2053 return true;
2054
2055 for (auto &OtherEle : OtherGroup->Collection) {
2056 for (auto &Pred : OtherEle->Preds) {
2057 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2058 AMDGPU::V_PERM_B32_e64)
2059 Cache->push_back(Pred.getSUnit());
2060 }
2061 }
2062
2063 // If the other group has no PERM preds, then this group won't share any
2064 if (!Cache->size())
2065 return false;
2066 }
2067
2068 auto *DAG = SyncPipe[0].DAG;
2069 // Does the previous DS_WRITE share a V_PERM predecessor with this
2070 // VMEM_READ
2071 return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
2072 return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
2073 });
2074 }
2075 SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
2076 unsigned SGID, bool NeedsCache = false)
2077 : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
2078 };
2079
2080public:
2081 bool applyIGLPStrategy(
2083 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2085
2086 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG,
2087 AMDGPU::SchedulingPhase Phase) override {
2088 return true;
2089 }
2090
2091 MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
2092 : IGLPStrategy(DAG, TII) {
2093 IsBottomUp = false;
2094 }
2095};
2096
2097static unsigned DSWCount = 0;
2098static unsigned DSWWithPermCount = 0;
2099static unsigned DSWWithSharedVMEMCount = 0;
2100
2101bool MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
2103 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
2105 unsigned MFMACount = 0;
2106 unsigned DSRCount = 0;
2107
2108 bool IsInitial = Phase == AMDGPU::SchedulingPhase::Initial;
2109
2110 assert((!IsInitial || (DSWCount == 0 && DSWWithPermCount == 0 &&
2111 DSWWithSharedVMEMCount == 0)) &&
2112 "DSWCounters should be zero in pre-RA scheduling!");
2113 SmallVector<SUnit *, 6> DSWithPerms;
2114 for (auto &SU : DAG->SUnits) {
2115 auto *I = SU.getInstr();
2116 if (TII->isMFMAorWMMA(*I))
2117 ++MFMACount;
2118 else if (TII->isDS(*I)) {
2119 if (I->mayLoad())
2120 ++DSRCount;
2121 else if (I->mayStore() && IsInitial) {
2122 ++DSWCount;
2123 for (auto Pred : SU.Preds) {
2124 if (Pred.getSUnit()->getInstr()->getOpcode() ==
2125 AMDGPU::V_PERM_B32_e64) {
2126 DSWithPerms.push_back(&SU);
2127 break;
2128 }
2129 }
2130 }
2131 }
2132 }
2133
2134 if (IsInitial) {
2135 DSWWithPermCount = DSWithPerms.size();
2136 auto *I = DSWithPerms.begin();
2137 auto *E = DSWithPerms.end();
2138
2139 // Get the count of DS_WRITES with V_PERM predecessors which
2140 // have loop carried dependencies (WAR) on the same VMEM_READs.
2141 // We consider partial overlap as a miss -- in other words,
2142 // for a given DS_W, we only consider another DS_W as matching
2143 // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
2144 // for every V_PERM pred of this DS_W.
2147 for (; I != E; I++) {
2148 SUnit *Cand = nullptr;
2149 bool MissedAny = false;
2150 for (auto &Pred : (*I)->Preds) {
2151 if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
2152 continue;
2153
2154 if (Cand && llvm::is_contained(Counted, Cand))
2155 break;
2156
2157 for (auto &Succ : Pred.getSUnit()->Succs) {
2158 auto *MI = Succ.getSUnit()->getInstr();
2159 if (!TII->isVMEM(*MI) || !MI->mayLoad())
2160 continue;
2161
2162 if (MissedAny || !VMEMLookup.size()) {
2163 MissedAny = true;
2164 VMEMLookup[MI] = *I;
2165 continue;
2166 }
2167
2168 auto [It, Inserted] = VMEMLookup.try_emplace(MI, *I);
2169 if (Inserted) {
2170 MissedAny = true;
2171 continue;
2172 }
2173
2174 Cand = It->second;
2175 if (llvm::is_contained(Counted, Cand)) {
2176 MissedAny = true;
2177 break;
2178 }
2179 }
2180 }
2181 if (!MissedAny && Cand) {
2182 DSWWithSharedVMEMCount += 2;
2183 Counted.push_back(Cand);
2184 Counted.push_back(*I);
2185 }
2186 }
2187 }
2188
2189 assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
2190 SchedGroup *SG;
2191 unsigned PipelineSyncID = 0;
2192 // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
2193 if (DSWWithPermCount) {
2194 for (unsigned I = 0; I < MFMACount; I++) {
2195 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2196 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2197 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2198
2199 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2200 SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
2201 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2202 }
2203 }
2204
2205 PipelineSyncID = 1;
2206 // Phase 1: Break up DS_READ and MFMA clusters.
2207 // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
2208 // prefetch
2209
2210 // Make ready initial MFMA
2211 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2212 SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
2213 SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
2214 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2215
2216 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2217 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2218 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2219
2220 // Interleave MFMA with DS_READ prefetch
2221 for (unsigned I = 0; I < DSRCount - 4; ++I) {
2222 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2223 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
2224 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2225
2226 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2227 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2228 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2229 }
2230
2231 // Phase 2a: Loop carried dependency with V_PERM
2232 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2233 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2234 for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
2235 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2236 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2237 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2238 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2239
2240 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2241 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2242 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2243 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2244
2245 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2246 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2247 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2248 1, TII, SG->getSGID(), true));
2249 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2250 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2251
2252 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2253 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2254 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2255
2256 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2257 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2258 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2259 3, TII, SG->getSGID(), true));
2260 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2261 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2262
2263 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2264 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2265 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2266 }
2267
2268 // Phase 2b: Loop carried dependency without V_PERM
2269 // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
2270 // Interleave MFMA to keep XDL unit busy throughout.
2271 for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
2272 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2273 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2274 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2275
2276 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2277 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2278 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2279 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2280
2281 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2282 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2283 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2284 }
2285
2286 // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
2287 // ultimately used by two DS_WRITE
2288 // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
2289 // depend on. Interleave MFMA to keep XDL unit busy throughout.
2290
2291 for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
2292 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2293 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2294 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2295 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2296
2297 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2298 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2299 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2300 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2301
2302 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2303 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2304 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2305
2306 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2307 SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
2308 SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
2309 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2310
2311 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2312 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
2313 SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID()));
2314 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2315
2316 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2317 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2318 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2319
2320 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2321 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2322 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2323 2, TII, SG->getSGID(), true));
2324 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2325 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2326
2327 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2328 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2329 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2330
2331 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2332 SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
2333 SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
2334 4, TII, SG->getSGID(), true));
2335 SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID()));
2336 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2337
2338 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
2339 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
2340 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
2341 }
2342
2343 return true;
2344}
2345
2346static std::unique_ptr<IGLPStrategy>
2347createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
2348 const SIInstrInfo *TII) {
2349 switch (ID) {
2350 case MFMASmallGemmOptID:
2351 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
2352 case MFMASmallGemmSingleWaveOptID:
2353 return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
2354 case MFMAExpInterleaveID:
2355 return std::make_unique<MFMAExpInterleaveOpt>(DAG, TII);
2356 case MFMAExpSimpleInterleaveID:
2357 return std::make_unique<MFMAExpSimpleInterleaveOpt>(DAG, TII);
2358 }
2359
2360 llvm_unreachable("Unknown IGLPStrategyID");
2361}
2362
2363class IGroupLPDAGMutation : public ScheduleDAGMutation {
2364private:
2365 const SIInstrInfo *TII;
2366
2367 ScheduleDAGMI *DAG;
2368
2369 // Organize lists of SchedGroups by their SyncID. SchedGroups /
2370 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
2371 // between then.
2373
2374 // Used to track instructions that can be mapped to multiple sched groups
2376
2377 // Add DAG edges that enforce SCHED_BARRIER ordering.
2378 void addSchedBarrierEdges(SUnit &SU);
2379
2380 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
2381 // not be reordered accross the SCHED_BARRIER. This is used for the base
2382 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
2383 // SCHED_BARRIER will always block all instructions that can be classified
2384 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
2385 // and may only synchronize with some SchedGroups. Returns the inverse of
2386 // Mask. SCHED_BARRIER's mask describes which instruction types should be
2387 // allowed to be scheduled across it. Invert the mask to get the
2388 // SchedGroupMask of instructions that should be barred.
2389 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
2390
2391 // Create SchedGroups for a SCHED_GROUP_BARRIER.
2392 void initSchedGroupBarrierPipelineStage(
2393 std::vector<SUnit>::reverse_iterator RIter);
2394
2395 bool initIGLPOpt(SUnit &SU);
2396
2397public:
2398 void apply(ScheduleDAGInstrs *DAGInstrs) override;
2399
2400 // The order in which the PipelineSolver should process the candidate
2401 // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
2402 // created SchedGroup first, and will consider that as the ultimate
2403 // predecessor group when linking. TOP_DOWN instead links and processes the
2404 // first created SchedGroup first.
2405 bool IsBottomUp = true;
2406
2407 // The scheduling phase this application of IGLP corresponds with.
2409
2410 IGroupLPDAGMutation() = default;
2411 IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
2412};
2413
2414unsigned SchedGroup::NumSchedGroups = 0;
2415
2416bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
2417 if (A != B && DAG->canAddEdge(B, A)) {
2418 DAG->addEdge(B, SDep(A, SDep::Artificial));
2419 return true;
2420 }
2421 return false;
2422}
2423
2424bool SchedGroup::canAddMI(const MachineInstr &MI) const {
2425 bool Result = false;
2426 if (MI.isMetaInstruction())
2427 Result = false;
2428
2429 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
2430 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
2431 TII->isTRANS(MI)))
2432 Result = true;
2433
2434 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
2435 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
2436 Result = true;
2437
2438 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
2439 TII->isSALU(MI))
2440 Result = true;
2441
2442 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
2443 TII->isMFMAorWMMA(MI))
2444 Result = true;
2445
2446 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
2447 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2448 Result = true;
2449
2450 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
2451 MI.mayLoad() &&
2452 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2453 Result = true;
2454
2455 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
2456 MI.mayStore() &&
2457 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
2458 Result = true;
2459
2460 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
2461 TII->isDS(MI))
2462 Result = true;
2463
2464 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
2465 MI.mayLoad() && TII->isDS(MI))
2466 Result = true;
2467
2468 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
2469 MI.mayStore() && TII->isDS(MI))
2470 Result = true;
2471
2472 else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
2473 TII->isTRANS(MI))
2474 Result = true;
2475
2476 LLVM_DEBUG(
2477 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
2478 << (Result ? " could classify " : " unable to classify ") << MI);
2479
2480 return Result;
2481}
2482
2483int SchedGroup::link(SUnit &SU, bool MakePred,
2484 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
2485 int MissedEdges = 0;
2486 for (auto *A : Collection) {
2487 SUnit *B = &SU;
2488 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2489 continue;
2490 if (MakePred)
2491 std::swap(A, B);
2492
2493 if (DAG->IsReachable(B, A))
2494 continue;
2495
2496 // tryAddEdge returns false if there is a dependency that makes adding
2497 // the A->B edge impossible, otherwise it returns true;
2498 bool Added = tryAddEdge(A, B);
2499 if (Added)
2500 AddedEdges.emplace_back(A, B);
2501 else
2502 ++MissedEdges;
2503 }
2504
2505 return MissedEdges;
2506}
2507
2508void SchedGroup::link(SUnit &SU, bool MakePred) {
2509 for (auto *A : Collection) {
2510 SUnit *B = &SU;
2511 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
2512 continue;
2513 if (MakePred)
2514 std::swap(A, B);
2515
2516 tryAddEdge(A, B);
2517 }
2518}
2519
2520void SchedGroup::link(SUnit &SU,
2521 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
2522 for (auto *A : Collection) {
2523 SUnit *B = &SU;
2524 if (P(A, B))
2525 std::swap(A, B);
2526
2527 tryAddEdge(A, B);
2528 }
2529}
2530
2531void SchedGroup::link(SchedGroup &OtherGroup) {
2532 for (auto *B : OtherGroup.Collection)
2533 link(*B);
2534}
2535
2536bool SchedGroup::canAddSU(SUnit &SU) const {
2537 MachineInstr &MI = *SU.getInstr();
2538 if (MI.getOpcode() != TargetOpcode::BUNDLE)
2539 return canAddMI(MI);
2540
2541 // Special case for bundled MIs.
2542 const MachineBasicBlock *MBB = MI.getParent();
2543 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
2544 while (E != MBB->end() && E->isBundledWithPred())
2545 ++E;
2546
2547 // Return true if all of the bundled MIs can be added to this group.
2548 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
2549}
2550
2551void SchedGroup::initSchedGroup() {
2552 for (auto &SU : DAG->SUnits) {
2553 if (isFull())
2554 break;
2555
2556 if (canAddSU(SU))
2557 add(SU);
2558 }
2559}
2560
2561void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
2562 SUnitsToCandidateSGsMap &SyncedInstrs) {
2563 SUnit &InitSU = *RIter;
2564 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
2565 auto &SU = *RIter;
2566 if (isFull())
2567 break;
2568
2569 if (canAddSU(SU))
2570 SyncedInstrs[&SU].push_back(SGID);
2571 }
2572
2573 add(InitSU);
2574 assert(MaxSize);
2575 (*MaxSize)++;
2576}
2577
2578void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
2579 auto I = DAG->SUnits.rbegin();
2580 auto E = DAG->SUnits.rend();
2581 for (; I != E; ++I) {
2582 auto &SU = *I;
2583 if (isFull())
2584 break;
2585 if (canAddSU(SU))
2586 SyncedInstrs[&SU].push_back(SGID);
2587 }
2588}
2589
2590void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
2591 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
2592 if (!TSchedModel || DAGInstrs->SUnits.empty())
2593 return;
2594
2595 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
2596 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
2597 TII = ST.getInstrInfo();
2598 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
2599 SyncedSchedGroups.clear();
2600 SyncedInstrs.clear();
2601 bool FoundSB = false;
2602 bool FoundIGLP = false;
2603 bool ShouldApplyIGLP = false;
2604 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
2605 unsigned Opc = R->getInstr()->getOpcode();
2606 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
2607 if (Opc == AMDGPU::SCHED_BARRIER) {
2608 addSchedBarrierEdges(*R);
2609 FoundSB = true;
2610 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
2611 initSchedGroupBarrierPipelineStage(R);
2612 FoundSB = true;
2613 } else if (Opc == AMDGPU::IGLP_OPT) {
2614 resetEdges(*R, DAG);
2615 if (!FoundSB && !FoundIGLP) {
2616 FoundIGLP = true;
2617 ShouldApplyIGLP = initIGLPOpt(*R);
2618 }
2619 }
2620 }
2621
2622 if (FoundSB || (FoundIGLP && ShouldApplyIGLP)) {
2623 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
2624 // PipelineSolver performs the mutation by adding the edges it
2625 // determined as the best
2626 PS.solve();
2627 return;
2628 }
2629}
2630
2631void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
2632 MachineInstr &MI = *SchedBarrier.getInstr();
2633 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
2634 // Remove all existing edges from the SCHED_BARRIER that were added due to the
2635 // instruction having side effects.
2636 resetEdges(SchedBarrier, DAG);
2637 LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
2638 << MI.getOperand(0).getImm() << "\n");
2639 auto InvertedMask =
2640 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
2641 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
2642 SG.initSchedGroup();
2643
2644 // Preserve original instruction ordering relative to the SCHED_BARRIER.
2645 SG.link(
2646 SchedBarrier,
2647 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
2648 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
2649}
2650
2651SchedGroupMask
2652IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
2653 // Invert mask and erase bits for types of instructions that are implied to be
2654 // allowed past the SCHED_BARRIER.
2655 SchedGroupMask InvertedMask = ~Mask;
2656
2657 // ALU implies VALU, SALU, MFMA, TRANS.
2658 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
2659 InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
2660 ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
2661 // VALU, SALU, MFMA, TRANS implies ALU.
2662 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
2663 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
2664 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
2665 (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
2666 InvertedMask &= ~SchedGroupMask::ALU;
2667
2668 // VMEM implies VMEM_READ, VMEM_WRITE.
2669 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
2670 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
2671 // VMEM_READ, VMEM_WRITE implies VMEM.
2672 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
2673 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
2674 InvertedMask &= ~SchedGroupMask::VMEM;
2675
2676 // DS implies DS_READ, DS_WRITE.
2677 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
2678 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
2679 // DS_READ, DS_WRITE implies DS.
2680 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
2681 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
2682 InvertedMask &= ~SchedGroupMask::DS;
2683
2684 LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
2685 << "\n");
2686
2687 return InvertedMask;
2688}
2689
2690void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
2691 std::vector<SUnit>::reverse_iterator RIter) {
2692 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
2693 // to the instruction having side effects.
2694 resetEdges(*RIter, DAG);
2695 MachineInstr &SGB = *RIter->getInstr();
2696 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
2697 int32_t SGMask = SGB.getOperand(0).getImm();
2698 int32_t Size = SGB.getOperand(1).getImm();
2699 int32_t SyncID = SGB.getOperand(2).getImm();
2700
2701 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
2702 Size, SyncID, DAG, TII);
2703
2704 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
2705}
2706
2707bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
2708 IGLPStrategyID StrategyID =
2709 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
2710 auto S = createIGLPStrategy(StrategyID, DAG, TII);
2711 if (!S->shouldApplyStrategy(DAG, Phase))
2712 return false;
2713
2714 IsBottomUp = S->IsBottomUp;
2715 return S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, Phase);
2716}
2717
2718} // namespace
2719
2720namespace llvm {
2721
2722/// \p Phase specifes whether or not this is a reentry into the
2723/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
2724/// same scheduling region (e.g. pre and post-RA scheduling / multiple
2725/// scheduling "phases"), we can reenter this mutation framework more than once
2726/// for a given region.
2727std::unique_ptr<ScheduleDAGMutation>
2729 return std::make_unique<IGroupLPDAGMutation>(Phase);
2730}
2731
2732} // end namespace llvm
unsigned const MachineRegisterInfo * MRI
aarch64 falkor hwpf fix Falkor HW Prefetch Fix Late Phase
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
uint64_t Size
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
#define P(N)
uint32_t Number
Definition: Profile.cpp:47
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
This class represents an Operation in the Expression.
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
unsigned size() const
Definition: DenseMap.h:99
Instructions::iterator instr_iterator
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
int64_t getImm() const
Scheduling dependency.
Definition: ScheduleDAG.h:49
SUnit * getSUnit() const
Definition: ScheduleDAG.h:498
@ Data
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition: ScheduleDAG.h:72
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:270
void removePred(const SDep &D)
Removes the specified edge as a pred of the current node if it exists.
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:263
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:262
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
bool addEdge(SUnit *SuccSU, const SDep &PredDep)
Add a DAG edge to the given SU with the given predecessor dependence data.
bool IsReachable(SUnit *SU, SUnit *TargetSU)
IsReachable - Checks if SU is reachable from TargetSU.
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU)
True if an edge can be added from PredSU to SuccSU without creating a cycle.
void dump() const override
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
Mutate the DAG as a postpass after normal DAG building.
virtual void apply(ScheduleDAGInstrs *DAG)=0
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
Provide an instruction scheduling machine model to CodeGen passes.
An efficient, type-erasing, non-owning reference to a callable.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
std::unique_ptr< ScheduleDAGMutation > createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase)
Phase specifes whether or not this is a reentry into the IGroupLPDAGMutation.
@ NONE
Definition: Attributor.h:6475
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
FormattedNumber format_hex(uint64_t N, unsigned Width, bool Upper=false)
format_hex - Output N as a fixed width hexadecimal.
Definition: Format.h:187
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860