1 //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file defines a set of schedule DAG mutations that can be used to
10 // override default scheduler behavior to enforce specific scheduling patterns.
11 // They should be used in cases where runtime performance considerations, such
12 // as inter-wavefront interactions, mean that compile-time heuristics cannot
13 // predict the optimal instruction ordering, or in kernels where optimal
14 // instruction scheduling is important enough to warrant manual intervention.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "AMDGPUIGroupLP.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "llvm/ADT/BitmaskEnum.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetOpcodes.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "igrouplp"
31 
32 namespace {
33 
34 static cl::opt<bool> EnableExactSolver(
35  "amdgpu-igrouplp-exact-solver", cl::Hidden,
36  cl::desc("Whether to use the exponential time solver to fit "
37  "the instructions to the pipeline as closely as "
38  "possible."),
39  cl::init(false));
40 
41 static cl::opt<unsigned> CutoffForExact(
42  "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43  cl::desc("The maximum number of scheduling group conflicts "
44  "which we attempt to solve with the exponential time "
45  "exact solver. Problem sizes greater than this will "
46  "be solved by the less accurate greedy algorithm. Selecting "
47  "solver by size is superseded by manually selecting "
48  "the solver (e.g. by amdgpu-igrouplp-exact-solver)."));
49 
50 static cl::opt<uint64_t> MaxBranchesExplored(
51  "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52  cl::desc("The number of branches that we are willing to explore with "
53  "the exact algorithm before giving up."));
54 
55 static cl::opt<bool> UseCostHeur(
56  "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57  cl::desc("Whether to use the cost heuristic to make choices as we "
58  "traverse the search space using the exact solver. Defaulted "
59  "to on, and if turned off, we will use the node order -- "
60  "attempting to put the later nodes in the later sched groups. "
61  "Experimentally, results are mixed, so this should be set on a "
62  "case-by-case basis."));
63 
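// Illustrative usage (an assumption for exposition, not taken from this file):
// these cl::opt flags can be passed directly to llc, or via -mllvm when the
// compile is driven through clang. The target, GPU, and file names below are
// made up.
//
//   llc -march=amdgcn -mcpu=gfx90a \
//       -amdgpu-igrouplp-exact-solver \
//       -amdgpu-igrouplp-exact-solver-max-branches=100000 kernel.ll
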
64 // Components of the mask that determines which instruction types may be
65 // classified into a SchedGroup.
66 enum class SchedGroupMask {
67  NONE = 0u,
68  ALU = 1u << 0,
69  VALU = 1u << 1,
70  SALU = 1u << 2,
71  MFMA = 1u << 3,
72  VMEM = 1u << 4,
73  VMEM_READ = 1u << 5,
74  VMEM_WRITE = 1u << 6,
75  DS = 1u << 7,
76  DS_READ = 1u << 8,
77  DS_WRITE = 1u << 9,
78  ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79  DS_READ | DS_WRITE,
80  LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81 };
82 
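// A minimal sketch, not used elsewhere in this file: mask bits compose with the
// bitwise operators enabled by LLVM_MARK_AS_BITMASK_ENUM, and membership is
// tested by comparing against NONE, mirroring how canAddMI() below consumes the
// mask. The helper name is hypothetical.
static inline bool maskPermits(SchedGroupMask Mask, SchedGroupMask Kind) {
  // True if at least one of Kind's bits is present in Mask.
  return (Mask & Kind) != SchedGroupMask::NONE;
}
// For example, SchedGroupMask::MFMA | SchedGroupMask::DS_READ has numeric value
// (1u << 3) | (1u << 8) == 0x108, and maskPermits() returns true for it with
// either MFMA or DS_READ as Kind.
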
83 typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
84 
85 // Classify instructions into groups to enable fine tuned control over the
86 // scheduler. These groups may be more specific than current SchedModel
87 // instruction classes.
88 class SchedGroup {
89 private:
90  // Mask that defines which instruction types can be classified into this
91  // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
92  // and SCHED_GROUP_BARRIER.
93  SchedGroupMask SGMask;
94 
95  // Maximum number of SUnits that can be added to this group.
96  std::optional<unsigned> MaxSize;
97 
98  // SchedGroups will only synchronize with other SchedGroups that have the same
99  // SyncID.
100  int SyncID = 0;
101 
102  // SGID is used to map instructions to candidate SchedGroups
103  unsigned SGID;
104 
105  // Count of the number of created SchedGroups, used to initialize SGID.
106  static unsigned NumSchedGroups;
107 
108  ScheduleDAGInstrs *DAG;
109 
110  const SIInstrInfo *TII;
111 
112  // Try to add an edge from SU A to SU B.
113  bool tryAddEdge(SUnit *A, SUnit *B);
114 
115  // Use SGMask to determine whether we can classify MI as a member of this
116  // SchedGroup object.
117  bool canAddMI(const MachineInstr &MI) const;
118 
119 public:
120  // Collection of SUnits that are classified as members of this group.
121  SmallVector<SUnit *, 32> Collection;
122 
123  // Returns true if SU can be added to this SchedGroup.
124  bool canAddSU(SUnit &SU) const;
125 
126  // Add DAG dependencies between all SUnits in this SchedGroup and this SU. If
127  // MakePred is true, SU will be a predecessor of the SUnits in this
128  // SchedGroup; otherwise SU will be a successor.
129  void link(SUnit &SU, bool MakePred = false);
130 
131  // Add DAG dependencies and track which edges are added, and the count of
132  // missed edges
133  int link(SUnit &SU, bool MakePred,
134  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
135 
136  // Add DAG dependencies between all SUnits in this SchedGroup and this SU.
137  // Use the predicate to determine whether SU should be a predecessor (P =
138  // true) or a successor (P = false) of this SchedGroup.
139  void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
140 
141  // Add DAG dependencies such that SUnits in this group shall be ordered
142  // before SUnits in OtherGroup.
143  void link(SchedGroup &OtherGroup);
144 
145  // Returns true if no more instructions may be added to this group.
146  bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
147 
148  // Add SU to the SchedGroup.
149  void add(SUnit &SU) {
150  LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
151  << format_hex((int)SGMask, 10, true) << " adding "
152  << *SU.getInstr());
153  Collection.push_back(&SU);
154  }
155 
156  // Remove last element in the SchedGroup
157  void pop() { Collection.pop_back(); }
158 
159  // Identify and add all relevant SUs from the DAG to this SchedGroup.
160  void initSchedGroup();
161 
162  // Add instructions to the SchedGroup bottom up starting from RIter.
163  // SyncedInstrs is used to record, for each SUnit encountered along the way,
164  // this group's SGID as a candidate it may be assigned to by the solver.
165  // RIter will be added to the SchedGroup as well, and dependencies will be
166  // added so that RIter will always be scheduled at the end of the group.
167  void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
168  SUnitsToCandidateSGsMap &SyncedInstrs);
169 
170  void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
171 
172  int getSyncID() { return SyncID; }
173 
174  int getSGID() { return SGID; }
175 
176  SchedGroupMask getMask() { return SGMask; }
177 
178  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
179  ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
180  : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
181  SGID = NumSchedGroups++;
182  }
183 
184  SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
185  ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
186  : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
187  SGID = NumSchedGroups++;
188  }
189 };
190 
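// A minimal sketch of how SchedGroup is meant to be driven. The helper below is
// hypothetical -- nothing in this file calls it -- and simply builds a
// two-stage pipeline on a single SyncID, ordering the first group's
// instructions before the second's.
static void buildTwoStagePipelineExample(ScheduleDAGInstrs *DAG,
                                         const SIInstrInfo *TII) {
  // Up to two DS reads, followed by a single MFMA, all on SyncID 0.
  SchedGroup Loads(SchedGroupMask::DS_READ, 2, /*SyncID=*/0, DAG, TII);
  SchedGroup Compute(SchedGroupMask::MFMA, 1, /*SyncID=*/0, DAG, TII);
  // Greedily claim matching SUnits from the DAG until each group is full.
  Loads.initSchedGroup();
  Compute.initSchedGroup();
  // Add artificial edges so every SUnit in Loads precedes every SUnit in
  // Compute; edges that would create a cycle are skipped by tryAddEdge().
  Loads.link(Compute);
}
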
191 // Remove all existing edges from a SCHED_BARRIER, SCHED_GROUP_BARRIER, or IGLP_OPT.
192 static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
193  assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
194  SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
195  SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
196 
197  while (!SU.Preds.empty())
198  for (auto &P : SU.Preds)
199  SU.removePred(P);
200 
201  while (!SU.Succs.empty())
202  for (auto &S : SU.Succs)
203  for (auto &SP : S.getSUnit()->Preds)
204  if (SP.getSUnit() == &SU)
205  S.getSUnit()->removePred(SP);
206 }
207 
208 typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
209 typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
210 
211 // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
212 // in non-trivial cases. For example, if the requested pipeline is
213 // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
214 // in the DAG, then we will have an instruction that cannot be trivially
215 // assigned to a SchedGroup. The PipelineSolver class implements two algorithms
216 // to find a good solution to the pipeline -- a greedy algorithm and an exact
217 // algorithm. The exact algorithm has exponential time complexity and should
218 // only be used for small problems, or for medium-sized problems where an exact
219 // solution is highly desired.
220 class PipelineSolver {
221  ScheduleDAGMI *DAG;
222 
223  // Instructions that can be assigned to multiple SchedGroups
224  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
225  SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
226  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
227  // The current working pipeline
228  SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
229  // The pipeline that has the best solution found so far
230  SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
231 
232  // Whether or not we actually have any SyncedInstrs to try to solve.
233  bool NeedsSolver = false;
234 
235  // Compute an estimate of the size of the search tree -- the true size is
236  // the product of each conflictedInst.Matches.size() across all SyncPipelines.
237  unsigned computeProblemSize();
238 
239  // The cost penalty of not assigning a SU to a SchedGroup
240  int MissPenalty = 0;
241 
242  // Costs in terms of the number of edges we are unable to add
243  int BestCost = -1;
244  int CurrCost = 0;
245 
246  // Index pointing to the conflicting instruction that is currently being
247  // fitted
248  int CurrConflInstNo = 0;
249  // Index to the pipeline that is currently being fitted
250  int CurrSyncGroupIdx = 0;
251  // The first non trivial pipeline
252  int BeginSyncGroupIdx = 0;
253 
254  // How many branches we have explored
255  uint64_t BranchesExplored = 0;
256 
257  // Update indices to fit next conflicting instruction
258  void advancePosition();
259  // Recede indices to attempt to find better fit for previous conflicting
260  // instruction
261  void retreatPosition();
262 
263  // The exponential time algorithm which finds the provably best fit
264  bool solveExact();
265  // The polynomial time algorithm which attempts to find a good fit
266  bool solveGreedy();
267  // Whether or not the current solution is optimal
268  bool checkOptimal();
269  // Populate the ready list, prioritizing fewest missed edges first.
270  void populateReadyList(SUToCandSGsPair &CurrSU,
271  SmallVectorImpl<std::pair<int, int>> &ReadyList,
272  SmallVectorImpl<SchedGroup> &SyncPipeline);
273  // Add edges corresponding to the SchedGroups as assigned by solver
274  void makePipeline();
275  // Add the edges from the SU to the other SchedGroups in pipeline, and
276  // return the number of edges missed.
277  int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
278  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
279  // Remove the edges passed via AddedEdges
280  void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
281  // Convert the passed in maps to arrays for bidirectional iterators
282  void convertSyncMapsToArrays();
283 
284  void reset();
285 
286 public:
287  // Invoke the solver to map instructions to instruction groups. A heuristic
288  // and the command-line options determine whether the exact or greedy algorithm is used.
289  void solve();
290 
291  PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
292  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
293  ScheduleDAGMI *DAG)
294  : DAG(DAG), SyncedInstrs(SyncedInstrs),
295  SyncedSchedGroups(SyncedSchedGroups) {
296 
297  for (auto &PipelineInstrs : SyncedInstrs) {
298  if (PipelineInstrs.second.size() > 0) {
299  NeedsSolver = true;
300  break;
301  }
302  }
303 
304  if (!NeedsSolver)
305  return;
306 
307  convertSyncMapsToArrays();
308 
309  CurrPipeline = BestPipeline;
310 
311  while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
312  PipelineInstrs[BeginSyncGroupIdx].size() == 0)
313  ++BeginSyncGroupIdx;
314 
315  if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
316  return;
317  }
318 };
319 
320 void PipelineSolver::reset() {
321 
322  for (auto &SyncPipeline : CurrPipeline) {
323  for (auto &SG : SyncPipeline) {
324  SmallVector<SUnit *, 32> TempCollection = SG.Collection;
325  SG.Collection.clear();
326  auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
327  return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
328  });
329  if (SchedBarr != TempCollection.end())
330  SG.Collection.push_back(*SchedBarr);
331  }
332  }
333 
334  CurrSyncGroupIdx = BeginSyncGroupIdx;
335  CurrConflInstNo = 0;
336  CurrCost = 0;
337 }
338 
339 void PipelineSolver::convertSyncMapsToArrays() {
340  for (auto &SyncPipe : SyncedSchedGroups) {
341  BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
342  }
343 
344  int PipelineIDx = SyncedInstrs.size() - 1;
345  PipelineInstrs.resize(SyncedInstrs.size());
346  for (auto &SyncInstrMap : SyncedInstrs) {
347  for (auto &SUsToCandSGs : SyncInstrMap.second) {
348  if (PipelineInstrs[PipelineIDx].size() == 0) {
349  PipelineInstrs[PipelineIDx].push_back(
350  std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
351  continue;
352  }
353  auto SortPosition = PipelineInstrs[PipelineIDx].begin();
354  // Insert them in sorted order -- this allows for good parsing order in
355  // the greedy algorithm
356  while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
357  SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
358  ++SortPosition;
359  PipelineInstrs[PipelineIDx].insert(
360  SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
361  }
362  --PipelineIDx;
363  }
364 }
365 
366 void PipelineSolver::makePipeline() {
367  // Preserve the order of the barriers for subsequent SchedGroupBarrier mutations.
368  for (auto &SyncPipeline : BestPipeline) {
369  for (auto &SG : SyncPipeline) {
370  SUnit *SGBarr = nullptr;
371  for (auto &SU : SG.Collection) {
372  if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
373  SGBarr = SU;
374  }
375  // Command-line-requested IGroupLP pipelines don't have an SGBarr.
376  if (!SGBarr)
377  continue;
378  resetEdges(*SGBarr, DAG);
379  SG.link(*SGBarr, false);
380  }
381  }
382 
383  for (auto &SyncPipeline : BestPipeline) {
384  auto I = SyncPipeline.rbegin();
385  auto E = SyncPipeline.rend();
386  for (; I != E; ++I) {
387  auto &GroupA = *I;
388  for (auto J = std::next(I); J != E; ++J) {
389  auto &GroupB = *J;
390  GroupA.link(GroupB);
391  }
392  }
393  }
394 }
395 
396 int PipelineSolver::addEdges(
397  SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
398  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
399  int AddedCost = 0;
400  bool MakePred = false;
401 
402  // The groups in the pipeline are in reverse order. Thus,
403  // by traversing them from last to first, we are traversing
404  // them in the order in which they were introduced in the code. After we
405  // pass the group the SU is being assigned to, the SU should be
406  // linked as a predecessor of the subsequent SchedGroups.
407  auto GroupNo = (int)SyncPipeline.size() - 1;
408  for (; GroupNo >= 0; GroupNo--) {
409  if (SyncPipeline[GroupNo].getSGID() == SGID) {
410  MakePred = true;
411  continue;
412  }
413  auto Group = &SyncPipeline[GroupNo];
414  AddedCost += Group->link(*SU, MakePred, AddedEdges);
415  assert(AddedCost >= 0);
416  }
417 
418  return AddedCost;
419 }
420 
421 void PipelineSolver::removeEdges(
422  const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
423  // Only remove the edges that we have added when testing
424  // the fit.
425  for (auto &PredSuccPair : EdgesToRemove) {
426  SUnit *Pred = PredSuccPair.first;
427  SUnit *Succ = PredSuccPair.second;
428 
429  auto Match = llvm::find_if(
430  Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
431  if (Match != Succ->Preds.end()) {
432  assert(Match->isArtificial());
433  Succ->removePred(*Match);
434  }
435  }
436 }
437 
438 void PipelineSolver::advancePosition() {
439  ++CurrConflInstNo;
440 
441  if (static_cast<size_t>(CurrConflInstNo) >=
442  PipelineInstrs[CurrSyncGroupIdx].size()) {
443  CurrConflInstNo = 0;
444  ++CurrSyncGroupIdx;
445  // Advance to next non-trivial pipeline
446  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
447  PipelineInstrs[CurrSyncGroupIdx].size() == 0)
448  ++CurrSyncGroupIdx;
449  }
450 }
451 
452 void PipelineSolver::retreatPosition() {
453  assert(CurrConflInstNo >= 0);
454  assert(CurrSyncGroupIdx >= 0);
455 
456  if (CurrConflInstNo > 0) {
457  --CurrConflInstNo;
458  return;
459  }
460 
461  if (CurrConflInstNo == 0) {
462  // If we return to the starting position, we have explored
463  // the entire tree
464  if (CurrSyncGroupIdx == BeginSyncGroupIdx)
465  return;
466 
467  --CurrSyncGroupIdx;
468  // Go to previous non-trivial pipeline
469  while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
470  --CurrSyncGroupIdx;
471 
472  CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
473  }
474 }
475 
476 bool PipelineSolver::checkOptimal() {
477  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
478  if (BestCost == -1 || CurrCost < BestCost) {
479  BestPipeline = CurrPipeline;
480  BestCost = CurrCost;
481  LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n");
482  }
483  assert(BestCost >= 0);
484  }
485 
486  bool DoneExploring = false;
487  if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
488  DoneExploring = true;
489 
490  return (DoneExploring || BestCost == 0);
491 }
492 
493 void PipelineSolver::populateReadyList(
494  SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
495  SmallVectorImpl<SchedGroup> &SyncPipeline) {
496  assert(CurrSU.second.size() >= 1);
497  auto I = CurrSU.second.rbegin();
498  auto E = CurrSU.second.rend();
499  for (; I != E; ++I) {
500  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
501  int CandSGID = *I;
502  SchedGroup *Match;
503  for (auto &SG : SyncPipeline) {
504  if (SG.getSGID() == CandSGID)
505  Match = &SG;
506  }
507 
508  if (UseCostHeur) {
509  if (Match->isFull()) {
510  ReadyList.push_back(std::pair(*I, MissPenalty));
511  continue;
512  }
513 
514  int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
515  ReadyList.push_back(std::pair(*I, TempCost));
516  removeEdges(AddedEdges);
517  } else
518  ReadyList.push_back(std::pair(*I, -1));
519  }
520 
521  if (UseCostHeur) {
522  std::sort(ReadyList.begin(), ReadyList.end(),
523  [](std::pair<int, int> A, std::pair<int, int> B) {
524  return A.second < B.second;
525  });
526  }
527 
528  assert(ReadyList.size() == CurrSU.second.size());
529 }
530 
531 bool PipelineSolver::solveExact() {
532  if (checkOptimal())
533  return true;
534 
535  if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
536  return false;
537 
538  assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
539  assert(static_cast<size_t>(CurrConflInstNo) <
540  PipelineInstrs[CurrSyncGroupIdx].size());
541  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
542  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
543  << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
544 
545  // Candidate SchedGroup ID -> cost pairs
546  SmallVector<std::pair<int, int>, 4> ReadyList;
547  // Prioritize the candidate sched groups in terms of lowest cost first
548  populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
549 
550  auto I = ReadyList.begin();
551  auto E = ReadyList.end();
552  for (; I != E; ++I) {
553  // If we are trying SGs in least cost order, and the current SG is cost
554  // infeasible, then all subsequent SGs will also be cost infeasible, so we
555  // can prune.
556  if (BestCost != -1 && (CurrCost + I->second > BestCost))
557  return false;
558 
559  int CandSGID = I->first;
560  int AddedCost = 0;
561  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
562  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
563  SchedGroup *Match;
564  for (auto &SG : SyncPipeline) {
565  if (SG.getSGID() == CandSGID)
566  Match = &SG;
567  }
568 
569  if (Match->isFull())
570  continue;
571 
572  LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
573  << (int)Match->getMask() << " and ID " << CandSGID
574  << "\n");
575  Match->add(*CurrSU.first);
576  AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
577  LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
578  CurrCost += AddedCost;
579  advancePosition();
580  ++BranchesExplored;
581  bool FinishedExploring = false;
582  // Only keep exploring if the cost so far is still below the best known
583  // solution; otherwise backtrack.
584  if (CurrCost < BestCost || BestCost == -1) {
585  if (solveExact()) {
586  FinishedExploring = BestCost != 0;
587  if (!FinishedExploring)
588  return true;
589  }
590  }
591 
592  retreatPosition();
593  CurrCost -= AddedCost;
594  removeEdges(AddedEdges);
595  Match->pop();
596  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
597  if (FinishedExploring)
598  return true;
599  }
600 
601  // Try the pipeline where the current instruction is omitted
602  // Potentially if we omit a problematic instruction from the pipeline,
603  // all the other instructions can nicely fit.
604  CurrCost += MissPenalty;
605  advancePosition();
606 
607  LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
608 
609  bool FinishedExploring = false;
610  if (CurrCost < BestCost || BestCost == -1) {
611  if (solveExact()) {
612  FinishedExploring = BestCost != 0;
613  if (!FinishedExploring)
614  return true;
615  }
616  }
617 
618  retreatPosition();
619  CurrCost -= MissPenalty;
620  return FinishedExploring;
621 }
622 
623 bool PipelineSolver::solveGreedy() {
624  BestCost = 0;
625  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
626 
627  while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
628  SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
629  int BestNodeCost = -1;
630  int TempCost;
631  SchedGroup *BestGroup = nullptr;
632  int BestGroupID = -1;
633  auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
634  LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
635  << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
636 
637  // Since we have added the potential SchedGroups from bottom up, but
638  // traversed the DAG from top down, parse over the groups from last to
639  // first. If we fail to do this for the greedy algorithm, the solution will
640  // likely not be good in more complex cases.
641  auto I = CurrSU.second.rbegin();
642  auto E = CurrSU.second.rend();
643  for (; I != E; ++I) {
644  std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
645  int CandSGID = *I;
646  SchedGroup *Match;
647  for (auto &SG : SyncPipeline) {
648  if (SG.getSGID() == CandSGID)
649  Match = &SG;
650  }
651 
652  LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
653  << (int)Match->getMask() << "\n");
654 
655  if (Match->isFull()) {
656  LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
657  continue;
658  }
659  TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
660  LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
661  if (TempCost < BestNodeCost || BestNodeCost == -1) {
662  BestGroup = Match;
663  BestNodeCost = TempCost;
664  BestGroupID = CandSGID;
665  }
666  removeEdges(AddedEdges);
667  if (BestNodeCost == 0)
668  break;
669  }
670 
671  if (BestGroupID != -1) {
672  BestGroup->add(*CurrSU.first);
673  addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
674  LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask "
675  << (int)BestGroup->getMask() << "\n");
676  BestCost += BestNodeCost;
677  } else
678  BestCost += MissPenalty;
679 
680  CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
681  advancePosition();
682  }
683  BestPipeline = CurrPipeline;
684  removeEdges(AddedEdges);
685  return false;
686 }
687 
688 unsigned PipelineSolver::computeProblemSize() {
689  unsigned ProblemSize = 0;
690  for (auto &PipeConflicts : PipelineInstrs) {
691  ProblemSize += PipeConflicts.size();
692  }
693 
694  return ProblemSize;
695 }
696 
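// Worked example of the estimate: with two conflicted SUnits that have 3 and 2
// candidate SchedGroups respectively, computeProblemSize() returns 3 + 2 = 5,
// which is the number compared against -amdgpu-igrouplp-exact-solver-cutoff,
// while the exact solver's search tree is closer to the product 3 * 2 = 6 leaf
// assignments (more once the "leave the SU unassigned" branch is counted).
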
697 void PipelineSolver::solve() {
698  if (!NeedsSolver)
699  return;
700 
701  unsigned ProblemSize = computeProblemSize();
702  assert(ProblemSize > 0);
703 
704  bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
705  MissPenalty = (ProblemSize / 2) + 1;
706 
707  LLVM_DEBUG(DAG->dump());
708  if (EnableExactSolver || BelowCutoff) {
709  LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
710  solveGreedy();
711  reset();
712  LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
713  if (BestCost > 0) {
714  LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
715  solveExact();
716  LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
717  }
718  } else { // Use the Greedy Algorithm by default
719  LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
720  solveGreedy();
721  }
722 
723  makePipeline();
724 }
725 
726 enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
727 
728 // Interface for implementing an IGLP scheduling strategy.
729 class IGLPStrategy {
730 protected:
731  ScheduleDAGInstrs *DAG;
732 
733  const SIInstrInfo *TII;
734 
735 public:
736  // Add SchedGroups to \p Pipeline to implement this Strategy.
737  virtual void applyIGLPStrategy(
738  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
739  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
740 
741  // Returns true if this strategy should be applied to a ScheduleDAG.
742  virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
743 
744  IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
745  : DAG(DAG), TII(TII) {}
746 
747  virtual ~IGLPStrategy() = default;
748 };
749 
750 class MFMASmallGemmOpt final : public IGLPStrategy {
751 public:
752  void applyIGLPStrategy(
753  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
754  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
755 
756  bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
757 
758  MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
759  : IGLPStrategy(DAG, TII) {}
760 };
761 
762 void MFMASmallGemmOpt::applyIGLPStrategy(
763  DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
764  DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
765  // Count the number of MFMA instructions.
766  unsigned MFMACount = 0;
767  for (const MachineInstr &I : *DAG)
768  if (TII->isMFMAorWMMA(I))
769  ++MFMACount;
770 
771  const unsigned PipelineSyncID = 0;
772  SchedGroup *SG = nullptr;
773  for (unsigned I = 0; I < MFMACount * 3; ++I) {
774  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
775  SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
776  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
777 
778  SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
779  SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
780  SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
781  }
782 }
783 
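// For reference, the pipeline requested above is the one a kernel would get by
// emitting the following SCHED_GROUP_BARRIER pair MFMACount * 3 times (mask
// values per the SchedGroupMask enum above; the source-level builtin calls are
// an illustrative assumption, not code from this file):
//
//   __builtin_amdgcn_sched_group_barrier(0x0080 /*DS*/,   2, /*SyncID*/ 0);
//   __builtin_amdgcn_sched_group_barrier(0x0008 /*MFMA*/, 1, /*SyncID*/ 0);
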
784 static std::unique_ptr<IGLPStrategy>
785 createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
786  const SIInstrInfo *TII) {
787  switch (ID) {
788  case MFMASmallGemmOptID:
789  return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
790  }
791 
792  llvm_unreachable("Unknown IGLPStrategyID");
793 }
794 
795 class IGroupLPDAGMutation : public ScheduleDAGMutation {
796 private:
797  const SIInstrInfo *TII;
798 
799  ScheduleDAGMI *DAG;
800 
801  // Organize lists of SchedGroups by their SyncID. SchedGroups /
802  // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
803  // between them.
804  DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
805 
806  // Used to track instructions that can be mapped to multiple sched groups
807  DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
808 
809  // Add DAG edges that enforce SCHED_BARRIER ordering.
810  void addSchedBarrierEdges(SUnit &SU);
811 
812  // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
813  // not be reordered across the SCHED_BARRIER. This is used for the base
814  // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
815  // SCHED_BARRIER will always block all instructions that can be classified
816  // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
817  // and may only synchronize with some SchedGroups. Returns the inverse of
818  // Mask. SCHED_BARRIER's mask describes which instruction types should be
819  // allowed to be scheduled across it. Invert the mask to get the
820  // SchedGroupMask of instructions that should be barred.
821  SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
822 
823  // Create SchedGroups for a SCHED_GROUP_BARRIER.
824  void initSchedGroupBarrierPipelineStage(
825  std::vector<SUnit>::reverse_iterator RIter);
826 
827  void initIGLPOpt(SUnit &SU);
828 
829 public:
830  void apply(ScheduleDAGInstrs *DAGInstrs) override;
831 
832  IGroupLPDAGMutation() = default;
833 };
834 
835 unsigned SchedGroup::NumSchedGroups = 0;
836 
837 bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
838  if (A != B && DAG->canAddEdge(B, A)) {
839  DAG->addEdge(B, SDep(A, SDep::Artificial));
840  return true;
841  }
842  return false;
843 }
844 
845 bool SchedGroup::canAddMI(const MachineInstr &MI) const {
846  bool Result = false;
847  if (MI.isMetaInstruction())
848  Result = false;
849 
850  else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
851  (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
852  Result = true;
853 
854  else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
855  TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
856  Result = true;
857 
858  else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
859  TII->isSALU(MI))
860  Result = true;
861 
862  else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
863  TII->isMFMAorWMMA(MI))
864  Result = true;
865 
866  else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
867  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
868  Result = true;
869 
870  else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
871  MI.mayLoad() &&
872  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
873  Result = true;
874 
875  else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
876  MI.mayStore() &&
877  (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
878  Result = true;
879 
880  else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
881  TII->isDS(MI))
882  Result = true;
883 
884  else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
885  MI.mayLoad() && TII->isDS(MI))
886  Result = true;
887 
888  else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
889  MI.mayStore() && TII->isDS(MI))
890  Result = true;
891 
892  LLVM_DEBUG(
893  dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
894  << (Result ? " could classify " : " unable to classify ") << MI);
895 
896  return Result;
897 }
898 
899 int SchedGroup::link(SUnit &SU, bool MakePred,
900  std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
901  int MissedEdges = 0;
902  for (auto *A : Collection) {
903  SUnit *B = &SU;
904  if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
905  continue;
906  if (MakePred)
907  std::swap(A, B);
908 
909  if (DAG->IsReachable(B, A))
910  continue;
911  // tryAddEdge returns false if there is a dependency that makes adding
912  // the A->B edge impossible, otherwise it returns true;
913  bool Added = tryAddEdge(A, B);
914  if (Added)
915  AddedEdges.push_back(std::pair(A, B));
916  else
917  ++MissedEdges;
918  }
919 
920  return MissedEdges;
921 }
922 
923 void SchedGroup::link(SUnit &SU, bool MakePred) {
924  for (auto *A : Collection) {
925  SUnit *B = &SU;
926  if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
927  continue;
928  if (MakePred)
929  std::swap(A, B);
930 
931  tryAddEdge(A, B);
932  }
933 }
934 
935 void SchedGroup::link(SUnit &SU,
936  function_ref<bool(const SUnit *A, const SUnit *B)> P) {
937  for (auto *A : Collection) {
938  SUnit *B = &SU;
939  if (P(A, B))
940  std::swap(A, B);
941 
942  tryAddEdge(A, B);
943  }
944 }
945 
946 void SchedGroup::link(SchedGroup &OtherGroup) {
947  for (auto *B : OtherGroup.Collection)
948  link(*B);
949 }
950 
951 bool SchedGroup::canAddSU(SUnit &SU) const {
952  MachineInstr &MI = *SU.getInstr();
953  if (MI.getOpcode() != TargetOpcode::BUNDLE)
954  return canAddMI(MI);
955 
956  // Special case for bundled MIs.
957  const MachineBasicBlock *MBB = MI.getParent();
958  MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
959  while (E != MBB->end() && E->isBundledWithPred())
960  ++E;
961 
962  // Return true if all of the bundled MIs can be added to this group.
963  return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
964 }
965 
966 void SchedGroup::initSchedGroup() {
967  for (auto &SU : DAG->SUnits) {
968  if (isFull())
969  break;
970 
971  if (canAddSU(SU))
972  add(SU);
973  }
974 }
975 
976 void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
977  SUnitsToCandidateSGsMap &SyncedInstrs) {
978  SUnit &InitSU = *RIter;
979  for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
980  auto &SU = *RIter;
981  if (isFull())
982  break;
983 
984  if (canAddSU(SU))
985  SyncedInstrs[&SU].push_back(SGID);
986  }
987 
988  add(InitSU);
989  assert(MaxSize);
990  (*MaxSize)++;
991 }
992 
993 void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
994  auto I = DAG->SUnits.rbegin();
995  auto E = DAG->SUnits.rend();
996  for (; I != E; ++I) {
997  auto &SU = *I;
998  if (isFull())
999  break;
1000 
1001  if (canAddSU(SU))
1002  SyncedInstrs[&SU].push_back(SGID);
1003  }
1004 }
1005 
1006 void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1007  const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1008  if (!TSchedModel || DAGInstrs->SUnits.empty())
1009  return;
1010 
1011  LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
1012  const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1013  TII = ST.getInstrInfo();
1014  DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1015  SyncedSchedGroups.clear();
1016  SyncedInstrs.clear();
1017  bool foundSB = false;
1018  bool foundIGLP = false;
1019  for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
1020  unsigned Opc = R->getInstr()->getOpcode();
1021  // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1022  if (Opc == AMDGPU::SCHED_BARRIER) {
1023  addSchedBarrierEdges(*R);
1024  foundSB = true;
1025  } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1026  initSchedGroupBarrierPipelineStage(R);
1027  foundSB = true;
1028  } else if (Opc == AMDGPU::IGLP_OPT) {
1029  resetEdges(*R, DAG);
1030  if (!foundSB && !foundIGLP)
1031  initIGLPOpt(*R);
1032  foundIGLP = true;
1033  }
1034  }
1035 
1036  if (foundSB || foundIGLP) {
1037  PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
1038  // PipelineSolver performs the mutation by adding the edges it
1039  // determined as the best
1040  PS.solve();
1041  }
1042 }
1043 
1044 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1045  MachineInstr &MI = *SchedBarrier.getInstr();
1046  assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1047  // Remove all existing edges from the SCHED_BARRIER that were added due to the
1048  // instruction having side effects.
1049  resetEdges(SchedBarrier, DAG);
1050  auto InvertedMask =
1051  invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1052  SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1053  SG.initSchedGroup();
1054  // Preserve original instruction ordering relative to the SCHED_BARRIER.
1055  SG.link(
1056  SchedBarrier,
1057  (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1058  const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1059 }
1060 
1061 SchedGroupMask
1062 IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1063  // Invert mask and erase bits for types of instructions that are implied to be
1064  // allowed past the SCHED_BARRIER.
1065  SchedGroupMask InvertedMask = ~Mask;
1066 
1067  // ALU implies VALU, SALU, MFMA.
1068  if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1069  InvertedMask &=
1070  ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1071  // VALU, SALU, MFMA implies ALU.
1072  else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1073  (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1074  (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1075  InvertedMask &= ~SchedGroupMask::ALU;
1076 
1077  // VMEM implies VMEM_READ, VMEM_WRITE.
1078  if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1079  InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1080  // VMEM_READ, VMEM_WRITE implies VMEM.
1081  else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1082  (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1083  InvertedMask &= ~SchedGroupMask::VMEM;
1084 
1085  // DS implies DS_READ, DS_WRITE.
1086  if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1087  InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1088  // DS_READ, DS_WRITE implies DS.
1089  else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1090  (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1091  InvertedMask &= ~SchedGroupMask::DS;
1092 
1093  return InvertedMask;
1094 }
1095 
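// Worked example, using the enum values defined above: a SCHED_BARRIER with
// mask 0x0002 lets only VALU cross it. ~Mask sets every other bit, and because
// the VALU bit is then clear, the "VALU, SALU, MFMA implies ALU" rule also
// clears ALU, so the returned mask is
//   SchedGroupMask::ALL & ~SchedGroupMask::VALU & ~SchedGroupMask::ALU
// i.e. every remaining instruction class is barred from moving across the
// barrier.
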
1096 void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1097  std::vector<SUnit>::reverse_iterator RIter) {
1098  // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1099  // to the instruction having side effects.
1100  resetEdges(*RIter, DAG);
1101  MachineInstr &SGB = *RIter->getInstr();
1102  assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1103  int32_t SGMask = SGB.getOperand(0).getImm();
1104  int32_t Size = SGB.getOperand(1).getImm();
1105  int32_t SyncID = SGB.getOperand(2).getImm();
1106 
1107  auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1108  Size, SyncID, DAG, TII);
1109 
1110  SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1111 }
1112 
1113 void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1114  IGLPStrategyID StrategyID =
1115  (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1116  auto S = createIGLPStrategy(StrategyID, DAG, TII);
1117  if (S->shouldApplyStrategy(DAG))
1118  S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1119 }
1120 
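// Source-level trigger (an assumption for exposition; the builtin is provided
// by clang, not by this file): a kernel opts in with
//   __builtin_amdgcn_iglp_opt(/*StrategyID=*/0);
// which lowers to an IGLP_OPT instruction whose operand 0 selects
// MFMASmallGemmOptID above.
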
1121 } // namespace
1122 
1123 namespace llvm {
1124 
1125 std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1126  return std::make_unique<IGroupLPDAGMutation>();
1127 }
1128 
1129 } // end namespace llvm