LLVM  10.0.0svn
SIMachineScheduler.cpp
Go to the documentation of this file.
1 //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Machine Scheduler interface
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIMachineScheduler.h"
15 #include "AMDGPU.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
29 #include "llvm/Support/Debug.h"
32 #include <algorithm>
33 #include <cassert>
34 #include <map>
35 #include <set>
36 #include <utility>
37 #include <vector>
38 
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "machine-scheduler"
42 
43 // This scheduler implements a different scheduling algorithm than
44 // GenericScheduler.
45 //
46 // There are several specific architecture behaviours that can't be modelled
47 // for GenericScheduler:
48 // . When accessing the result of an SGPR load instruction, you have to wait
49 // for all the SGPR load instructions before your current instruction to
50 // have finished.
51 // . When accessing the result of a VGPR load instruction, you have to wait
52 // for all the VGPR load instructions previous to the VGPR load instruction
53 // you are interested in to finish.
54 // . The lower the register pressure, the better load latencies are hidden
55 //
56 // Moreover some specificities (like the fact that a lot of instructions in the
57 // shader have few dependencies) make the generic scheduler have some
58 // unpredictable
58 // behaviours. For example when register pressure becomes high, it can either
59 // manage to prevent register pressure from going too high, or it can
60 // increase register pressure even more than if it hadn't taken register
61 // pressure into account.
62 //
63 // Also some other bad behaviours are generated, like loading at the beginning
64 // of the shader a constant in VGPR you won't need until the end of the shader.
65 //
66 // The scheduling problem for SI can distinguish three main parts:
67 // . Hiding high latencies (texture sampling, etc)
68 // . Hiding low latencies (SGPR constant loading, etc)
69 // . Keeping register usage low for better latency hiding and general
70 // performance
71 //
72 // Some other things can also affect performance, but are hard to predict
73 // (cache usage, the fact the HW can issue several instructions from different
74 // wavefronts if different types, etc)
75 //
76 // This scheduler tries to solve the scheduling problem by dividing it into
77 // simpler sub-problems. It divides the instructions into blocks, schedules
78 // locally inside the blocks where it takes care of low latencies, and then
79 // chooses the order of the blocks by taking care of high latencies.
80 // Dividing the instructions into blocks helps control keeping register
81 // usage low.
82 //
83 // First the instructions are put into blocks.
84 // We want the blocks help control register usage and hide high latencies
85 // later. To help control register usage, we typically want all local
86 // computations, when for example you create a result that can be consumed
87 // right away, to be contained in a block. Block inputs and outputs would
88 // typically be important results that are needed in several locations of
89 // the shader. Since we do want blocks to help hide high latencies, we want
90 // the instructions inside the block to have a minimal set of dependencies
91 // on high latencies. It will make it easy to pick blocks to hide specific
92 // high latencies.
93 // The block creation algorithm is divided into several steps, and several
94 // variants can be tried during the scheduling process.
95 //
96 // Second the order of the instructions inside the blocks is chosen.
97 // At that step we do take into account only register usage and hiding
98 // low latency instructions
99 //
100 // Third the block order is chosen, there we try to hide high latencies
101 // and keep register usage low.
102 //
103 // After the third step, a pass is done to improve the hiding of low
104 // latencies.
105 //
106 // Actually when talking about 'low latency' or 'high latency' it includes
107 // both the latency to get the cache (or global mem) data go to the register,
108 // and the bandwidth limitations.
109 // Increasing the number of active wavefronts helps hide the former, but it
110 // doesn't solve the latter, thus why even if wavefront count is high, we have
111 // to try have as many instructions hiding high latencies as possible.
112 // The OpenCL doc says for example latency of 400 cycles for a global mem access,
113 // which is hidden by 10 instructions if the wavefront count is 10.
114 
115 // Some figures taken from AMD docs:
116 // Both texture and constant L1 caches are 4-way associative with 64 bytes
117 // lines.
118 // Constant cache is shared with 4 CUs.
119 // For texture sampling, the address generation unit receives 4 texture
120 // addresses per cycle, thus we could expect texture sampling latency to be
121 // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
122 // instructions in a wavefront group are executed every 4 cycles),
123 // or 16 instructions if the other wavefronts associated to the 3 other VALUs
124 // of the CU do texture sampling too. (Don't take these figures too seriously,
125 // as I'm not 100% sure of the computation)
126 // Data exports should get similar latency.
127 // For constant loading, the cache is shared with 4 CUs.
128 // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
129 // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
130 // It means a simple s_buffer_load should take one instruction to hide, as
131 // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
132 // cache line.
133 //
134 // As of today the driver doesn't preload the constants in cache, thus the
135 // first loads get extra latency. The doc says global memory access can be
136 // 300-600 cycles. We do not specially take that into account when scheduling
137 // As we expect the driver to be able to preload the constants soon.
138 
139 // common code //
140 
141 #ifndef NDEBUG
142 
143 static const char *getReasonStr(SIScheduleCandReason Reason) {
144  switch (Reason) {
145  case NoCand: return "NOCAND";
146  case RegUsage: return "REGUSAGE";
147  case Latency: return "LATENCY";
148  case Successor: return "SUCCESSOR";
149  case Depth: return "DEPTH";
150  case NodeOrder: return "ORDER";
151  }
152  llvm_unreachable("Unknown reason!");
153 }
154 
155 #endif
156 
157 namespace llvm {
158 namespace SISched {
159 static bool tryLess(int TryVal, int CandVal,
160  SISchedulerCandidate &TryCand,
161  SISchedulerCandidate &Cand,
162  SIScheduleCandReason Reason) {
163  if (TryVal < CandVal) {
164  TryCand.Reason = Reason;
165  return true;
166  }
167  if (TryVal > CandVal) {
168  if (Cand.Reason > Reason)
169  Cand.Reason = Reason;
170  return true;
171  }
172  Cand.setRepeat(Reason);
173  return false;
174 }
175 
176 static bool tryGreater(int TryVal, int CandVal,
177  SISchedulerCandidate &TryCand,
178  SISchedulerCandidate &Cand,
179  SIScheduleCandReason Reason) {
180  if (TryVal > CandVal) {
181  TryCand.Reason = Reason;
182  return true;
183  }
184  if (TryVal < CandVal) {
185  if (Cand.Reason > Reason)
186  Cand.Reason = Reason;
187  return true;
188  }
189  Cand.setRepeat(Reason);
190  return false;
191 }
192 } // end namespace SISched
193 } // end namespace llvm
194 
195 // SIScheduleBlock //
196 
198  NodeNum2Index[SU->NodeNum] = SUnits.size();
199  SUnits.push_back(SU);
200 }
201 
202 #ifndef NDEBUG
203 void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
204 
205  dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
206  dbgs() << '\n';
207 }
208 #endif
209 
// Compare TryCand against the current best Cand for top-down scheduling
// inside a block. Criteria are applied in priority order; the first decisive
// comparison sets TryCand.Reason (caller then adopts TryCand via setBest).
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus above the arbitrary limit of 60 will encourage
  // use the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  // - (eventually some more low latency instructions)
  // - instructions that depend on the first low latency instructions.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Among low-latency candidates, prefer the smaller LowLatencyOffset
  // (presumably the load's memory offset — TODO confirm semantics).
  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Prefer the candidate that keeps predicted VGPR pressure lower.
  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
261 
262 SUnit* SIScheduleBlock::pickNode() {
263  SISchedCandidate TopCand;
264 
265  for (SUnit* SU : TopReadySUs) {
266  SISchedCandidate TryCand;
267  std::vector<unsigned> pressure;
268  std::vector<unsigned> MaxPressure;
269  // Predict register usage after this instruction.
270  TryCand.SU = SU;
271  TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
272  TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
273  TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
274  TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
275  TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
276  TryCand.HasLowLatencyNonWaitedParent =
277  HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
278  tryCandidateTopDown(TopCand, TryCand);
279  if (TryCand.Reason != NoCand)
280  TopCand.setBest(TryCand);
281  }
282 
283  return TopCand.SU;
284 }
285 
286 
287 // Schedule something valid.
289  TopReadySUs.clear();
290  if (Scheduled)
291  undoSchedule();
292 
293  for (SUnit* SU : SUnits) {
294  if (!SU->NumPredsLeft)
295  TopReadySUs.push_back(SU);
296  }
297 
298  while (!TopReadySUs.empty()) {
299  SUnit *SU = TopReadySUs[0];
300  ScheduledSUnits.push_back(SU);
301  nodeScheduled(SU);
302  }
303 
304  Scheduled = true;
305 }
306 
307 // Returns if the register was set between first and last.
308 static bool isDefBetween(unsigned Reg,
309  SlotIndex First, SlotIndex Last,
310  const MachineRegisterInfo *MRI,
311  const LiveIntervals *LIS) {
313  UI = MRI->def_instr_begin(Reg),
314  UE = MRI->def_instr_end(); UI != UE; ++UI) {
315  const MachineInstr* MI = &*UI;
316  if (MI->isDebugValue())
317  continue;
318  SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
319  if (InstSlot >= First && InstSlot <= Last)
320  return true;
321  }
322  return false;
323 }
324 
// Compute the block's LiveInRegs/LiveOutRegs and live-in/out pressure by
// replaying the scheduled instructions through a fresh RegPressureTracker,
// then prime TopRPTracker for the real top-down scheduling pass.
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SUs. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not track physical registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (Register::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contain all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of findDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (Register::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  // NOTE(review): TopPressure appears to be the member tracker state paired
  // with TopRPTracker — declared outside this view; confirm in the header.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
397 
399  MachineBasicBlock::iterator EndBlock) {
400  if (!Scheduled)
401  fastSchedule();
402 
403  // PreScheduling phase to set LiveIn and LiveOut.
404  initRegPressure(BeginBlock, EndBlock);
405  undoSchedule();
406 
407  // Schedule for real now.
408 
409  TopReadySUs.clear();
410 
411  for (SUnit* SU : SUnits) {
412  if (!SU->NumPredsLeft)
413  TopReadySUs.push_back(SU);
414  }
415 
416  while (!TopReadySUs.empty()) {
417  SUnit *SU = pickNode();
418  ScheduledSUnits.push_back(SU);
419  TopRPTracker.setPos(SU->getInstr());
420  TopRPTracker.advance();
421  nodeScheduled(SU);
422  }
423 
424  // TODO: compute InternalAdditionnalPressure.
425  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
426 
427  // Check everything is right.
428 #ifndef NDEBUG
429  assert(SUnits.size() == ScheduledSUnits.size() &&
430  TopReadySUs.empty());
431  for (SUnit* SU : SUnits) {
432  assert(SU->isScheduled &&
433  SU->NumPredsLeft == 0);
434  }
435 #endif
436 
437  Scheduled = true;
438 }
439 
440 void SIScheduleBlock::undoSchedule() {
441  for (SUnit* SU : SUnits) {
442  SU->isScheduled = false;
443  for (SDep& Succ : SU->Succs) {
444  if (BC->isSUInBlock(Succ.getSUnit(), ID))
445  undoReleaseSucc(SU, &Succ);
446  }
447  }
448  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
449  ScheduledSUnits.clear();
450  Scheduled = false;
451 }
452 
453 void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
454  SUnit *SuccSU = SuccEdge->getSUnit();
455 
456  if (SuccEdge->isWeak()) {
457  ++SuccSU->WeakPredsLeft;
458  return;
459  }
460  ++SuccSU->NumPredsLeft;
461 }
462 
463 void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
464  SUnit *SuccSU = SuccEdge->getSUnit();
465 
466  if (SuccEdge->isWeak()) {
467  --SuccSU->WeakPredsLeft;
468  return;
469  }
470 #ifndef NDEBUG
471  if (SuccSU->NumPredsLeft == 0) {
472  dbgs() << "*** Scheduling failed! ***\n";
473  DAG->dumpNode(*SuccSU);
474  dbgs() << " has been released too many times!\n";
475  llvm_unreachable(nullptr);
476  }
477 #endif
478 
479  --SuccSU->NumPredsLeft;
480 }
481 
482 /// Release Successors of the SU that are in the block or not.
483 void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
484  for (SDep& Succ : SU->Succs) {
485  SUnit *SuccSU = Succ.getSUnit();
486 
487  if (SuccSU->NodeNum >= DAG->SUnits.size())
488  continue;
489 
490  if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
491  continue;
492 
493  releaseSucc(SU, &Succ);
494  if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
495  TopReadySUs.push_back(SuccSU);
496  }
497 }
498 
499 void SIScheduleBlock::nodeScheduled(SUnit *SU) {
500  // Is in TopReadySUs
501  assert (!SU->NumPredsLeft);
502  std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
503  if (I == TopReadySUs.end()) {
504  dbgs() << "Data Structure Bug in SI Scheduler\n";
505  llvm_unreachable(nullptr);
506  }
507  TopReadySUs.erase(I);
508 
509  releaseSuccessors(SU, true);
510  // Scheduling this node will trigger a wait,
511  // thus propagate to other instructions that they do not need to wait either.
512  if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
513  HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
514 
515  if (DAG->IsLowLatencySU[SU->NodeNum]) {
516  for (SDep& Succ : SU->Succs) {
517  std::map<unsigned, unsigned>::iterator I =
518  NodeNum2Index.find(Succ.getSUnit()->NodeNum);
519  if (I != NodeNum2Index.end())
520  HasLowLatencyNonWaitedParent[I->second] = 1;
521  }
522  }
523  SU->isScheduled = true;
524 }
525 
527  // We remove links from outside blocks to enable scheduling inside the block.
528  for (SUnit* SU : SUnits) {
529  releaseSuccessors(SU, false);
530  if (DAG->IsHighLatencySU[SU->NodeNum])
531  HighLatencyBlock = true;
532  }
533  HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
534 }
535 
536 // we maintain ascending order of IDs
538  unsigned PredID = Pred->getID();
539 
540  // Check if not already predecessor.
541  for (SIScheduleBlock* P : Preds) {
542  if (PredID == P->getID())
543  return;
544  }
545  Preds.push_back(Pred);
546 
547  assert(none_of(Succs,
548  [=](std::pair<SIScheduleBlock*,
550  return PredID == S.first->getID();
551  }) &&
552  "Loop in the Block Graph!");
553 }
554 
557  unsigned SuccID = Succ->getID();
558 
559  // Check if not already predecessor.
560  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
561  if (SuccID == S.first->getID()) {
562  if (S.second == SIScheduleBlockLinkKind::NoData &&
564  S.second = Kind;
565  return;
566  }
567  }
568  if (Succ->isHighLatencyBlock())
569  ++NumHighLatencySuccessors;
570  Succs.push_back(std::make_pair(Succ, Kind));
571 
572  assert(none_of(Preds,
573  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
574  "Loop in the Block Graph!");
575 }
576 
577 #ifndef NDEBUG
579  dbgs() << "Block (" << ID << ")\n";
580  if (!full)
581  return;
582 
583  dbgs() << "\nContains High Latency Instruction: "
584  << HighLatencyBlock << '\n';
585  dbgs() << "\nDepends On:\n";
586  for (SIScheduleBlock* P : Preds) {
587  P->printDebug(false);
588  }
589 
590  dbgs() << "\nSuccessors:\n";
591  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
592  if (S.second == SIScheduleBlockLinkKind::Data)
593  dbgs() << "(Data Dep) ";
594  S.first->printDebug(false);
595  }
596 
597  if (Scheduled) {
598  dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
599  << LiveInPressure[DAG->getVGPRSetID()] << '\n';
600  dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
601  << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
602  dbgs() << "LiveIns:\n";
603  for (unsigned Reg : LiveInRegs)
604  dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
605 
606  dbgs() << "\nLiveOuts:\n";
607  for (unsigned Reg : LiveOutRegs)
608  dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
609  }
610 
611  dbgs() << "\nInstructions:\n";
612  if (!Scheduled) {
613  for (const SUnit* SU : SUnits)
614  DAG->dumpNode(*SU);
615  } else {
616  for (const SUnit* SU : SUnits)
617  DAG->dumpNode(*SU);
618  }
619 
620  dbgs() << "///////////////////////\n";
621 }
622 #endif
623 
624 // SIScheduleBlockCreator //
625 
627 DAG(DAG) {
628 }
629 
631 
634  std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
635  Blocks.find(BlockVariant);
636  if (B == Blocks.end()) {
637  SIScheduleBlocks Res;
638  createBlocksForVariant(BlockVariant);
639  topologicalSort();
640  scheduleInsideBlocks();
641  fillStats();
642  Res.Blocks = CurrentBlocks;
643  Res.TopDownIndex2Block = TopDownIndex2Block;
644  Res.TopDownBlock2Index = TopDownBlock2Index;
645  Blocks[BlockVariant] = Res;
646  return Res;
647  } else {
648  return B->second;
649  }
650 }
651 
653  if (SU->NodeNum >= DAG->SUnits.size())
654  return false;
655  return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
656 }
657 
658 void SIScheduleBlockCreator::colorHighLatenciesAlone() {
659  unsigned DAGSize = DAG->SUnits.size();
660 
661  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
662  SUnit *SU = &DAG->SUnits[i];
663  if (DAG->IsHighLatencySU[SU->NodeNum]) {
664  CurrentColoring[SU->NodeNum] = NextReservedID++;
665  }
666  }
667 }
668 
669 static bool
670 hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
671  for (const auto &PredDep : SU.Preds) {
672  if (PredDep.getSUnit() == &FromSU &&
673  PredDep.getKind() == llvm::SDep::Data)
674  return true;
675  }
676  return false;
677 }
678 
679 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
680  unsigned DAGSize = DAG->SUnits.size();
681  unsigned NumHighLatencies = 0;
682  unsigned GroupSize;
683  int Color = NextReservedID;
684  unsigned Count = 0;
685  std::set<unsigned> FormingGroup;
686 
687  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
688  SUnit *SU = &DAG->SUnits[i];
689  if (DAG->IsHighLatencySU[SU->NodeNum])
690  ++NumHighLatencies;
691  }
692 
693  if (NumHighLatencies == 0)
694  return;
695 
696  if (NumHighLatencies <= 6)
697  GroupSize = 2;
698  else if (NumHighLatencies <= 12)
699  GroupSize = 3;
700  else
701  GroupSize = 4;
702 
703  for (unsigned SUNum : DAG->TopDownIndex2SU) {
704  const SUnit &SU = DAG->SUnits[SUNum];
705  if (DAG->IsHighLatencySU[SU.NodeNum]) {
706  unsigned CompatibleGroup = true;
707  int ProposedColor = Color;
708  std::vector<int> AdditionalElements;
709 
710  // We don't want to put in the same block
711  // two high latency instructions that depend
712  // on each other.
713  // One way would be to check canAddEdge
714  // in both directions, but that currently is not
715  // enough because there the high latency order is
716  // enforced (via links).
717  // Instead, look at the dependencies between the
718  // high latency instructions and deduce if it is
719  // a data dependency or not.
720  for (unsigned j : FormingGroup) {
721  bool HasSubGraph;
722  std::vector<int> SubGraph;
723  // By construction (topological order), if SU and
724  // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
725  // in the parent graph of SU.
726 #ifndef NDEBUG
727  SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
728  HasSubGraph);
729  assert(!HasSubGraph);
730 #endif
731  SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
732  HasSubGraph);
733  if (!HasSubGraph)
734  continue; // No dependencies between each other
735  else if (SubGraph.size() > 5) {
736  // Too many elements would be required to be added to the block.
737  CompatibleGroup = false;
738  break;
739  }
740  else {
741  // Check the type of dependency
742  for (unsigned k : SubGraph) {
743  // If in the path to join the two instructions,
744  // there is another high latency instruction,
745  // or instructions colored for another block
746  // abort the merge.
747  if (DAG->IsHighLatencySU[k] ||
748  (CurrentColoring[k] != ProposedColor &&
749  CurrentColoring[k] != 0)) {
750  CompatibleGroup = false;
751  break;
752  }
753  // If one of the SU in the subgraph depends on the result of SU j,
754  // there'll be a data dependency.
755  if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
756  CompatibleGroup = false;
757  break;
758  }
759  }
760  if (!CompatibleGroup)
761  break;
762  // Same check for the SU
763  if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
764  CompatibleGroup = false;
765  break;
766  }
767  // Add all the required instructions to the block
768  // These cannot live in another block (because they
769  // depend (order dependency) on one of the
770  // instruction in the block, and are required for the
771  // high latency instruction we add.
772  AdditionalElements.insert(AdditionalElements.end(),
773  SubGraph.begin(), SubGraph.end());
774  }
775  }
776  if (CompatibleGroup) {
777  FormingGroup.insert(SU.NodeNum);
778  for (unsigned j : AdditionalElements)
779  CurrentColoring[j] = ProposedColor;
780  CurrentColoring[SU.NodeNum] = ProposedColor;
781  ++Count;
782  }
783  // Found one incompatible instruction,
784  // or has filled a big enough group.
785  // -> start a new one.
786  if (!CompatibleGroup) {
787  FormingGroup.clear();
788  Color = ++NextReservedID;
789  ProposedColor = Color;
790  FormingGroup.insert(SU.NodeNum);
791  CurrentColoring[SU.NodeNum] = ProposedColor;
792  Count = 0;
793  } else if (Count == GroupSize) {
794  FormingGroup.clear();
795  Color = ++NextReservedID;
796  ProposedColor = Color;
797  Count = 0;
798  }
799  }
800  }
801 }
802 
// Two symmetric coloring passes: give each SU a color determined by the set
// of reserved (high-latency) colors it transitively depends on, first
// walking top-down (predecessors), then bottom-up (successors). Each unique
// combination of parent/child colors gets a fresh non-reserved ID.
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  // Default every SU to color 0 (no reserved dependency).
  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given (reserved SUs keep their own color).
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all (strong, in-DAG)
    // predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color as parents (a single reserved color — IDs above DAGSize
    // are the reserved ones).
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // Reuse the ID already assigned to this exact color combination,
      // or mint a new one.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all (strong, in-DAG)
    // successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color as children.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
896 
897 void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
898  unsigned DAGSize = DAG->SUnits.size();
899  std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
900 
901  // Every combination of colors given by the top down
902  // and bottom up Reserved node dependency
903 
904  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
905  SUnit *SU = &DAG->SUnits[i];
906  std::pair<unsigned, unsigned> SUColors;
907 
908  // High latency instructions: already given.
909  if (CurrentColoring[SU->NodeNum])
910  continue;
911 
912  SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
913  SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
914 
915  std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
916  ColorCombinations.find(SUColors);
917  if (Pos != ColorCombinations.end()) {
918  CurrentColoring[SU->NodeNum] = Pos->second;
919  } else {
920  CurrentColoring[SU->NodeNum] = NextNonReservedID;
921  ColorCombinations[SUColors] = NextNonReservedID++;
922  }
923  }
924 }
925 
// Recolor SUs that have no reserved dependencies (the "ends"): walking
// bottom-up, an SU whose successors all agree on a single block is merged
// into that block, otherwise it gets a fresh color. Changes are staged in
// PendingColoring and committed at the end.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Skip SUs with reserved colors (IDs <= DAGSize are reserved).
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only recolor SUs with no reserved dependency in either direction.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    // Gather the committed colors of successors that do have reserved
    // dependencies, and the pending colors of all successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}
973 
974 
975 void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
976  unsigned DAGSize = DAG->SUnits.size();
977  unsigned PreviousColor;
978  std::set<unsigned> SeenColors;
979 
980  if (DAGSize <= 1)
981  return;
982 
983  PreviousColor = CurrentColoring[0];
984 
985  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
986  SUnit *SU = &DAG->SUnits[i];
987  unsigned CurrentColor = CurrentColoring[i];
988  unsigned PreviousColorSave = PreviousColor;
989  assert(i == SU->NodeNum);
990 
991  if (CurrentColor != PreviousColor)
992  SeenColors.insert(PreviousColor);
993  PreviousColor = CurrentColor;
994 
995  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
996  continue;
997 
998  if (SeenColors.find(CurrentColor) == SeenColors.end())
999  continue;
1000 
1001  if (PreviousColorSave != CurrentColor)
1002  CurrentColoring[i] = NextNonReservedID++;
1003  else
1004  CurrentColoring[i] = CurrentColoring[i-1];
1005  }
1006 }
1007 
1008 void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
1009  unsigned DAGSize = DAG->SUnits.size();
1010 
1011  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1012  SUnit *SU = &DAG->SUnits[SUNum];
1013  std::set<unsigned> SUColors;
1014 
1015  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1016  continue;
1017 
1018  // No predecessor: Vgpr constant loading.
1019  // Low latency instructions usually have a predecessor (the address)
1020  if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
1021  continue;
1022 
1023  for (SDep& SuccDep : SU->Succs) {
1024  SUnit *Succ = SuccDep.getSUnit();
1025  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1026  continue;
1027  SUColors.insert(CurrentColoring[Succ->NodeNum]);
1028  }
1029  if (SUColors.size() == 1)
1030  CurrentColoring[SU->NodeNum] = *SUColors.begin();
1031  }
1032 }
1033 
1034 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1035  unsigned DAGSize = DAG->SUnits.size();
1036 
1037  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1038  SUnit *SU = &DAG->SUnits[SUNum];
1039  std::set<unsigned> SUColors;
1040 
1041  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1042  continue;
1043 
1044  for (SDep& SuccDep : SU->Succs) {
1045  SUnit *Succ = SuccDep.getSUnit();
1046  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1047  continue;
1048  SUColors.insert(CurrentColoring[Succ->NodeNum]);
1049  }
1050  if (SUColors.size() == 1)
1051  CurrentColoring[SU->NodeNum] = *SUColors.begin();
1052  }
1053 }
1054 
1055 void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1056  unsigned DAGSize = DAG->SUnits.size();
1057 
1058  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1059  SUnit *SU = &DAG->SUnits[SUNum];
1060  std::set<unsigned> SUColors;
1061 
1062  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1063  continue;
1064 
1065  for (SDep& SuccDep : SU->Succs) {
1066  SUnit *Succ = SuccDep.getSUnit();
1067  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1068  continue;
1069  SUColors.insert(CurrentColoring[Succ->NodeNum]);
1070  }
1071  if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
1072  CurrentColoring[SU->NodeNum] = *SUColors.begin();
1073  }
1074 }
1075 
1076 void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1077  unsigned DAGSize = DAG->SUnits.size();
1078  std::map<unsigned, unsigned> ColorCount;
1079 
1080  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1081  SUnit *SU = &DAG->SUnits[SUNum];
1082  unsigned color = CurrentColoring[SU->NodeNum];
1083  ++ColorCount[color];
1084  }
1085 
1086  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1087  SUnit *SU = &DAG->SUnits[SUNum];
1088  unsigned color = CurrentColoring[SU->NodeNum];
1089  std::set<unsigned> SUColors;
1090 
1091  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1092  continue;
1093 
1094  if (ColorCount[color] > 1)
1095  continue;
1096 
1097  for (SDep& SuccDep : SU->Succs) {
1098  SUnit *Succ = SuccDep.getSUnit();
1099  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1100  continue;
1101  SUColors.insert(CurrentColoring[Succ->NodeNum]);
1102  }
1103  if (SUColors.size() == 1 && *SUColors.begin() != color) {
1104  --ColorCount[color];
1105  CurrentColoring[SU->NodeNum] = *SUColors.begin();
1106  ++ColorCount[*SUColors.begin()];
1107  }
1108  }
1109 }
1110 
1111 void SIScheduleBlockCreator::cutHugeBlocks() {
1112  // TODO
1113 }
1114 
1115 void SIScheduleBlockCreator::regroupNoUserInstructions() {
1116  unsigned DAGSize = DAG->SUnits.size();
1117  int GroupID = NextNonReservedID++;
1118 
1119  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1120  SUnit *SU = &DAG->SUnits[SUNum];
1121  bool hasSuccessor = false;
1122 
1123  if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1124  continue;
1125 
1126  for (SDep& SuccDep : SU->Succs) {
1127  SUnit *Succ = SuccDep.getSUnit();
1128  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1129  continue;
1130  hasSuccessor = true;
1131  }
1132  if (!hasSuccessor)
1133  CurrentColoring[SU->NodeNum] = GroupID;
1134  }
1135 }
1136 
1137 void SIScheduleBlockCreator::colorExports() {
1138  unsigned ExportColor = NextNonReservedID++;
1139  SmallVector<unsigned, 8> ExpGroup;
1140 
1141  // Put all exports together in a block.
1142  // The block will naturally end up being scheduled last,
1143  // thus putting exports at the end of the schedule, which
1144  // is better for performance.
1145  // However we must ensure, for safety, the exports can be put
1146  // together in the same block without any other instruction.
1147  // This could happen, for example, when scheduling after regalloc
1148  // if reloading a spilled register from memory using the same
1149  // register than used in a previous export.
1150  // If that happens, do not regroup the exports.
1151  for (unsigned SUNum : DAG->TopDownIndex2SU) {
1152  const SUnit &SU = DAG->SUnits[SUNum];
1153  if (SIInstrInfo::isEXP(*SU.getInstr())) {
1154  // Check the EXP can be added to the group safely,
1155  // ie without needing any other instruction.
1156  // The EXP is allowed to depend on other EXP
1157  // (they will be in the same group).
1158  for (unsigned j : ExpGroup) {
1159  bool HasSubGraph;
1160  std::vector<int> SubGraph;
1161  // By construction (topological order), if SU and
1162  // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
1163  // in the parent graph of SU.
1164 #ifndef NDEBUG
1165  SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
1166  HasSubGraph);
1167  assert(!HasSubGraph);
1168 #endif
1169  SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
1170  HasSubGraph);
1171  if (!HasSubGraph)
1172  continue; // No dependencies between each other
1173 
1174  // SubGraph contains all the instructions required
1175  // between EXP SUnits[j] and EXP SU.
1176  for (unsigned k : SubGraph) {
1177  if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
1178  // Other instructions than EXP would be required in the group.
1179  // Abort the groupping.
1180  return;
1181  }
1182  }
1183 
1184  ExpGroup.push_back(SUNum);
1185  }
1186  }
1187 
1188  // The group can be formed. Give the color.
1189  for (unsigned j : ExpGroup)
1190  CurrentColoring[j] = ExportColor;
1191 }
1192 
// Driver of the coloring pipeline: resets the per-variant state, runs the
// coloring passes above, groups nodes of equal color into SIScheduleBlocks,
// wires block-level pred/succ edges, and finalizes each block.
// NOTE(review): this doxygen-derived listing drops link-bearing source
// lines; original lines 1210 and 1217 (presumably the BlockVariant
// conditionals guarding colorHighLatenciesGroups() and
// colorForceConsecutiveOrderInGroup()) are missing below — the dangling
// `else` shows this. Consult the real SIMachineScheduler.cpp before
// editing this function.
1193 void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
1194  unsigned DAGSize = DAG->SUnits.size();
1195  std::map<unsigned,unsigned> RealID;
1196 
1197  CurrentBlocks.clear();
1198  CurrentColoring.clear();
1199  CurrentColoring.resize(DAGSize, 0);
1200  Node2CurrentBlock.clear();
1201 
1202  // Restore links previous scheduling variant has overridden.
1203  DAG->restoreSULinksLeft();
1204 
1205  NextReservedID = 1;
1206  NextNonReservedID = DAGSize + 1;
1207 
1208  LLVM_DEBUG(dbgs() << "Coloring the graph\n");
1209 
1211  colorHighLatenciesGroups();
1212  else
1213  colorHighLatenciesAlone();
1214  colorComputeReservedDependencies();
1215  colorAccordingToReservedDependencies();
1216  colorEndsAccordingToDependencies();
1218  colorForceConsecutiveOrderInGroup();
1219  regroupNoUserInstructions();
1220  colorMergeConstantLoadsNextGroup();
1221  colorMergeIfPossibleNextGroupOnlyForReserved();
1222  colorExports();
1223 
1224  // Put SUs of same color into same block
1225  Node2CurrentBlock.resize(DAGSize, -1);
1226  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1227  SUnit *SU = &DAG->SUnits[i];
1228  unsigned Color = CurrentColoring[SU->NodeNum];
// First node of a color allocates the block; RealID maps color -> block ID.
1229  if (RealID.find(Color) == RealID.end()) {
1230  int ID = CurrentBlocks.size();
1231  BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
1232  CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
1233  RealID[Color] = ID;
1234  }
1235  CurrentBlocks[RealID[Color]]->addUnit(SU);
1236  Node2CurrentBlock[SU->NodeNum] = RealID[Color];
1237  }
1238 
1239  // Build dependencies between blocks.
1240  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1241  SUnit *SU = &DAG->SUnits[i];
1242  int SUID = Node2CurrentBlock[i];
1243  for (SDep& SuccDep : SU->Succs) {
1244  SUnit *Succ = SuccDep.getSUnit();
1245  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1246  continue;
// Only cross-block edges become block-level links; control-only deps are
// recorded as NoData.
1247  if (Node2CurrentBlock[Succ->NodeNum] != SUID)
1248  CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
1249  SuccDep.isCtrl() ? NoData : Data);
1250  }
1251  for (SDep& PredDep : SU->Preds) {
1252  SUnit *Pred = PredDep.getSUnit();
1253  if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
1254  continue;
1255  if (Node2CurrentBlock[Pred->NodeNum] != SUID)
1256  CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
1257  }
1258  }
1259 
1260  // Free root and leafs of all blocks to enable scheduling inside them.
1261  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1262  SIScheduleBlock *Block = CurrentBlocks[i];
1263  Block->finalizeUnits();
1264  }
1265  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
1266  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1267  SIScheduleBlock *Block = CurrentBlocks[i];
1268  Block->printDebug(true);
1269  });
1270 }
1271 
1272 // Two functions taken from Codegen/MachineScheduler.cpp
1273 
1274 /// Non-const version.
// NOTE(review): the function signature (original lines 1275-1277) was
// dropped by this listing — presumably
//   static MachineBasicBlock::iterator
//   nextIfDebug(MachineBasicBlock::iterator I,
//               MachineBasicBlock::const_iterator End)
// as in MachineScheduler.cpp; confirm against the real source.
// Advances I past any debug instructions, stopping at End or the first
// non-debug instruction.
1278  for (; I != End; ++I) {
1279  if (!I->isDebugInstr())
1280  break;
1281  }
1282  return I;
1283 }
1284 
1285 void SIScheduleBlockCreator::topologicalSort() {
1286  unsigned DAGSize = CurrentBlocks.size();
1287  std::vector<int> WorkList;
1288 
1289  LLVM_DEBUG(dbgs() << "Topological Sort\n");
1290 
1291  WorkList.reserve(DAGSize);
1292  TopDownIndex2Block.resize(DAGSize);
1293  TopDownBlock2Index.resize(DAGSize);
1294  BottomUpIndex2Block.resize(DAGSize);
1295 
1296  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1297  SIScheduleBlock *Block = CurrentBlocks[i];
1298  unsigned Degree = Block->getSuccs().size();
1299  TopDownBlock2Index[i] = Degree;
1300  if (Degree == 0) {
1301  WorkList.push_back(i);
1302  }
1303  }
1304 
1305  int Id = DAGSize;
1306  while (!WorkList.empty()) {
1307  int i = WorkList.back();
1308  SIScheduleBlock *Block = CurrentBlocks[i];
1309  WorkList.pop_back();
1310  TopDownBlock2Index[i] = --Id;
1311  TopDownIndex2Block[Id] = i;
1312  for (SIScheduleBlock* Pred : Block->getPreds()) {
1313  if (!--TopDownBlock2Index[Pred->getID()])
1314  WorkList.push_back(Pred->getID());
1315  }
1316  }
1317 
1318 #ifndef NDEBUG
1319  // Check correctness of the ordering.
1320  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1321  SIScheduleBlock *Block = CurrentBlocks[i];
1322  for (SIScheduleBlock* Pred : Block->getPreds()) {
1323  assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
1324  "Wrong Top Down topological sorting");
1325  }
1326  }
1327 #endif
1328 
1329  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
1330  TopDownIndex2Block.rend());
1331 }
1332 
// Three phases: (1) fast-schedule each block so a block maps to a
// contiguous instruction range, (2) run the real per-block scheduling on
// those ranges, (3) restore the original MI order.
// NOTE(review): this listing drops original line 1363 (presumably the
// initialization of the `Pos` iterator used below, e.g.
// `MachineBasicBlock::iterator Pos = MI;`) — confirm against the real
// source before editing.
1333 void SIScheduleBlockCreator::scheduleInsideBlocks() {
1334  unsigned DAGSize = CurrentBlocks.size();
1335 
1336  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
1337 
1338  // We do schedule a valid scheduling such that a Block corresponds
1339  // to a range of instructions.
1340  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
1341  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1342  SIScheduleBlock *Block = CurrentBlocks[i];
1343  Block->fastSchedule();
1344  }
1345 
1346  // Note: the following code, and the part restoring previous position
1347  // is by far the most expensive operation of the Scheduler.
1348 
1349  // Do not update CurrentTop.
1350  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
1351  std::vector<MachineBasicBlock::iterator> PosOld;
1352  std::vector<MachineBasicBlock::iterator> PosNew;
1353  PosOld.reserve(DAG->SUnits.size());
1354  PosNew.reserve(DAG->SUnits.size());
1355 
// Splice the instructions of each block (in top-down block order) into one
// contiguous run, remembering old and new positions for the later restore.
1356  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1357  int BlockIndice = TopDownIndex2Block[i];
1358  SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1359  std::vector<SUnit*> SUs = Block->getScheduledUnits();
1360 
1361  for (SUnit* SU : SUs) {
1362  MachineInstr *MI = SU->getInstr();
1364  PosOld.push_back(Pos);
1365  if (&*CurrentTopFastSched == MI) {
1366  PosNew.push_back(Pos);
1367  CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
1368  DAG->getCurrentBottom());
1369  } else {
1370  // Update the instruction stream.
1371  DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
1372 
1373  // Update LiveIntervals.
1374  // Note: Moving all instructions and calling handleMove every time
1375  // is the most cpu intensive operation of the scheduler.
1376  // It would gain a lot if there was a way to recompute the
1377  // LiveIntervals for the entire scheduling region.
1378  DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
1379  PosNew.push_back(CurrentTopFastSched);
1380  }
1381  }
1382  }
1383 
1384  // Now we have Block of SUs == Block of MI.
1385  // We do the final schedule for the instructions inside the block.
1386  // The property that all the SUs of the Block are grouped together as MI
1387  // is used for correct reg usage tracking.
1388  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1389  SIScheduleBlock *Block = CurrentBlocks[i];
1390  std::vector<SUnit*> SUs = Block->getScheduledUnits();
1391  Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
1392  }
1393 
1394  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
1395  // Restore old ordering (which prevents a LIS->handleMove bug).
// Walk the recorded positions in reverse so earlier splices don't
// invalidate the later restore targets.
1396  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
1397  MachineBasicBlock::iterator POld = PosOld[i-1];
1398  MachineBasicBlock::iterator PNew = PosNew[i-1];
1399  if (PNew != POld) {
1400  // Update the instruction stream.
1401  DAG->getBB()->splice(POld, DAG->getBB(), PNew);
1402 
1403  // Update LiveIntervals.
1404  DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
1405  }
1406  }
1407 
1408  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
1409  SIScheduleBlock *Block = CurrentBlocks[i];
1410  Block->printDebug(true);
1411  });
1412 }
1413 
1414 void SIScheduleBlockCreator::fillStats() {
1415  unsigned DAGSize = CurrentBlocks.size();
1416 
1417  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1418  int BlockIndice = TopDownIndex2Block[i];
1419  SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1420  if (Block->getPreds().empty())
1421  Block->Depth = 0;
1422  else {
1423  unsigned Depth = 0;
1424  for (SIScheduleBlock *Pred : Block->getPreds()) {
1425  if (Depth < Pred->Depth + Pred->getCost())
1426  Depth = Pred->Depth + Pred->getCost();
1427  }
1428  Block->Depth = Depth;
1429  }
1430  }
1431 
1432  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1433  int BlockIndice = BottomUpIndex2Block[i];
1434  SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1435  if (Block->getSuccs().empty())
1436  Block->Height = 0;
1437  else {
1438  unsigned Height = 0;
1439  for (const auto &Succ : Block->getSuccs())
1440  Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1441  Block->Height = Height;
1442  }
1443  }
1444 }
1445 
1446 // SIScheduleBlockScheduler //
1447 
// SIScheduleBlockScheduler constructor: precomputes per-block register
// usage bookkeeping, then greedily schedules every block via pickBlock().
// NOTE(review): this listing drops the constructor's first signature lines
// (original 1448-1449, presumably
// `SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
//  SISchedulerBlockSchedulerVariant Variant,`) — confirm against the real
// source.
1450  SIScheduleBlocks BlocksStruct) :
1451  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1452  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1453  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1454 
1455  // Fill the usage of every output
1456  // Warning: while by construction we always have a link between two blocks
1457  // when one needs a result from the other, the number of users of an output
1458  // is not the sum of child blocks having as input the same virtual register.
1459  // Here is an example. A produces x and y. B eats x and produces x'.
1460  // C eats x' and y. The register coalescer may have attributed the same
1461  // virtual register to x and x'.
1462  // To count accurately, we do a topological sort. In case the register is
1463  // found for several parents, we increment the usage of the one with the
1464  // highest topological index.
1465  LiveOutRegsNumUsages.resize(Blocks.size());
1466  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1467  SIScheduleBlock *Block = Blocks[i];
1468  for (unsigned Reg : Block->getInRegs()) {
1469  bool Found = false;
1470  int topoInd = -1;
// Find the producing predecessor with the highest topological index.
1471  for (SIScheduleBlock* Pred: Block->getPreds()) {
1472  std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1473  std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1474 
1475  if (RegPos != PredOutRegs.end()) {
1476  Found = true;
1477  if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1478  topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1479  }
1480  }
1481  }
1482 
1483  if (!Found)
1484  continue;
1485 
1486  int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1487  ++LiveOutRegsNumUsages[PredID][Reg];
1488  }
1489  }
1490 
1491  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1492  BlockNumPredsLeft.resize(Blocks.size());
1493  BlockNumSuccsLeft.resize(Blocks.size());
1494 
1495  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1496  SIScheduleBlock *Block = Blocks[i];
1497  BlockNumPredsLeft[i] = Block->getPreds().size();
1498  BlockNumSuccsLeft[i] = Block->getSuccs().size();
1499  }
1500 
1501 #ifndef NDEBUG
1502  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1503  SIScheduleBlock *Block = Blocks[i];
1504  assert(Block->getID() == i);
1505  }
1506 #endif
1507 
1508  std::set<unsigned> InRegs = DAG->getInRegs();
1509  addLiveRegs(InRegs);
1510 
1511  // Increase LiveOutRegsNumUsages for blocks
1512  // producing registers consumed in another
1513  // scheduling region.
1514  for (unsigned Reg : DAG->getOutRegs()) {
1515  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1516  // Do reverse traversal
1517  int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
1518  SIScheduleBlock *Block = Blocks[ID];
1519  const std::set<unsigned> &OutRegs = Block->getOutRegs();
1520 
1521  if (OutRegs.find(Reg) == OutRegs.end())
1522  continue;
1523 
// Credit the last (topologically) producer of the region-live-out reg.
1524  ++LiveOutRegsNumUsages[ID][Reg];
1525  break;
1526  }
1527  }
1528 
1529  // Fill LiveRegsConsumers for regs that were already
1530  // defined before scheduling.
1531  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1532  SIScheduleBlock *Block = Blocks[i];
1533  for (unsigned Reg : Block->getInRegs()) {
1534  bool Found = false;
1535  for (SIScheduleBlock* Pred: Block->getPreds()) {
1536  std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1537  std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1538 
1539  if (RegPos != PredOutRegs.end()) {
1540  Found = true;
1541  break;
1542  }
1543  }
1544 
// No producing predecessor: the reg comes from outside the region.
1545  if (!Found)
1546  ++LiveRegsConsumers[Reg];
1547  }
1548  }
1549 
// Seed the ready list with blocks that have no predecessors, then schedule
// everything greedily.
1550  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1551  SIScheduleBlock *Block = Blocks[i];
1552  if (BlockNumPredsLeft[i] == 0) {
1553  ReadyBlocks.push_back(Block);
1554  }
1555  }
1556 
1557  while (SIScheduleBlock *Block = pickBlock()) {
1558  BlocksScheduled.push_back(Block);
1559  blockScheduled(Block);
1560  }
1561 
1562  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
1563  : BlocksScheduled) {
1564  dbgs() << ' ' << Block->getID();
1565  } dbgs() << '\n';);
1566 }
1567 
1568 bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1569  SIBlockSchedCandidate &TryCand) {
1570  if (!Cand.isValid()) {
1571  TryCand.Reason = NodeOrder;
1572  return true;
1573  }
1574 
1575  // Try to hide high latencies.
1576  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
1577  Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1578  return true;
1579  // Schedule high latencies early so you can hide them better.
1580  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1581  TryCand, Cand, Latency))
1582  return true;
1583  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
1584  TryCand, Cand, Depth))
1585  return true;
1586  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
1587  Cand.NumHighLatencySuccessors,
1588  TryCand, Cand, Successor))
1589  return true;
1590  return false;
1591 }
1592 
1593 bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1594  SIBlockSchedCandidate &TryCand) {
1595  if (!Cand.isValid()) {
1596  TryCand.Reason = NodeOrder;
1597  return true;
1598  }
1599 
1600  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1601  TryCand, Cand, RegUsage))
1602  return true;
1603  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
1604  Cand.NumSuccessors > 0,
1605  TryCand, Cand, Successor))
1606  return true;
1607  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1608  return true;
1609  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1610  TryCand, Cand, RegUsage))
1611  return true;
1612  return false;
1613 }
1614 
// Pick the next ready block: refresh the current VGPR/SGPR usage, then
// compare every ready block with the latency- and reg-usage heuristics and
// remove/return the winner (nullptr when nothing is ready).
1615 SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1616  SIBlockSchedCandidate Cand;
1617  std::vector<SIScheduleBlock*>::iterator Best;
1618  SIScheduleBlock *Block;
1619  if (ReadyBlocks.empty())
1620  return nullptr;
1621 
// Track the running maxima so get{V,S}GPRUsage() report peak pressure.
1622  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1623  VregCurrentUsage, SregCurrentUsage);
1624  if (VregCurrentUsage > maxVregUsage)
1625  maxVregUsage = VregCurrentUsage;
1626  if (SregCurrentUsage > maxSregUsage)
1627  maxSregUsage = SregCurrentUsage;
1628  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
1629  for (SIScheduleBlock *Block
1630  : ReadyBlocks) dbgs()
1631  << Block->getID() << ' ';
1632  dbgs() << "\nCurrent Live:\n";
1633  for (unsigned Reg
1634  : LiveRegs) dbgs()
1635  << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
1636  dbgs() << '\n';
1637  dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
1638  dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
1639 
1640  Cand.Block = nullptr;
1641  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1642  E = ReadyBlocks.end(); I != E; ++I) {
1643  SIBlockSchedCandidate TryCand;
1644  TryCand.Block = *I;
1645  TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1646  TryCand.VGPRUsageDiff =
1647  checkRegUsageImpact(TryCand.Block->getInRegs(),
1648  TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
1649  TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1650  TryCand.NumHighLatencySuccessors =
1651  TryCand.Block->getNumHighLatencySuccessors();
1652  TryCand.LastPosHighLatParentScheduled =
1653  (unsigned int) std::max<int> (0,
1654  LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1655  LastPosWaitedHighLatency);
1656  TryCand.Height = TryCand.Block->Height;
1657  // Try not to increase VGPR usage too much, else we may spill.
// NOTE(review): this listing drops original lines 1659 and 1661
// (presumably the `Variant != SISchedulerBlockSchedulerVariant::...`
// halves of these conditions) — confirm against the real source.
1658  if (VregCurrentUsage > 120 ||
1660  if (!tryCandidateRegUsage(Cand, TryCand) &&
1662  tryCandidateLatency(Cand, TryCand);
1663  } else {
1664  if (!tryCandidateLatency(Cand, TryCand))
1665  tryCandidateRegUsage(Cand, TryCand);
1666  }
1667  if (TryCand.Reason != NoCand) {
1668  Cand.setBest(TryCand);
1669  Best = I;
1670  LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
1671  << getReasonStr(Cand.Reason) << '\n');
1672  }
1673  }
1674 
1675  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
1676  dbgs() << "Is a block with high latency instruction: "
1677  << (Cand.IsHighLatency ? "yes\n" : "no\n");
1678  dbgs() << "Position of last high latency dependency: "
1679  << Cand.LastPosHighLatParentScheduled << '\n';
1680  dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
1681  dbgs() << '\n';);
1682 
1683  Block = Cand.Block;
1684  ReadyBlocks.erase(Best);
1685  return Block;
1686 }
1687 
1688 // Tracking of currently alive registers to determine VGPR Usage.
1689 
// Add the given registers to the live-register set used for pressure
// tracking (insert is a no-op for regs already live).
1690 void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1691  for (unsigned Reg : Regs) {
1692  // For now only track virtual registers.
// NOTE(review): this listing drops original line 1693 here — presumably
// the guard skipping non-virtual registers (the condition this `continue`
// belongs to). Confirm against the real source.
1694  continue;
1695  // If not already in the live set, then add it.
1696  (void) LiveRegs.insert(Reg);
1697  }
1698 }
1699 
1700 void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1701  std::set<unsigned> &Regs) {
1702  for (unsigned Reg : Regs) {
1703  // For now only track virtual registers.
1704  std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1705  assert (Pos != LiveRegs.end() && // Reg must be live.
1706  LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1707  LiveRegsConsumers[Reg] >= 1);
1708  --LiveRegsConsumers[Reg];
1709  if (LiveRegsConsumers[Reg] == 0)
1710  LiveRegs.erase(Pos);
1711  }
1712 }
1713 
1714 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1715  for (const auto &Block : Parent->getSuccs()) {
1716  if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1717  ReadyBlocks.push_back(Block.first);
1718 
1719  if (Parent->isHighLatencyBlock() &&
1720  Block.second == SIScheduleBlockLinkKind::Data)
1721  LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1722  }
1723 }
1724 
1725 void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1726  decreaseLiveRegs(Block, Block->getInRegs());
1727  addLiveRegs(Block->getOutRegs());
1728  releaseBlockSuccs(Block);
1729  for (std::map<unsigned, unsigned>::iterator RegI =
1730  LiveOutRegsNumUsages[Block->getID()].begin(),
1731  E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1732  std::pair<unsigned, unsigned> RegP = *RegI;
1733  // We produce this register, thus it must not be previously alive.
1734  assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
1735  LiveRegsConsumers[RegP.first] == 0);
1736  LiveRegsConsumers[RegP.first] += RegP.second;
1737  }
1738  if (LastPosHighLatencyParentScheduled[Block->getID()] >
1739  (unsigned)LastPosWaitedHighLatency)
1740  LastPosWaitedHighLatency =
1741  LastPosHighLatencyParentScheduled[Block->getID()];
1742  ++NumBlockScheduled;
1743 }
1744 
// Estimate, per register pressure set, the pressure delta of scheduling a
// block with the given input/output registers: inputs whose last use this
// would be decrease pressure, outputs increase it.
// NOTE(review): this listing drops original lines 1753 and 1765 here —
// presumably the guards skipping non-virtual registers (the conditions the
// two dangling `continue`s belong to). Confirm against the real source.
1745 std::vector<int>
1746 SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1747  std::set<unsigned> &OutRegs) {
1748  std::vector<int> DiffSetPressure;
1749  DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1750 
1751  for (unsigned Reg : InRegs) {
1752  // For now only track virtual registers.
1754  continue;
// A register with more than one remaining consumer stays live afterwards.
1755  if (LiveRegsConsumers[Reg] > 1)
1756  continue;
1757  PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1758  for (; PSetI.isValid(); ++PSetI) {
1759  DiffSetPressure[*PSetI] -= PSetI.getWeight();
1760  }
1761  }
1762 
1763  for (unsigned Reg : OutRegs) {
1764  // For now only track virtual registers.
1766  continue;
1767  PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1768  for (; PSetI.isValid(); ++PSetI) {
1769  DiffSetPressure[*PSetI] += PSetI.getWeight();
1770  }
1771  }
1772 
1773  return DiffSetPressure;
1774 }
1775 
1776 // SIScheduler //
1777 
1778 struct SIScheduleBlockResult
1779 SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1780  SISchedulerBlockSchedulerVariant ScheduleVariant) {
1781  SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1782  SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1783  std::vector<SIScheduleBlock*> ScheduledBlocks;
1784  struct SIScheduleBlockResult Res;
1785 
1786  ScheduledBlocks = Scheduler.getBlocks();
1787 
1788  for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1789  SIScheduleBlock *Block = ScheduledBlocks[b];
1790  std::vector<SUnit*> SUs = Block->getScheduledUnits();
1791 
1792  for (SUnit* SU : SUs)
1793  Res.SUs.push_back(SU->NodeNum);
1794  }
1795 
1796  Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1797  Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1798  return Res;
1799 }
1800 
1801 // SIScheduleDAGMI //
1802 
// SIScheduleDAGMI constructor: caches SI-specific TII/TRI and the
// VGPR/SGPR pressure-set IDs used for usage tracking.
// NOTE(review): this listing drops the signature line (original 1803,
// presumably `SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :`)
// — confirm against the real source.
1804  ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1805  SITII = static_cast<const SIInstrInfo*>(TII);
1806  SITRI = static_cast<const SIRegisterInfo*>(TRI);
1807 
1808  VGPRSetID = SITRI->getVGPRPressureSet();
1809  SGPRSetID = SITRI->getSGPRPressureSet();
1810 }
1811 
1813 
1814 // Code adapted from scheduleDAG.cpp
1815 // Does a topological sort over the SUs.
1816 // Both TopDown and BottomUp
1817 void SIScheduleDAGMI::topologicalSort() {
// NOTE(review): this listing drops original line 1818 here — presumably
// `Topo.InitDAGTopologicalSorting();` — confirm against the real source.
// Copies the topological order into TopDownIndex2SU and its reverse into
// BottomUpIndex2SU.
1819 
1820  TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1821  BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1822 }
1823 
1824 // Move low latencies further from their user without
1825 // increasing SGPR usage (in general)
1826 // This is to be replaced by a better pass that would
1827 // take into account SGPR usage (based on VGPR Usage
1828 // and the corresponding wavefront count), that would
1829 // try to merge groups of loads if it make sense, etc
// Hoist low-latency instructions (and the COPYs feeding them) earlier in
// ScheduledSUnits so their latency is better hidden, keeping the inverse
// map ScheduledSUnitsInv in sync. The rotation logic below is
// order-critical: each element shifted right has its inverse index bumped
// before being moved.
1830 void SIScheduleDAGMI::moveLowLatencies() {
1831  unsigned DAGSize = SUnits.size();
1832  int LastLowLatencyUser = -1;
1833  int LastLowLatencyPos = -1;
1834 
1835  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1836  SUnit *SU = &SUnits[ScheduledSUnits[i]];
1837  bool IsLowLatencyUser = false;
1838  unsigned MinPos = 0;
1839 
// MinPos = one past the latest-scheduled in-DAG predecessor: the earliest
// legal position for SU. Also note whether SU consumes a low-latency def.
1840  for (SDep& PredDep : SU->Preds) {
1841  SUnit *Pred = PredDep.getSUnit();
1842  if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1843  IsLowLatencyUser = true;
1844  }
1845  if (Pred->NodeNum >= DAGSize)
1846  continue;
1847  unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1848  if (PredPos >= MinPos)
1849  MinPos = PredPos + 1;
1850  }
1851 
1852  if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
// Place the low-latency instruction right after the last user of a
// low-latency result / the last low-latency instruction, but never before
// its own predecessors (MinPos).
1853  unsigned BestPos = LastLowLatencyUser + 1;
1854  if ((int)BestPos <= LastLowLatencyPos)
1855  BestPos = LastLowLatencyPos + 1;
1856  if (BestPos < MinPos)
1857  BestPos = MinPos;
// Rotate [BestPos, i] right by one, fixing the inverse map as we go.
1858  if (BestPos < i) {
1859  for (unsigned u = i; u > BestPos; --u) {
1860  ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1861  ScheduledSUnits[u] = ScheduledSUnits[u-1];
1862  }
1863  ScheduledSUnits[BestPos] = SU->NodeNum;
1864  ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1865  }
1866  LastLowLatencyPos = BestPos;
1867  if (IsLowLatencyUser)
1868  LastLowLatencyUser = BestPos;
1869  } else if (IsLowLatencyUser) {
1870  LastLowLatencyUser = i;
1871  // Moves COPY instructions on which depends
1872  // the low latency instructions too.
1873  } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1874  bool CopyForLowLat = false;
1875  for (SDep& SuccDep : SU->Succs) {
1876  SUnit *Succ = SuccDep.getSUnit();
1877  if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1878  continue;
1879  if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1880  CopyForLowLat = true;
1881  }
1882  }
1883  if (!CopyForLowLat)
1884  continue;
// Hoist the COPY as early as its predecessors allow, same rotation scheme.
1885  if (MinPos < i) {
1886  for (unsigned u = i; u > MinPos; --u) {
1887  ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1888  ScheduledSUnits[u] = ScheduledSUnits[u-1];
1889  }
1890  ScheduledSUnits[MinPos] = SU->NodeNum;
1891  ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1892  }
1893  }
1894  }
1895 }
1896 
// Body of the state-reset routine (its signature line was not captured in
// this extract). Each SUnit's pred/succ counters are restored from
// SUnitsLinksBackup — the snapshot taken in schedule() before any trial
// scheduling ran — and the unit is marked unscheduled again, so the DAG can
// be scheduled another time.
1898  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1899  SUnits[i].isScheduled = false;
1900  SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1901  SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1902  SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1903  SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1904  }
1905 }
1906 
1907 // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1908 template<typename _Iterator> void
1909 SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1910  unsigned &VgprUsage, unsigned &SgprUsage) {
1911  VgprUsage = 0;
1912  SgprUsage = 0;
1913  for (_Iterator RegI = First; RegI != End; ++RegI) {
1914  unsigned Reg = *RegI;
1915  // For now only track virtual registers
1916  if (!Register::isVirtualRegister(Reg))
1917  continue;
1918  PSetIterator PSetI = MRI.getPressureSets(Reg);
1919  for (; PSetI.isValid(); ++PSetI) {
1920  if (*PSetI == VGPRSetID)
1921  VgprUsage += PSetI.getWeight();
1922  else if (*PSetI == SGPRSetID)
1923  SgprUsage += PSetI.getWeight();
1924  }
1925  }
1926 }
1927 
// NOTE(review): this brace opens SIScheduleDAGMI::schedule() — the signature
// line itself was not captured in this extract.
1929 {
1930  SmallVector<SUnit*, 8> TopRoots, BotRoots;
1931  SIScheduleBlockResult Best, Temp;
1932  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
1933 
// NOTE(review): the extract appears to drop a line here; upstream builds the
// DAG first (buildDAGWithRegPressure()) — confirm against the full source.
1935  LLVM_DEBUG(dump());
1936 
1937  topologicalSort();
1938  findRootsAndBiasEdges(TopRoots, BotRoots);
1939 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1940 // functions, but to make them happy we must initialize
1941 // the default Scheduler implementation (even if we do not
1942 // run it)
1943  SchedImpl->initialize(this);
1944  initQueues(TopRoots, BotRoots);
1945 
1946  // Fill some stats to help scheduling.
1947 
// Snapshot the SUnits' dependency counters so they can be restored between
// trial scheduling runs, then classify every SU as low/high latency and
// record the memory offset of low-latency accesses.
1948  SUnitsLinksBackup = SUnits;
1949  IsLowLatencySU.clear();
1950  LowLatencyOffset.clear();
1951  IsHighLatencySU.clear();
1952 
1953  IsLowLatencySU.resize(SUnits.size(), 0);
1954  LowLatencyOffset.resize(SUnits.size(), 0);
1955  IsHighLatencySU.resize(SUnits.size(), 0);
1956 
1957  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1958  SUnit *SU = &SUnits[i];
1959  const MachineOperand *BaseLatOp;
1960  int64_t OffLatReg;
1961  if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1962  IsLowLatencySU[i] = 1;
1963  if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
1964  TRI))
1965  LowLatencyOffset[i] = OffLatReg;
1966  } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
1967  IsHighLatencySU[i] = 1;
1968  }
1969 
1970  SIScheduler Scheduler(this);
// NOTE(review): the extract appears to drop the initial
// 'Best = Scheduler.scheduleVariant(...)' call here — 'Best' is compared
// below; confirm against the full source.
1973 
1974  // if VGPR usage is extremely high, try other good performing variants
1975  // which could lead to lower VGPR usage
// 180/200 below are heuristic VGPR-usage thresholds; in each pass the
// variant with the lowest MaxVGPRUsage is kept.
1976  if (Best.MaxVGPRUsage > 180) {
// NOTE(review): the second template-argument line of this pair type
// (SISchedulerBlockSchedulerVariant>) was dropped by the extract.
1977  static const std::pair<SISchedulerBlockCreatorVariant,
1979  Variants[] = {
1981 // { LatenciesAlone, BlockRegUsage },
1983 // { LatenciesGrouped, BlockRegUsageLatency },
1984 // { LatenciesGrouped, BlockRegUsage },
1985  { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1986 // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1987 // { LatenciesAlonePlusConsecutive, BlockRegUsage }
1988  };
1989  for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1990  Temp = Scheduler.scheduleVariant(v.first, v.second);
1991  if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1992  Best = Temp;
1993  }
1994  }
1995  // if VGPR usage is still extremely high, we may spill. Try other variants
1996  // which are less performing, but that could lead to lower VGPR usage.
1997  if (Best.MaxVGPRUsage > 200) {
// NOTE(review): same dropped template-argument line as above.
1998  static const std::pair<SISchedulerBlockCreatorVariant,
2000  Variants[] = {
2001 // { LatenciesAlone, BlockRegUsageLatency },
2003 // { LatenciesGrouped, BlockLatencyRegUsage },
2005  { LatenciesGrouped, BlockRegUsage },
2006 // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
2007  { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
2008  { LatenciesAlonePlusConsecutive, BlockRegUsage }
2009  };
2010  for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
2011  Temp = Scheduler.scheduleVariant(v.first, v.second);
2012  if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
2013  Best = Temp;
2014  }
2015  }
2016 
// Adopt the winning variant's linear order and build its inverse
// permutation (SU NodeNum -> position), then post-process it.
2017  ScheduledSUnits = Best.SUs;
2018  ScheduledSUnitsInv.resize(SUnits.size());
2019 
2020  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
2021  ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
2022  }
2023 
2024  moveLowLatencies();
2025 
2026  // Tell the outside world about the result of the scheduling.
2027 
2028  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
2030 
// Materialize the chosen order by moving each instruction into place
// top-down via scheduleMI.
2031  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
2032  E = ScheduledSUnits.end(); I != E; ++I) {
2033  SUnit *SU = &SUnits[*I];
2034 
2035  scheduleMI(SU, true);
2036 
2037  LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
2038  << *SU->getInstr());
2039  }
2040 
2041  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
2042 
// Reinsert the debug values recorded during DAG construction.
2043  placeDebugValues();
2044 
2045  LLVM_DEBUG({
2046  dbgs() << "*** Final schedule for "
2047  << printMBBReference(*begin()->getParent()) << " ***\n";
2048  dumpSchedule();
2049  dbgs() << '\n';
2050  });
2051 }
uint64_t CallInst * C
ScheduleDAGTopologicalSort Topo
Topo - A topological ordering for SUnits which permits fast IsReachable and similar queries...
Interface definition for SIRegisterInfo.
SIScheduleBlockLinkKind
virtual unsigned getNumRegPressureSets() const =0
Get the number of dimensions of register pressure.
std::vector< SIScheduleBlock * > Blocks
SIScheduleDAGMI(MachineSchedContext *C)
void dumpSchedule() const
dump the scheduled Sequence.
This class represents lattice values for constants.
Definition: AllocatorList.h:23
MachineBasicBlock::iterator CurrentTop
The top of the unscheduled zone.
SIScheduleCandReason Reason
void dump() const override
unsigned Reg
std::vector< unsigned > IsLowLatencySU
void addUnit(SUnit *SU)
Functions for Block construction.
std::vector< SIScheduleBlock * > getBlocks()
static MachineBasicBlock::iterator nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::const_iterator End)
Non-const version.
std::unique_ptr< MachineSchedStrategy > SchedImpl
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition: ScheduleDAG.h:256
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
bool isScheduled
True once scheduled.
Definition: ScheduleDAG.h:284
std::vector< unsigned > LowLatencyOffset
static bool hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU)
Definition: BitVector.h:937
Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
Regular data dependence (aka true-dependence).
Definition: ScheduleDAG.h:53
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1179
ArrayRef< std::pair< SIScheduleBlock *, SIScheduleBlockLinkKind > > getSuccs() const
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:410
void InitDAGTopologicalSorting()
Creates the initial topological ordering from the DAG to be scheduled.
MachineBasicBlock::const_iterator getPos() const
Get the MI position corresponding to this register pressure.
static bool tryLess(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, SIScheduleCandReason Reason)
defusechain_iterator - This class provides iterator support for machine operands in the function that...
static const char * getReasonStr(SIScheduleCandReason Reason)
std::set< unsigned > getOutRegs()
void scheduleMI(SUnit *SU, bool IsTopNode)
Move an instruction and update register pressure.
std::set< unsigned > getInRegs()
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def...
Definition: SlotIndexes.h:254
std::vector< int > GetSubGraph(const SUnit &StartSU, const SUnit &TargetSU, bool &Success)
Returns an array of SUs that are both in the successor subtree of StartSU and in the predecessor subt...
MachineBasicBlock::iterator RegionBegin
The beginning of the range to be scheduled.
void buildDAGWithRegPressure()
Call ScheduleDAGInstrs::buildSchedGraph with register pressure tracking enabled.
static def_instr_iterator def_instr_end()
MachineBasicBlock::iterator begin() const
Returns an iterator to the top of the current scheduling region.
void schedule() override
Implement ScheduleDAGInstrs interface for scheduling a sequence of reorderable instructions.
bool isCtrl() const
Shorthand for getKind() != SDep::Data.
Definition: ScheduleDAG.h:161
unsigned NumPredsLeft
of preds not scheduled.
Definition: ScheduleDAG.h:268
SUnit * getSUnit() const
Definition: ScheduleDAG.h:480
std::vector< int > TopDownIndex2Block
struct SIScheduleBlockResult scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, SISchedulerBlockSchedulerVariant ScheduleVariant)
SI Machine Scheduler interface.
Scheduling dependency.
Definition: ScheduleDAG.h:49
void addLiveRegs(ArrayRef< RegisterMaskPair > Regs)
Force liveness of virtual registers or physical register units.
#define P(N)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:373
unsigned const MachineRegisterInfo * MRI
~SIScheduleDAGMI() override
RegisterPressure computed within a region of instructions delimited by TopIdx and BottomIdx...
bool isSUInBlock(SUnit *SU, unsigned ID)
std::vector< SUnit * > getScheduledUnits()
std::vector< int > TopDownIndex2SU
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, SISchedulerBlockSchedulerVariant Variant, SIScheduleBlocks BlocksStruct)
Printable printVRegOrUnit(unsigned VRegOrUnit, const TargetRegisterInfo *TRI)
Create Printable object to print virtual registers and physical registers on a raw_ostream.
void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind)
MachineRegisterInfo * getMRI()
MachineBasicBlock * getBB()
constexpr double e
Definition: MathExtras.h:57
Track the current register pressure at some position in the instruction stream, and remember the high...
LiveIntervals * getLIS()
unsigned getSGPRPressureSet() const
const TargetRegisterInfo * getTRI()
SIScheduleBlocks getBlocks(SISchedulerBlockCreatorVariant BlockVariant)
void initQueues(ArrayRef< SUnit *> TopRoots, ArrayRef< SUnit *> BotRoots)
Release ExitSU predecessors and setup scheduler queues.
bool getMemOperandWithOffset(const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const final
auto find(R &&Range, const T &Val) -> decltype(adl_begin(Range))
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1186
MachineBasicBlock::iterator getCurrentBottom()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
void schedule(MachineBasicBlock::iterator BeginBlock, MachineBasicBlock::iterator EndBlock)
unsigned getVGPRPressureSet() const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:516
unsigned WeakPredsLeft
of weak preds not scheduled.
Definition: ScheduleDAG.h:270
std::vector< unsigned > MaxSetPressure
Map of max reg pressure indexed by pressure set ID, not class ID.
std::set< unsigned > & getOutRegs()
Color
A "color", which is either even or odd.
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:390
GenericScheduler shrinks the unscheduled zone using heuristics to balance the schedule.
bool isDebugValue() const
std::vector< unsigned > IsHighLatencySU
MachineOperand class - Representation of each machine instruction operand.
std::vector< unsigned > SUs
SISchedulerBlockCreatorVariant
SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
MachineBasicBlock::iterator getCurrentTop()
void fillVgprSgprCost(_Iterator First, _Iterator End, unsigned &VgprUsage, unsigned &SgprUsage)
static bool tryGreater(int TryVal, int CandVal, SISchedulerCandidate &TryCand, SISchedulerCandidate &Cand, SIScheduleCandReason Reason)
def_instr_iterator def_instr_begin(unsigned RegNo) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
ScheduleDAGTopologicalSort * GetTopo()
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:63
Interface definition for SIInstrInfo.
void addPred(SIScheduleBlock *Pred)
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:559
void findRootsAndBiasEdges(SmallVectorImpl< SUnit *> &TopRoots, SmallVectorImpl< SUnit *> &BotRoots)
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB &#39;Other&#39; at the position From, and insert it into this MBB right before &#39;...
std::vector< int > BottomUpIndex2SU
Iterate over the pressure sets affected by the given physical or virtual register.
PSetIterator getPressureSets(unsigned RegUnit) const
Get an iterator over the pressure sets affected by the given physical or virtual register.
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isDefBetween(unsigned Reg, SlotIndex First, SlotIndex Last, const MachineRegisterInfo *MRI, const LiveIntervals *LIS)
bool isHighLatencyInstruction(const MachineInstr &MI) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void placeDebugValues()
Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:558
unsigned NodeNum
Entry # of node in the node vector.
Definition: ScheduleDAG.h:264
MachineBasicBlock::iterator CurrentBottom
The bottom of the unscheduled zone.
std::set< unsigned > & getInRegs()
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::vector< int > TopDownBlock2Index
const std::vector< SIScheduleBlock * > & getPreds() const
void setRepeat(SIScheduleCandReason R)
static bool isVirtualRegister(unsigned Reg)
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:69
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition: ScheduleDAG.h:257
static const Function * getParent(const Value *V)
bool isWeak() const
Tests if this a weak dependence.
Definition: ScheduleDAG.h:194
Machine Instruction Scheduler
IRTranslator LLVM IR MI
void setPos(MachineBasicBlock::const_iterator Pos)
unsigned getWeight() const
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:561
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:562
#define LLVM_DEBUG(X)
Definition: Debug.h:122
for(unsigned i=Desc.getNumOperands(), e=OldMI.getNumOperands();i !=e;++i)
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:83
unsigned getVGPRSetID() const
RegPressureTracker TopRPTracker
SISchedulerBlockSchedulerVariant
void handleMove(MachineInstr &MI, bool UpdateFlags=false)
Call this method to notify LiveIntervals that instruction MI has been moved within a basic block...
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242