LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIMachineScheduler.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 734 944 77.8 %
Date: 2017-09-14 15:23:50 Functions: 45 51 88.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// \brief SI Machine Scheduler interface
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIMachineScheduler.h"
      16             : #include "AMDGPU.h"
      17             : #include "SIInstrInfo.h"
      18             : #include "SIRegisterInfo.h"
      19             : #include "llvm/ADT/STLExtras.h"
      20             : #include "llvm/ADT/SmallVector.h"
      21             : #include "llvm/CodeGen/LiveInterval.h"
      22             : #include "llvm/CodeGen/LiveIntervalAnalysis.h"
      23             : #include "llvm/CodeGen/MachineInstr.h"
      24             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      25             : #include "llvm/CodeGen/MachineScheduler.h"
      26             : #include "llvm/CodeGen/RegisterPressure.h"
      27             : #include "llvm/CodeGen/SlotIndexes.h"
      28             : #include "llvm/Support/Debug.h"
      29             : #include "llvm/Support/ErrorHandling.h"
      30             : #include "llvm/Support/raw_ostream.h"
      31             : #include "llvm/Target/TargetRegisterInfo.h"
      32             : #include <algorithm>
      33             : #include <cassert>
      34             : #include <map>
      35             : #include <set>
      36             : #include <utility>
      37             : #include <vector>
      38             : 
      39             : using namespace llvm;
      40             : 
      41             : #define DEBUG_TYPE "machine-scheduler"
      42             : 
      43             : // This scheduler implements a different scheduling algorithm than
      44             : // GenericScheduler.
      45             : //
      46             : // There are several specific architecture behaviours that can't be modelled
      47             : // for GenericScheduler:
      48             : // . When accessing the result of an SGPR load instruction, you have to wait
      49             : // for all the SGPR load instructions before your current instruction to
      50             : // have finished.
      51             : // . When accessing the result of an VGPR load instruction, you have to wait
      52             : // for all the VGPR load instructions previous to the VGPR load instruction
      53             : // you are interested in to finish.
      54             : // . The less the register pressure, the best load latencies are hidden
      55             : //
      56             : // Moreover some specifities (like the fact a lot of instructions in the shader
      57             : // have few dependencies) makes the generic scheduler have some unpredictable
      58             : // behaviours. For example when register pressure becomes high, it can either
      59             : // manage to prevent register pressure from going too high, or it can
      60             : // increase register pressure even more than if it hadn't taken register
      61             : // pressure into account.
      62             : //
      63             : // Also some other bad behaviours are generated, like loading at the beginning
      64             : // of the shader a constant in VGPR you won't need until the end of the shader.
      65             : //
      66             : // The scheduling problem for SI can distinguish three main parts:
      67             : // . Hiding high latencies (texture sampling, etc)
      68             : // . Hiding low latencies (SGPR constant loading, etc)
      69             : // . Keeping register usage low for better latency hiding and general
      70             : //   performance
      71             : //
      72             : // Some other things can also affect performance, but are hard to predict
      73             : // (cache usage, the fact the HW can issue several instructions from different
      74             : // wavefronts if different types, etc)
      75             : //
      76             : // This scheduler tries to solve the scheduling problem by dividing it into
      77             : // simpler sub-problems. It divides the instructions into blocks, schedules
      78             : // locally inside the blocks where it takes care of low latencies, and then
      79             : // chooses the order of the blocks by taking care of high latencies.
      80             : // Dividing the instructions into blocks helps control keeping register
      81             : // usage low.
      82             : //
      83             : // First the instructions are put into blocks.
      84             : //   We want the blocks help control register usage and hide high latencies
      85             : //   later. To help control register usage, we typically want all local
      86             : //   computations, when for example you create a result that can be comsummed
      87             : //   right away, to be contained in a block. Block inputs and outputs would
      88             : //   typically be important results that are needed in several locations of
      89             : //   the shader. Since we do want blocks to help hide high latencies, we want
      90             : //   the instructions inside the block to have a minimal set of dependencies
      91             : //   on high latencies. It will make it easy to pick blocks to hide specific
      92             : //   high latencies.
      93             : //   The block creation algorithm is divided into several steps, and several
      94             : //   variants can be tried during the scheduling process.
      95             : //
      96             : // Second the order of the instructions inside the blocks is chosen.
      97             : //   At that step we do take into account only register usage and hiding
      98             : //   low latency instructions
      99             : //
     100             : // Third the block order is chosen, there we try to hide high latencies
     101             : // and keep register usage low.
     102             : //
     103             : // After the third step, a pass is done to improve the hiding of low
     104             : // latencies.
     105             : //
     106             : // Actually when talking about 'low latency' or 'high latency' it includes
     107             : // both the latency to get the cache (or global mem) data go to the register,
     108             : // and the bandwidth limitations.
     109             : // Increasing the number of active wavefronts helps hide the former, but it
     110             : // doesn't solve the latter, thus why even if wavefront count is high, we have
     111             : // to try have as many instructions hiding high latencies as possible.
     112             : // The OpenCL doc says for example latency of 400 cycles for a global mem access,
     113             : // which is hidden by 10 instructions if the wavefront count is 10.
     114             : 
     115             : // Some figures taken from AMD docs:
     116             : // Both texture and constant L1 caches are 4-way associative with 64 bytes
     117             : // lines.
     118             : // Constant cache is shared with 4 CUs.
     119             : // For texture sampling, the address generation unit receives 4 texture
     120             : // addresses per cycle, thus we could expect texture sampling latency to be
     121             : // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
     122             : // instructions in a wavefront group are executed every 4 cycles),
     123             : // or 16 instructions if the other wavefronts associated to the 3 other VALUs
     124             : // of the CU do texture sampling too. (Don't take these figures too seriously,
     125             : // as I'm not 100% sure of the computation)
     126             : // Data exports should get similar latency.
     127             : // For constant loading, the cache is shader with 4 CUs.
     128             : // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
     129             : // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
     130             : // It means a simple s_buffer_load should take one instruction to hide, as
     131             : // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
     132             : // cache line.
     133             : //
     134             : // As of today the driver doesn't preload the constants in cache, thus the
     135             : // first loads get extra latency. The doc says global memory access can be
     136             : // 300-600 cycles. We do not specially take that into account when scheduling
     137             : // As we expect the driver to be able to preload the constants soon.
     138             : 
     139             : // common code //
     140             : 
     141             : #ifndef NDEBUG
     142             : 
     143             : static const char *getReasonStr(SIScheduleCandReason Reason) {
     144             :   switch (Reason) {
     145             :   case NoCand:         return "NOCAND";
     146             :   case RegUsage:       return "REGUSAGE";
     147             :   case Latency:        return "LATENCY";
     148             :   case Successor:      return "SUCCESSOR";
     149             :   case Depth:          return "DEPTH";
     150             :   case NodeOrder:      return "ORDER";
     151             :   }
     152             :   llvm_unreachable("Unknown reason!");
     153             : }
     154             : 
     155             : #endif
     156             : 
     157             : static bool tryLess(int TryVal, int CandVal,
     158             :                     SISchedulerCandidate &TryCand,
     159             :                     SISchedulerCandidate &Cand,
     160             :                     SIScheduleCandReason Reason) {
     161          17 :   if (TryVal < CandVal) {
     162           2 :     TryCand.Reason = Reason;
     163             :     return true;
     164             :   }
     165          15 :   if (TryVal > CandVal) {
     166           0 :     if (Cand.Reason > Reason)
     167           0 :       Cand.Reason = Reason;
     168             :     return true;
     169             :   }
     170          15 :   Cand.setRepeat(Reason);
     171             :   return false;
     172             : }
     173             : 
     174             : static bool tryGreater(int TryVal, int CandVal,
     175             :                        SISchedulerCandidate &TryCand,
     176             :                        SISchedulerCandidate &Cand,
     177             :                        SIScheduleCandReason Reason) {
     178           9 :   if (TryVal > CandVal) {
     179           1 :     TryCand.Reason = Reason;
     180             :     return true;
     181             :   }
     182           8 :   if (TryVal < CandVal) {
     183           0 :     if (Cand.Reason > Reason)
     184           0 :       Cand.Reason = Reason;
     185             :     return true;
     186             :   }
     187           8 :   Cand.setRepeat(Reason);
     188             :   return false;
     189             : }
     190             : 
     191             : // SIScheduleBlock //
     192             : 
     193          16 : void SIScheduleBlock::addUnit(SUnit *SU) {
     194          32 :   NodeNum2Index[SU->NodeNum] = SUnits.size();
     195          16 :   SUnits.push_back(SU);
     196          16 : }
     197             : 
     198             : #ifndef NDEBUG
     199             : void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
     200             : 
     201             :   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
     202             :   dbgs() << '\n';
     203             : }
     204             : #endif
     205             : 
     206          25 : void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
     207             :                                           SISchedCandidate &TryCand) {
     208             :   // Initialize the candidate if needed.
     209          25 :   if (!Cand.isValid()) {
     210          16 :     TryCand.Reason = NodeOrder;
     211          16 :     return;
     212             :   }
     213             : 
     214           9 :   if (Cand.SGPRUsage > 60 &&
     215           0 :       tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage))
     216             :     return;
     217             : 
     218             :   // Schedule low latency instructions as top as possible.
     219             :   // Order of priority is:
     220             :   // . Low latency instructions which do not depend on other low latency
     221             :   //   instructions we haven't waited for
     222             :   // . Other instructions which do not depend on low latency instructions
     223             :   //   we haven't waited for
     224             :   // . Low latencies
     225             :   // . All other instructions
     226             :   // Goal is to get: low latency instructions - independent instructions
     227             :   //     - (eventually some more low latency instructions)
     228             :   //     - instructions that depend on the first low latency instructions.
     229             :   // If in the block there is a lot of constant loads, the SGPR usage
     230             :   // could go quite high, thus above the arbitrary limit of 60 will encourage
     231             :   // use the already loaded constants (in order to release some SGPRs) before
     232             :   // loading more.
     233          27 :   if (tryLess(TryCand.HasLowLatencyNonWaitedParent,
     234           9 :               Cand.HasLowLatencyNonWaitedParent,
     235             :               TryCand, Cand, SIScheduleCandReason::Depth))
     236             :     return;
     237             : 
     238          18 :   if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
     239             :                  TryCand, Cand, SIScheduleCandReason::Depth))
     240             :     return;
     241             : 
     242           8 :   if (TryCand.IsLowLatency &&
     243           0 :       tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
     244             :               TryCand, Cand, SIScheduleCandReason::Depth))
     245             :     return;
     246             : 
     247          16 :   if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage))
     248             :     return;
     249             : 
     250             :   // Fall through to original instruction order.
     251           6 :   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
     252           0 :     TryCand.Reason = NodeOrder;
     253             :   }
     254             : }
     255             : 
     256          16 : SUnit* SIScheduleBlock::pickNode() {
     257          16 :   SISchedCandidate TopCand;
     258             : 
     259          89 :   for (SUnit* SU : TopReadySUs) {
     260          25 :     SISchedCandidate TryCand;
     261          50 :     std::vector<unsigned> pressure;
     262          50 :     std::vector<unsigned> MaxPressure;
     263             :     // Predict register usage after this instruction.
     264          25 :     TryCand.SU = SU;
     265          25 :     TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
     266          50 :     TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
     267          50 :     TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
     268          50 :     TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
     269          50 :     TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
     270          25 :     TryCand.HasLowLatencyNonWaitedParent =
     271          50 :       HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
     272          25 :     tryCandidateTopDown(TopCand, TryCand);
     273          25 :     if (TryCand.Reason != NoCand)
     274             :       TopCand.setBest(TryCand);
     275             :   }
     276             : 
     277          16 :   return TopCand.SU;
     278             : }
     279             : 
     280             : 
     281             : // Schedule something valid.
     282           5 : void SIScheduleBlock::fastSchedule() {
     283          10 :   TopReadySUs.clear();
     284           5 :   if (Scheduled)
     285           0 :     undoSchedule();
     286             : 
     287          36 :   for (SUnit* SU : SUnits) {
     288          16 :     if (!SU->NumPredsLeft)
     289          10 :       TopReadySUs.push_back(SU);
     290             :   }
     291             : 
     292          58 :   while (!TopReadySUs.empty()) {
     293          16 :     SUnit *SU = TopReadySUs[0];
     294          16 :     ScheduledSUnits.push_back(SU);
     295          16 :     nodeScheduled(SU);
     296             :   }
     297             : 
     298           5 :   Scheduled = true;
     299           5 : }
     300             : 
     301             : // Returns if the register was set between first and last.
     302           8 : static bool isDefBetween(unsigned Reg,
     303             :                            SlotIndex First, SlotIndex Last,
     304             :                            const MachineRegisterInfo *MRI,
     305             :                            const LiveIntervals *LIS) {
     306             :   for (MachineRegisterInfo::def_instr_iterator
     307           8 :        UI = MRI->def_instr_begin(Reg),
     308          17 :        UE = MRI->def_instr_end(); UI != UE; ++UI) {
     309           9 :     const MachineInstr* MI = &*UI;
     310           9 :     if (MI->isDebugValue())
     311           0 :       continue;
     312          27 :     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
     313          18 :     if (InstSlot >= First && InstSlot <= Last)
     314             :       return true;
     315             :   }
     316             :   return false;
     317             : }
     318             : 
     319           5 : void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
     320             :                                       MachineBasicBlock::iterator EndBlock) {
     321          20 :   IntervalPressure Pressure, BotPressure;
     322          15 :   RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
     323           5 :   LiveIntervals *LIS = DAG->getLIS();
     324           5 :   MachineRegisterInfo *MRI = DAG->getMRI();
     325           5 :   DAG->initRPTracker(TopRPTracker);
     326           5 :   DAG->initRPTracker(BotRPTracker);
     327           5 :   DAG->initRPTracker(RPTracker);
     328             : 
     329             :   // Goes though all SU. RPTracker captures what had to be alive for the SUs
     330             :   // to execute, and what is still alive at the end.
     331          36 :   for (SUnit* SU : ScheduledSUnits) {
     332          32 :     RPTracker.setPos(SU->getInstr());
     333          16 :     RPTracker.advance();
     334             :   }
     335             : 
     336             :   // Close the RPTracker to finalize live ins/outs.
     337           5 :   RPTracker.closeRegion();
     338             : 
     339             :   // Initialize the live ins and live outs.
     340          10 :   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
     341          10 :   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
     342             : 
     343             :   // Do not Track Physical Registers, because it messes up.
     344          30 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
     345          30 :     if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
     346           8 :       LiveInRegs.insert(RegMaskPair.RegUnit);
     347             :   }
     348          10 :   LiveOutRegs.clear();
     349             :   // There is several possibilities to distinguish:
     350             :   // 1) Reg is not input to any instruction in the block, but is output of one
     351             :   // 2) 1) + read in the block and not needed after it
     352             :   // 3) 1) + read in the block but needed in another block
     353             :   // 4) Reg is input of an instruction but another block will read it too
     354             :   // 5) Reg is input of an instruction and then rewritten in the block.
     355             :   //    result is not read in the block (implies used in another block)
     356             :   // 6) Reg is input of an instruction and then rewritten in the block.
     357             :   //    result is read in the block and not needed in another block
     358             :   // 7) Reg is input of an instruction and then rewritten in the block.
     359             :   //    result is read in the block but also needed in another block
     360             :   // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
     361             :   // We want LiveOutRegs to contain only Regs whose content will be read after
     362             :   // in another block, and whose content was written in the current block,
     363             :   // that is we want it to get 1, 3, 5, 7
     364             :   // Since we made the MIs of a block to be packed all together before
     365             :   // scheduling, then the LiveIntervals were correct, and the RPTracker was
     366             :   // able to correctly handle 5 vs 6, 2 vs 3.
     367             :   // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
     368             :   // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
     369             :   // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
     370             :   // The use of findDefBetween removes the case 4.
     371          23 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
     372           8 :     unsigned Reg = RegMaskPair.RegUnit;
     373          24 :     if (TargetRegisterInfo::isVirtualRegister(Reg) &&
     374          32 :         isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
     375          16 :                      LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
     376             :                      LIS)) {
     377           8 :       LiveOutRegs.insert(Reg);
     378             :     }
     379             :   }
     380             : 
     381             :   // Pressure = sum_alive_registers register size
     382             :   // Internally llvm will represent some registers as big 128 bits registers
     383             :   // for example, but they actually correspond to 4 actual 32 bits registers.
     384             :   // Thus Pressure is not equal to num_alive_registers * constant.
     385           5 :   LiveInPressure = TopPressure.MaxSetPressure;
     386           5 :   LiveOutPressure = BotPressure.MaxSetPressure;
     387             : 
     388             :   // Prepares TopRPTracker for top down scheduling.
     389           5 :   TopRPTracker.closeTop();
     390           5 : }
     391             : 
     392           5 : void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
     393             :                                MachineBasicBlock::iterator EndBlock) {
     394           5 :   if (!Scheduled)
     395           0 :     fastSchedule();
     396             : 
     397             :   // PreScheduling phase to set LiveIn and LiveOut.
     398           5 :   initRegPressure(BeginBlock, EndBlock);
     399           5 :   undoSchedule();
     400             : 
     401             :   // Schedule for real now.
     402             : 
     403          10 :   TopReadySUs.clear();
     404             : 
     405          36 :   for (SUnit* SU : SUnits) {
     406          16 :     if (!SU->NumPredsLeft)
     407          10 :       TopReadySUs.push_back(SU);
     408             :   }
     409             : 
     410          58 :   while (!TopReadySUs.empty()) {
     411          16 :     SUnit *SU = pickNode();
     412          16 :     ScheduledSUnits.push_back(SU);
     413          32 :     TopRPTracker.setPos(SU->getInstr());
     414          16 :     TopRPTracker.advance();
     415          16 :     nodeScheduled(SU);
     416             :   }
     417             : 
     418             :   // TODO: compute InternalAdditionnalPressure.
     419          10 :   InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
     420             : 
     421             :   // Check everything is right.
     422             : #ifndef NDEBUG
     423             :   assert(SUnits.size() == ScheduledSUnits.size() &&
     424             :             TopReadySUs.empty());
     425             :   for (SUnit* SU : SUnits) {
     426             :     assert(SU->isScheduled &&
     427             :               SU->NumPredsLeft == 0);
     428             :   }
     429             : #endif
     430             : 
     431           5 :   Scheduled = true;
     432           5 : }
     433             : 
     434           5 : void SIScheduleBlock::undoSchedule() {
     435          36 :   for (SUnit* SU : SUnits) {
     436          16 :     SU->isScheduled = false;
     437          72 :     for (SDep& Succ : SU->Succs) {
     438          48 :       if (BC->isSUInBlock(Succ.getSUnit(), ID))
     439          12 :         undoReleaseSucc(SU, &Succ);
     440             :     }
     441             :   }
     442          15 :   HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     443          10 :   ScheduledSUnits.clear();
     444           5 :   Scheduled = false;
     445           5 : }
     446             : 
     447          12 : void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
     448          12 :   SUnit *SuccSU = SuccEdge->getSUnit();
     449             : 
     450          12 :   if (SuccEdge->isWeak()) {
     451           0 :     ++SuccSU->WeakPredsLeft;
     452           0 :     return;
     453             :   }
     454          12 :   ++SuccSU->NumPredsLeft;
     455             : }
     456             : 
     457          33 : void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
     458          33 :   SUnit *SuccSU = SuccEdge->getSUnit();
     459             : 
     460          33 :   if (SuccEdge->isWeak()) {
     461           0 :     --SuccSU->WeakPredsLeft;
     462           0 :     return;
     463             :   }
     464             : #ifndef NDEBUG
     465             :   if (SuccSU->NumPredsLeft == 0) {
     466             :     dbgs() << "*** Scheduling failed! ***\n";
     467             :     SuccSU->dump(DAG);
     468             :     dbgs() << " has been released too many times!\n";
     469             :     llvm_unreachable(nullptr);
     470             :   }
     471             : #endif
     472             : 
     473          33 :   --SuccSU->NumPredsLeft;
     474             : }
     475             : 
     476             : /// Release Successors of the SU that are in the block or not.
     477          48 : void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
     478         216 :   for (SDep& Succ : SU->Succs) {
     479          72 :     SUnit *SuccSU = Succ.getSUnit();
     480             : 
     481         144 :     if (SuccSU->NodeNum >= DAG->SUnits.size())
     482          48 :         continue;
     483             : 
     484          63 :     if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
     485          30 :       continue;
     486             : 
     487          33 :     releaseSucc(SU, &Succ);
     488          33 :     if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
     489          12 :       TopReadySUs.push_back(SuccSU);
     490             :   }
     491          48 : }
     492             : 
     493          32 : void SIScheduleBlock::nodeScheduled(SUnit *SU) {
     494             :   // Is in TopReadySUs
     495             :   assert (!SU->NumPredsLeft);
     496          64 :   std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
     497          96 :   if (I == TopReadySUs.end()) {
     498           0 :     dbgs() << "Data Structure Bug in SI Scheduler\n";
     499           0 :     llvm_unreachable(nullptr);
     500             :   }
     501          96 :   TopReadySUs.erase(I);
     502             : 
     503          32 :   releaseSuccessors(SU, true);
     504             :   // Scheduling this node will trigger a wait,
     505             :   // thus propagate to other instructions that they do not need to wait either.
     506          64 :   if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
     507           0 :     HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     508             : 
     509          64 :   if (DAG->IsLowLatencySU[SU->NodeNum]) {
     510          16 :      for (SDep& Succ : SU->Succs) {
     511             :       std::map<unsigned, unsigned>::iterator I =
     512          12 :         NodeNum2Index.find(Succ.getSUnit()->NodeNum);
     513           8 :       if (I != NodeNum2Index.end())
     514           0 :         HasLowLatencyNonWaitedParent[I->second] = 1;
     515             :     }
     516             :   }
     517          32 :   SU->isScheduled = true;
     518          32 : }
     519             : 
     520           5 : void SIScheduleBlock::finalizeUnits() {
     521             :   // We remove links from outside blocks to enable scheduling inside the block.
     522          36 :   for (SUnit* SU : SUnits) {
     523          16 :     releaseSuccessors(SU, false);
     524          32 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     525           1 :       HighLatencyBlock = true;
     526             :   }
     527          10 :   HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
     528           5 : }
     529             : 
     530             : // we maintain ascending order of IDs
     531           9 : void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
     532           9 :   unsigned PredID = Pred->getID();
     533             : 
     534             :   // Check if not already predecessor.
     535          37 :   for (SIScheduleBlock* P : Preds) {
     536           6 :     if (PredID == P->getID())
     537             :       return;
     538             :   }
     539           4 :   Preds.push_back(Pred);
     540             : 
     541             :   assert(none_of(Succs,
     542             :                  [=](std::pair<SIScheduleBlock*,
     543             :                      SIScheduleBlockLinkKind> S) {
     544             :                    return PredID == S.first->getID();
     545             :                     }) &&
     546             :          "Loop in the Block Graph!");
     547             : }
     548             : 
     549           9 : void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
     550             :                               SIScheduleBlockLinkKind Kind) {
     551           9 :   unsigned SuccID = Succ->getID();
     552             : 
     553             :   // Check if not already predecessor.
     554          37 :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
     555           6 :     if (SuccID == S.first->getID()) {
     556           5 :       if (S.second == SIScheduleBlockLinkKind::NoData &&
     557             :           Kind == SIScheduleBlockLinkKind::Data)
     558           0 :         S.second = Kind;
     559             :       return;
     560             :     }
     561             :   }
     562           4 :   if (Succ->isHighLatencyBlock())
     563           0 :     ++NumHighLatencySuccessors;
     564          12 :   Succs.push_back(std::make_pair(Succ, Kind));
     565             : 
     566             :   assert(none_of(Preds,
     567             :                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
     568             :          "Loop in the Block Graph!");
     569             : }
     570             : 
     571             : #ifndef NDEBUG
     572             : void SIScheduleBlock::printDebug(bool full) {
     573             :   dbgs() << "Block (" << ID << ")\n";
     574             :   if (!full)
     575             :     return;
     576             : 
     577             :   dbgs() << "\nContains High Latency Instruction: "
     578             :          << HighLatencyBlock << '\n';
     579             :   dbgs() << "\nDepends On:\n";
     580             :   for (SIScheduleBlock* P : Preds) {
     581             :     P->printDebug(false);
     582             :   }
     583             : 
     584             :   dbgs() << "\nSuccessors:\n";
     585             :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
     586             :     if (S.second == SIScheduleBlockLinkKind::Data)
     587             :       dbgs() << "(Data Dep) ";
     588             :     S.first->printDebug(false);
     589             :   }
     590             : 
     591             :   if (Scheduled) {
     592             :     dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
     593             :            << LiveInPressure[DAG->getVGPRSetID()] << '\n';
     594             :     dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
     595             :            << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
     596             :     dbgs() << "LiveIns:\n";
     597             :     for (unsigned Reg : LiveInRegs)
     598             :       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     599             : 
     600             :     dbgs() << "\nLiveOuts:\n";
     601             :     for (unsigned Reg : LiveOutRegs)
     602             :       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     603             :   }
     604             : 
     605             :   dbgs() << "\nInstructions:\n";
     606             :   if (!Scheduled) {
     607             :     for (SUnit* SU : SUnits) {
     608             :       SU->dump(DAG);
     609             :     }
     610             :   } else {
     611             :     for (SUnit* SU : SUnits) {
     612             :       SU->dump(DAG);
     613             :     }
     614             :   }
     615             : 
     616             :   dbgs() << "///////////////////////\n";
     617             : }
     618             : #endif
     619             : 
     620             : // SIScheduleBlockCreator //
     621             : 
     622           2 : SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
     623          22 : DAG(DAG) {
     624           2 : }
     625             : 
     626             : SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
     627             : 
     628             : SIScheduleBlocks
     629           2 : SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
     630             :   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
     631           4 :     Blocks.find(BlockVariant);
     632           4 :   if (B == Blocks.end()) {
     633           4 :     SIScheduleBlocks Res;
     634           2 :     createBlocksForVariant(BlockVariant);
     635           2 :     topologicalSort();
     636           2 :     scheduleInsideBlocks();
     637           2 :     fillStats();
     638           2 :     Res.Blocks = CurrentBlocks;
     639           2 :     Res.TopDownIndex2Block = TopDownIndex2Block;
     640           2 :     Res.TopDownBlock2Index = TopDownBlock2Index;
     641           2 :     Blocks[BlockVariant] = Res;
     642           2 :     return Res;
     643             :   } else {
     644           0 :     return B->second;
     645             :   }
     646             : }
     647             : 
     648          87 : bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
     649         174 :   if (SU->NodeNum >= DAG->SUnits.size())
     650             :     return false;
     651         252 :   return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
     652             : }
     653             : 
     654           2 : void SIScheduleBlockCreator::colorHighLatenciesAlone() {
     655           4 :   unsigned DAGSize = DAG->SUnits.size();
     656             : 
     657          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     658          32 :     SUnit *SU = &DAG->SUnits[i];
     659          32 :     if (DAG->IsHighLatencySU[SU->NodeNum]) {
     660           2 :       CurrentColoring[SU->NodeNum] = NextReservedID++;
     661             :     }
     662             :   }
     663           2 : }
     664             : 
     665             : static bool
     666             : hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
     667           0 :   for (const auto &PredDep : SU.Preds) {
     668           0 :     if (PredDep.getSUnit() == &FromSU &&
     669           0 :         PredDep.getKind() == llvm::SDep::Data)
     670             :       return true;
     671             :   }
     672             :   return false;
     673             : }
     674             : 
     675           0 : void SIScheduleBlockCreator::colorHighLatenciesGroups() {
     676           0 :   unsigned DAGSize = DAG->SUnits.size();
     677           0 :   unsigned NumHighLatencies = 0;
     678             :   unsigned GroupSize;
     679           0 :   int Color = NextReservedID;
     680           0 :   unsigned Count = 0;
     681           0 :   std::set<unsigned> FormingGroup;
     682             : 
     683           0 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     684           0 :     SUnit *SU = &DAG->SUnits[i];
     685           0 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     686           0 :       ++NumHighLatencies;
     687             :   }
     688             : 
     689           0 :   if (NumHighLatencies == 0)
     690           0 :     return;
     691             : 
     692           0 :   if (NumHighLatencies <= 6)
     693             :     GroupSize = 2;
     694           0 :   else if (NumHighLatencies <= 12)
     695             :     GroupSize = 3;
     696             :   else
     697           0 :     GroupSize = 4;
     698             : 
     699           0 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     700           0 :     const SUnit &SU = DAG->SUnits[SUNum];
     701           0 :     if (DAG->IsHighLatencySU[SU.NodeNum]) {
     702           0 :       unsigned CompatibleGroup = true;
     703           0 :       int ProposedColor = Color;
     704           0 :       std::vector<int> AdditionalElements;
     705             : 
     706             :       // We don't want to put in the same block
     707             :       // two high latency instructions that depend
     708             :       // on each other.
     709             :       // One way would be to check canAddEdge
     710             :       // in both directions, but that currently is not
     711             :       // enough because there the high latency order is
     712             :       // enforced (via links).
     713             :       // Instead, look at the dependencies between the
     714             :       // high latency instructions and deduce if it is
     715             :       // a data dependency or not.
     716           0 :       for (unsigned j : FormingGroup) {
     717             :         bool HasSubGraph;
     718           0 :         std::vector<int> SubGraph;
     719             :         // By construction (topological order), if SU and
     720             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
     721             :         // in the parent graph of SU.
     722             : #ifndef NDEBUG
     723             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
     724             :                                                HasSubGraph);
     725             :         assert(!HasSubGraph);
     726             : #endif
     727           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
     728             :                                                HasSubGraph);
     729           0 :         if (!HasSubGraph)
     730           0 :           continue; // No dependencies between each other
     731           0 :         else if (SubGraph.size() > 5) {
     732             :           // Too many elements would be required to be added to the block.
     733             :           CompatibleGroup = false;
     734           0 :           break;
     735             :         }
     736             :         else {
     737             :           // Check the type of dependency
     738           0 :           for (unsigned k : SubGraph) {
     739             :             // If in the path to join the two instructions,
     740             :             // there is another high latency instruction,
     741             :             // or instructions colored for another block
     742             :             // abort the merge.
     743           0 :             if (DAG->IsHighLatencySU[k] ||
     744           0 :                 (CurrentColoring[k] != ProposedColor &&
     745           0 :                  CurrentColoring[k] != 0)) {
     746             :               CompatibleGroup = false;
     747             :               break;
     748             :             }
     749             :             // If one of the SU in the subgraph depends on the result of SU j,
     750             :             // there'll be a data dependency.
     751           0 :             if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
     752             :               CompatibleGroup = false;
     753             :               break;
     754             :             }
     755             :           }
     756           0 :           if (!CompatibleGroup)
     757             :             break;
     758             :           // Same check for the SU
     759           0 :           if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
     760             :             CompatibleGroup = false;
     761             :             break;
     762             :           }
     763             :           // Add all the required instructions to the block
     764             :           // These cannot live in another block (because they
     765             :           // depend (order dependency) on one of the
     766             :           // instruction in the block, and are required for the
     767             :           // high latency instruction we add.
     768           0 :           AdditionalElements.insert(AdditionalElements.end(),
     769           0 :                                     SubGraph.begin(), SubGraph.end());
     770             :         }
     771             :       }
     772           0 :       if (CompatibleGroup) {
     773           0 :         FormingGroup.insert(SU.NodeNum);
     774           0 :         for (unsigned j : AdditionalElements)
     775           0 :           CurrentColoring[j] = ProposedColor;
     776           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     777           0 :         ++Count;
     778             :       }
     779             :       // Found one incompatible instruction,
     780             :       // or has filled a big enough group.
     781             :       // -> start a new one.
     782           0 :       if (!CompatibleGroup) {
     783           0 :         FormingGroup.clear();
     784           0 :         Color = ++NextReservedID;
     785           0 :         ProposedColor = Color;
     786           0 :         FormingGroup.insert(SU.NodeNum);
     787           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     788           0 :         Count = 0;
     789           0 :       } else if (Count == GroupSize) {
     790           0 :         FormingGroup.clear();
     791           0 :         Color = ++NextReservedID;
     792           0 :         ProposedColor = Color;
     793           0 :         Count = 0;
     794             :       }
     795             :     }
     796             :   }
     797             : }
     798             : 
     799           2 : void SIScheduleBlockCreator::colorComputeReservedDependencies() {
     800           4 :   unsigned DAGSize = DAG->SUnits.size();
     801           4 :   std::map<std::set<unsigned>, unsigned> ColorCombinations;
     802             : 
     803           4 :   CurrentTopDownReservedDependencyColoring.clear();
     804           4 :   CurrentBottomUpReservedDependencyColoring.clear();
     805             : 
     806           2 :   CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
     807           2 :   CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
     808             : 
     809             :   // Traverse TopDown, and give different colors to SUs depending
     810             :   // on which combination of High Latencies they depend on.
     811             : 
     812          24 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     813          32 :     SUnit *SU = &DAG->SUnits[SUNum];
     814          19 :     std::set<unsigned> SUColors;
     815             : 
     816             :     // Already given.
     817          33 :     if (CurrentColoring[SU->NodeNum]) {
     818           2 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     819           1 :         CurrentColoring[SU->NodeNum];
     820          14 :       continue;
     821             :     }
     822             : 
     823          66 :    for (SDep& PredDep : SU->Preds) {
     824          21 :       SUnit *Pred = PredDep.getSUnit();
     825          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
     826           0 :         continue;
     827          42 :       if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
     828          10 :         SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
     829             :     }
     830             :     // Color 0 by default.
     831          15 :     if (SUColors.empty())
     832          12 :       continue;
     833             :     // Same color than parents.
     834           7 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     835           0 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     836           0 :         *SUColors.begin();
     837             :     else {
     838             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     839           3 :         ColorCombinations.find(SUColors);
     840           3 :       if (Pos != ColorCombinations.end()) {
     841           2 :           CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
     842             :       } else {
     843           6 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     844           2 :           NextNonReservedID;
     845           2 :         ColorCombinations[SUColors] = NextNonReservedID++;
     846             :       }
     847             :     }
     848             :   }
     849             : 
     850           2 :   ColorCombinations.clear();
     851             : 
     852             :   // Same as before, but BottomUp.
     853             : 
     854          24 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     855          32 :     SUnit *SU = &DAG->SUnits[SUNum];
     856          16 :     std::set<unsigned> SUColors;
     857             : 
     858             :     // Already given.
     859          33 :     if (CurrentColoring[SU->NodeNum]) {
     860           2 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     861           1 :         CurrentColoring[SU->NodeNum];
     862          17 :       continue;
     863             :     }
     864             : 
     865          66 :     for (SDep& SuccDep : SU->Succs) {
     866          21 :       SUnit *Succ = SuccDep.getSUnit();
     867          24 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     868           3 :         continue;
     869          36 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
     870           0 :         SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
     871             :     }
     872             :     // Keep color 0.
     873          15 :     if (SUColors.empty())
     874          15 :       continue;
     875             :     // Same color than parents.
     876           0 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     877           0 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     878           0 :         *SUColors.begin();
     879             :     else {
     880             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     881           0 :         ColorCombinations.find(SUColors);
     882           0 :       if (Pos != ColorCombinations.end()) {
     883           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
     884             :       } else {
     885           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     886           0 :           NextNonReservedID;
     887           0 :         ColorCombinations[SUColors] = NextNonReservedID++;
     888             :       }
     889             :     }
     890             :   }
     891           2 : }
     892             : 
     893           2 : void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
     894           4 :   unsigned DAGSize = DAG->SUnits.size();
     895           4 :   std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
     896             : 
     897             :   // Every combination of colors given by the top down
     898             :   // and bottom up Reserved node dependency
     899             : 
     900          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     901          32 :     SUnit *SU = &DAG->SUnits[i];
     902          16 :     std::pair<unsigned, unsigned> SUColors;
     903             : 
     904             :     // High latency instructions: already given.
     905          32 :     if (CurrentColoring[SU->NodeNum])
     906           1 :       continue;
     907             : 
     908          30 :     SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
     909          30 :     SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
     910             : 
     911             :     std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
     912          15 :       ColorCombinations.find(SUColors);
     913          15 :     if (Pos != ColorCombinations.end()) {
     914          24 :       CurrentColoring[SU->NodeNum] = Pos->second;
     915             :     } else {
     916           6 :       CurrentColoring[SU->NodeNum] = NextNonReservedID;
     917           3 :       ColorCombinations[SUColors] = NextNonReservedID++;
     918             :     }
     919             :   }
     920           2 : }
     921             : 
     922           2 : void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
     923           4 :   unsigned DAGSize = DAG->SUnits.size();
     924           3 :   std::vector<int> PendingColoring = CurrentColoring;
     925             : 
     926             :   assert(DAGSize >= 1 &&
     927             :          CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
     928             :          CurrentTopDownReservedDependencyColoring.size() == DAGSize);
     929             :   // If there is no reserved block at all, do nothing. We don't want
     930             :   // everything in one block.
     931           4 :   if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
     932          11 :                         CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
     933           2 :       *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
     934           4 :                         CurrentTopDownReservedDependencyColoring.end()) == 0)
     935           1 :     return;
     936             : 
     937           8 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     938           8 :     SUnit *SU = &DAG->SUnits[SUNum];
     939           4 :     std::set<unsigned> SUColors;
     940           4 :     std::set<unsigned> SUColorsPending;
     941             : 
     942           8 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     943           5 :       continue;
     944             : 
     945          12 :     if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
     946           6 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
     947           3 :       continue;
     948             : 
     949           0 :     for (SDep& SuccDep : SU->Succs) {
     950           0 :       SUnit *Succ = SuccDep.getSUnit();
     951           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     952           0 :         continue;
     953           0 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
     954           0 :           CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
     955           0 :         SUColors.insert(CurrentColoring[Succ->NodeNum]);
     956           0 :       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
     957             :     }
     958             :     // If there is only one child/parent block, and that block
     959             :     // is not among the ones we are removing in this path, then
     960             :     // merge the instruction to that block
     961           0 :     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
     962           0 :       PendingColoring[SU->NodeNum] = *SUColors.begin();
     963             :     else // TODO: Attribute new colors depending on color
     964             :          // combination of children.
     965           0 :       PendingColoring[SU->NodeNum] = NextNonReservedID++;
     966             :   }
     967           1 :   CurrentColoring = PendingColoring;
     968             : }
     969             : 
     970             : 
     971           0 : void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
     972           0 :   unsigned DAGSize = DAG->SUnits.size();
     973             :   unsigned PreviousColor;
     974           0 :   std::set<unsigned> SeenColors;
     975             : 
     976           0 :   if (DAGSize <= 1)
     977           0 :     return;
     978             : 
     979           0 :   PreviousColor = CurrentColoring[0];
     980             : 
     981           0 :   for (unsigned i = 1, e = DAGSize; i != e; ++i) {
     982           0 :     SUnit *SU = &DAG->SUnits[i];
     983           0 :     unsigned CurrentColor = CurrentColoring[i];
     984           0 :     unsigned PreviousColorSave = PreviousColor;
     985             :     assert(i == SU->NodeNum);
     986             : 
     987           0 :     if (CurrentColor != PreviousColor)
     988             :       SeenColors.insert(PreviousColor);
     989           0 :     PreviousColor = CurrentColor;
     990             : 
     991           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     992           0 :       continue;
     993             : 
     994           0 :     if (SeenColors.find(CurrentColor) == SeenColors.end())
     995           0 :       continue;
     996             : 
     997           0 :     if (PreviousColorSave != CurrentColor)
     998           0 :       CurrentColoring[i] = NextNonReservedID++;
     999             :     else
    1000           0 :       CurrentColoring[i] = CurrentColoring[i-1];
    1001             :   }
    1002             : }
    1003             : 
    1004           2 : void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
    1005           4 :   unsigned DAGSize = DAG->SUnits.size();
    1006             : 
    1007          24 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1008          32 :     SUnit *SU = &DAG->SUnits[SUNum];
    1009          23 :     std::set<unsigned> SUColors;
    1010             : 
    1011          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1012          10 :       continue;
    1013             : 
    1014             :     // No predecessor: Vgpr constant loading.
    1015             :     // Low latency instructions usually have a predecessor (the address)
    1016          40 :     if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
    1017           8 :       continue;
    1018             : 
    1019          30 :     for (SDep& SuccDep : SU->Succs) {
    1020           9 :       SUnit *Succ = SuccDep.getSUnit();
    1021          11 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1022           2 :         continue;
    1023          21 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1024             :     }
    1025           7 :     if (SUColors.size() == 1)
    1026          12 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1027             :   }
    1028           2 : }
    1029             : 
    1030           0 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
    1031           0 :   unsigned DAGSize = DAG->SUnits.size();
    1032             : 
    1033           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1034           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1035           0 :     std::set<unsigned> SUColors;
    1036             : 
    1037           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1038           0 :       continue;
    1039             : 
    1040           0 :     for (SDep& SuccDep : SU->Succs) {
    1041           0 :        SUnit *Succ = SuccDep.getSUnit();
    1042           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1043           0 :         continue;
    1044           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1045             :     }
    1046           0 :     if (SUColors.size() == 1)
    1047           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1048             :   }
    1049           0 : }
    1050             : 
    1051           2 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
    1052           4 :   unsigned DAGSize = DAG->SUnits.size();
    1053             : 
    1054          24 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1055          32 :     SUnit *SU = &DAG->SUnits[SUNum];
    1056          31 :     std::set<unsigned> SUColors;
    1057             : 
    1058          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1059           1 :       continue;
    1060             : 
    1061          66 :     for (SDep& SuccDep : SU->Succs) {
    1062          21 :        SUnit *Succ = SuccDep.getSUnit();
    1063          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1064           3 :         continue;
    1065          54 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1066             :     }
    1067          33 :     if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
    1068           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1069             :   }
    1070           2 : }
    1071             : 
    1072           0 : void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
    1073           0 :   unsigned DAGSize = DAG->SUnits.size();
    1074           0 :   std::map<unsigned, unsigned> ColorCount;
    1075             : 
    1076           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1077           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1078           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1079           0 :      ++ColorCount[color];
    1080             :   }
    1081             : 
    1082           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1083           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1084           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1085           0 :     std::set<unsigned> SUColors;
    1086             : 
    1087           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1088           0 :       continue;
    1089             : 
    1090           0 :     if (ColorCount[color] > 1)
    1091           0 :       continue;
    1092             : 
    1093           0 :     for (SDep& SuccDep : SU->Succs) {
    1094           0 :        SUnit *Succ = SuccDep.getSUnit();
    1095           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1096           0 :         continue;
    1097           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1098             :     }
    1099           0 :     if (SUColors.size() == 1 && *SUColors.begin() != color) {
    1100           0 :       --ColorCount[color];
    1101           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1102           0 :       ++ColorCount[*SUColors.begin()];
    1103             :     }
    1104             :   }
    1105           0 : }
    1106             : 
    1107           0 : void SIScheduleBlockCreator::cutHugeBlocks() {
    1108             :   // TODO
    1109           0 : }
    1110             : 
    1111           2 : void SIScheduleBlockCreator::regroupNoUserInstructions() {
    1112           4 :   unsigned DAGSize = DAG->SUnits.size();
    1113           2 :   int GroupID = NextNonReservedID++;
    1114             : 
    1115          24 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1116          32 :     SUnit *SU = &DAG->SUnits[SUNum];
    1117          16 :     bool hasSuccessor = false;
    1118             : 
    1119          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1120           1 :       continue;
    1121             : 
    1122          66 :     for (SDep& SuccDep : SU->Succs) {
    1123          21 :        SUnit *Succ = SuccDep.getSUnit();
    1124          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1125           3 :         continue;
    1126             :       hasSuccessor = true;
    1127             :     }
    1128          15 :     if (!hasSuccessor)
    1129           4 :       CurrentColoring[SU->NodeNum] = GroupID;
    1130             :   }
    1131           2 : }
    1132             : 
    1133           2 : void SIScheduleBlockCreator::colorExports() {
    1134           2 :   unsigned ExportColor = NextNonReservedID++;
    1135           4 :   SmallVector<unsigned, 8> ExpGroup;
    1136             : 
    1137             :   // Put all exports together in a block.
    1138             :   // The block will naturally end up being scheduled last,
    1139             :   // thus putting exports at the end of the schedule, which
    1140             :   // is better for performance.
    1141             :   // However we must ensure, for safety, the exports can be put
    1142             :   // together in the same block without any other instruction.
    1143             :   // This could happen, for example, when scheduling after regalloc
    1144             :   // if reloading a spilled register from memory using the same
    1145             :   // register than used in a previous export.
    1146             :   // If that happens, do not regroup the exports.
    1147          24 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
    1148          32 :     const SUnit &SU = DAG->SUnits[SUNum];
    1149          32 :     if (SIInstrInfo::isEXP(*SU.getInstr())) {
    1150             :       // Check the EXP can be added to the group safely,
    1151             :       // ie without needing any other instruction.
    1152             :       // The EXP is allowed to depend on other EXP
    1153             :       // (they will be in the same group).
    1154           3 :       for (unsigned j : ExpGroup) {
    1155             :         bool HasSubGraph;
    1156           0 :         std::vector<int> SubGraph;
    1157             :         // By construction (topological order), if SU and
    1158             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
    1159             :         // in the parent graph of SU.
    1160             : #ifndef NDEBUG
    1161             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
    1162             :                                                HasSubGraph);
    1163             :         assert(!HasSubGraph);
    1164             : #endif
    1165           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
    1166             :                                                HasSubGraph);
    1167           0 :         if (!HasSubGraph)
    1168           0 :           continue; // No dependencies between each other
    1169             : 
    1170             :         // SubGraph contains all the instructions required
    1171             :         // between EXP SUnits[j] and EXP SU.
    1172           0 :         for (unsigned k : SubGraph) {
    1173           0 :           if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
    1174             :             // Other instructions than EXP would be required in the group.
    1175             :             // Abort the groupping.
    1176           0 :             return;
    1177             :         }
    1178             :       }
    1179             : 
    1180           1 :       ExpGroup.push_back(SUNum);
    1181             :     }
    1182             :   }
    1183             : 
    1184             :   // The group can be formed. Give the color.
    1185           7 :   for (unsigned j : ExpGroup)
    1186           2 :     CurrentColoring[j] = ExportColor;
    1187             : }
    1188             : 
    1189           2 : void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
    1190           4 :   unsigned DAGSize = DAG->SUnits.size();
    1191           4 :   std::map<unsigned,unsigned> RealID;
    1192             : 
    1193           4 :   CurrentBlocks.clear();
    1194           4 :   CurrentColoring.clear();
    1195           2 :   CurrentColoring.resize(DAGSize, 0);
    1196           4 :   Node2CurrentBlock.clear();
    1197             : 
    1198             :   // Restore links previous scheduling variant has overridden.
    1199           2 :   DAG->restoreSULinksLeft();
    1200             : 
    1201           2 :   NextReservedID = 1;
    1202           2 :   NextNonReservedID = DAGSize + 1;
    1203             : 
    1204             :   DEBUG(dbgs() << "Coloring the graph\n");
    1205             : 
    1206           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    1207           0 :     colorHighLatenciesGroups();
    1208             :   else
    1209           2 :     colorHighLatenciesAlone();
    1210           2 :   colorComputeReservedDependencies();
    1211           2 :   colorAccordingToReservedDependencies();
    1212           2 :   colorEndsAccordingToDependencies();
    1213           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    1214           0 :     colorForceConsecutiveOrderInGroup();
    1215           2 :   regroupNoUserInstructions();
    1216           2 :   colorMergeConstantLoadsNextGroup();
    1217           2 :   colorMergeIfPossibleNextGroupOnlyForReserved();
    1218           2 :   colorExports();
    1219             : 
    1220             :   // Put SUs of same color into same block
    1221           2 :   Node2CurrentBlock.resize(DAGSize, -1);
    1222          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1223          32 :     SUnit *SU = &DAG->SUnits[i];
    1224          32 :     unsigned Color = CurrentColoring[SU->NodeNum];
    1225          32 :     if (RealID.find(Color) == RealID.end()) {
    1226          10 :       int ID = CurrentBlocks.size();
    1227          15 :       BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
    1228          25 :       CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
    1229           5 :       RealID[Color] = ID;
    1230             :     }
    1231          32 :     CurrentBlocks[RealID[Color]]->addUnit(SU);
    1232          32 :     Node2CurrentBlock[SU->NodeNum] = RealID[Color];
    1233             :   }
    1234             : 
    1235             :   // Build dependencies between blocks.
    1236          34 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1237          32 :     SUnit *SU = &DAG->SUnits[i];
    1238          32 :     int SUID = Node2CurrentBlock[i];
    1239          72 :      for (SDep& SuccDep : SU->Succs) {
    1240          24 :        SUnit *Succ = SuccDep.getSUnit();
    1241          24 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1242           3 :         continue;
    1243          42 :       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
    1244          36 :         CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
    1245           9 :                                      SuccDep.isCtrl() ? NoData : Data);
    1246             :     }
    1247          69 :     for (SDep& PredDep : SU->Preds) {
    1248          21 :       SUnit *Pred = PredDep.getSUnit();
    1249          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
    1250           0 :         continue;
    1251          42 :       if (Node2CurrentBlock[Pred->NodeNum] != SUID)
    1252          27 :         CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    1253             :     }
    1254             :   }
    1255             : 
    1256             :   // Free root and leafs of all blocks to enable scheduling inside them.
    1257           9 :   for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1258          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1259           5 :     Block->finalizeUnits();
    1260             :   }
    1261             :   DEBUG(
    1262             :     dbgs() << "Blocks created:\n\n";
    1263             :     for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1264             :       SIScheduleBlock *Block = CurrentBlocks[i];
    1265             :       Block->printDebug(true);
    1266             :     }
    1267             :   );
    1268           2 : }
    1269             : 
    1270             : // Two functions taken from Codegen/MachineScheduler.cpp
    1271             : 
    1272             : /// Non-const version.
    1273             : static MachineBasicBlock::iterator
    1274             : nextIfDebug(MachineBasicBlock::iterator I,
    1275             :             MachineBasicBlock::const_iterator End) {
    1276          24 :   for (; I != End; ++I) {
    1277          20 :     if (!I->isDebugValue())
    1278             :       break;
    1279             :   }
    1280             :   return I;
    1281             : }
    1282             : 
    1283           2 : void SIScheduleBlockCreator::topologicalSort() {
    1284           4 :   unsigned DAGSize = CurrentBlocks.size();
    1285           4 :   std::vector<int> WorkList;
    1286             : 
    1287             :   DEBUG(dbgs() << "Topological Sort\n");
    1288             : 
    1289           2 :   WorkList.reserve(DAGSize);
    1290           2 :   TopDownIndex2Block.resize(DAGSize);
    1291           2 :   TopDownBlock2Index.resize(DAGSize);
    1292           2 :   BottomUpIndex2Block.resize(DAGSize);
    1293             : 
    1294           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1295          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1296           5 :     unsigned Degree = Block->getSuccs().size();
    1297          10 :     TopDownBlock2Index[i] = Degree;
    1298           5 :     if (Degree == 0) {
    1299           4 :       WorkList.push_back(i);
    1300             :     }
    1301             :   }
    1302             : 
    1303           2 :   int Id = DAGSize;
    1304           7 :   while (!WorkList.empty()) {
    1305           5 :     int i = WorkList.back();
    1306          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1307           5 :     WorkList.pop_back();
    1308          10 :     TopDownBlock2Index[i] = --Id;
    1309          10 :     TopDownIndex2Block[Id] = i;
    1310          24 :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1311           8 :       if (!--TopDownBlock2Index[Pred->getID()])
    1312           6 :         WorkList.push_back(Pred->getID());
    1313             :     }
    1314             :   }
    1315             : 
    1316             : #ifndef NDEBUG
    1317             :   // Check correctness of the ordering.
    1318             :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1319             :     SIScheduleBlock *Block = CurrentBlocks[i];
    1320             :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1321             :       assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
    1322             :       "Wrong Top Down topological sorting");
    1323             :     }
    1324             :   }
    1325             : #endif
    1326             : 
    1327          14 :   BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
    1328           8 :                                          TopDownIndex2Block.rend());
    1329           2 : }
    1330             : 
    1331           2 : void SIScheduleBlockCreator::scheduleInsideBlocks() {
    1332           4 :   unsigned DAGSize = CurrentBlocks.size();
    1333             : 
    1334             :   DEBUG(dbgs() << "\nScheduling Blocks\n\n");
    1335             : 
    1336             :   // We do schedule a valid scheduling such that a Block corresponds
    1337             :   // to a range of instructions.
    1338             :   DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
    1339           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1340          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1341           5 :     Block->fastSchedule();
    1342             :   }
    1343             : 
    1344             :   // Note: the following code, and the part restoring previous position
    1345             :   // is by far the most expensive operation of the Scheduler.
    1346             : 
    1347             :   // Do not update CurrentTop.
    1348           2 :   MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
    1349           4 :   std::vector<MachineBasicBlock::iterator> PosOld;
    1350           4 :   std::vector<MachineBasicBlock::iterator> PosNew;
    1351           4 :   PosOld.reserve(DAG->SUnits.size());
    1352           4 :   PosNew.reserve(DAG->SUnits.size());
    1353             : 
    1354           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1355          10 :     int BlockIndice = TopDownIndex2Block[i];
    1356          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1357          10 :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1358             : 
    1359          36 :     for (SUnit* SU : SUs) {
    1360          16 :       MachineInstr *MI = SU->getInstr();
    1361          16 :       MachineBasicBlock::iterator Pos = MI;
    1362          16 :       PosOld.push_back(Pos);
    1363          16 :       if (&*CurrentTopFastSched == MI) {
    1364          12 :         PosNew.push_back(Pos);
    1365          24 :         CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
    1366          24 :                                           DAG->getCurrentBottom());
    1367             :       } else {
    1368             :         // Update the instruction stream.
    1369           8 :         DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
    1370             : 
    1371             :         // Update LiveIntervals.
    1372             :         // Note: Moving all instructions and calling handleMove every time
    1373             :         // is the most cpu intensive operation of the scheduler.
    1374             :         // It would gain a lot if there was a way to recompute the
    1375             :         // LiveIntervals for the entire scheduling region.
    1376           4 :         DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
    1377           4 :         PosNew.push_back(CurrentTopFastSched);
    1378             :       }
    1379             :     }
    1380             :   }
    1381             : 
    1382             :   // Now we have Block of SUs == Block of MI.
    1383             :   // We do the final schedule for the instructions inside the block.
    1384             :   // The property that all the SUs of the Block are grouped together as MI
    1385             :   // is used for correct reg usage tracking.
    1386          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1387          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1388          10 :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1389          25 :     Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
    1390             :   }
    1391             : 
    1392             :   DEBUG(dbgs() << "Restoring MI Pos\n");
    1393             :   // Restore old ordering (which prevents a LIS->handleMove bug).
    1394           4 :   for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    1395          32 :     MachineBasicBlock::iterator POld = PosOld[i-1];
    1396          32 :     MachineBasicBlock::iterator PNew = PosNew[i-1];
    1397          16 :     if (PNew != POld) {
    1398             :       // Update the instruction stream.
    1399           4 :       DAG->getBB()->splice(POld, DAG->getBB(), PNew);
    1400             : 
    1401             :       // Update LiveIntervals.
    1402           8 :       DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    1403             :     }
    1404             :   }
    1405             : 
    1406             :   DEBUG(
    1407             :     for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1408             :       SIScheduleBlock *Block = CurrentBlocks[i];
    1409             :       Block->printDebug(true);
    1410             :     }
    1411             :   );
    1412           2 : }
    1413             : 
    1414           2 : void SIScheduleBlockCreator::fillStats() {
    1415           4 :   unsigned DAGSize = CurrentBlocks.size();
    1416             : 
    1417           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1418          10 :     int BlockIndice = TopDownIndex2Block[i];
    1419          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1420          10 :     if (Block->getPreds().empty())
    1421           2 :       Block->Depth = 0;
    1422             :     else {
    1423             :       unsigned Depth = 0;
    1424           7 :       for (SIScheduleBlock *Pred : Block->getPreds()) {
    1425           8 :         if (Depth < Pred->Depth + Pred->getCost())
    1426           6 :           Depth = Pred->Depth + Pred->getCost();
    1427             :       }
    1428           3 :       Block->Depth = Depth;
    1429             :     }
    1430             :   }
    1431             : 
    1432          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1433          10 :     int BlockIndice = BottomUpIndex2Block[i];
    1434          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1435           5 :     if (Block->getSuccs().empty())
    1436           2 :       Block->Height = 0;
    1437             :     else {
    1438           3 :       unsigned Height = 0;
    1439          10 :       for (const auto &Succ : Block->getSuccs())
    1440          12 :         Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
    1441           3 :       Block->Height = Height;
    1442             :     }
    1443             :   }
    1444           2 : }
    1445             : 
    1446             : // SIScheduleBlockScheduler //
    1447             : 
    1448           2 : SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
    1449             :                                                    SISchedulerBlockSchedulerVariant Variant,
    1450           2 :                                                    SIScheduleBlocks  BlocksStruct) :
    1451             :   DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
    1452             :   LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
    1453          18 :   SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
    1454             : 
    1455             :   // Fill the usage of every output
    1456             :   // Warning: while by construction we always have a link between two blocks
    1457             :   // when one needs a result from the other, the number of users of an output
    1458             :   // is not the sum of child blocks having as input the same virtual register.
    1459             :   // Here is an example. A produces x and y. B eats x and produces x'.
    1460             :   // C eats x' and y. The register coalescer may have attributed the same
    1461             :   // virtual register to x and x'.
    1462             :   // To count accurately, we do a topological sort. In case the register is
    1463             :   // found for several parents, we increment the usage of the one with the
    1464             :   // highest topological index.
    1465           4 :   LiveOutRegsNumUsages.resize(Blocks.size());
    1466           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1467          10 :     SIScheduleBlock *Block = Blocks[i];
    1468          23 :     for (unsigned Reg : Block->getInRegs()) {
    1469           8 :       bool Found = false;
    1470           8 :       int topoInd = -1;
    1471          39 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1472          21 :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1473           7 :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1474             : 
    1475           7 :         if (RegPos != PredOutRegs.end()) {
    1476           5 :           Found = true;
    1477          10 :           if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
    1478           5 :             topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
    1479             :           }
    1480             :         }
    1481             :       }
    1482             : 
    1483           8 :       if (!Found)
    1484           3 :         continue;
    1485             : 
    1486          10 :       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
    1487          10 :       ++LiveOutRegsNumUsages[PredID][Reg];
    1488             :     }
    1489             :   }
    1490             : 
    1491           4 :   LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
    1492           4 :   BlockNumPredsLeft.resize(Blocks.size());
    1493           4 :   BlockNumSuccsLeft.resize(Blocks.size());
    1494             : 
    1495           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1496          10 :     SIScheduleBlock *Block = Blocks[i];
    1497          15 :     BlockNumPredsLeft[i] = Block->getPreds().size();
    1498          10 :     BlockNumSuccsLeft[i] = Block->getSuccs().size();
    1499             :   }
    1500             : 
    1501             : #ifndef NDEBUG
    1502             :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1503             :     SIScheduleBlock *Block = Blocks[i];
    1504             :     assert(Block->getID() == i);
    1505             :   }
    1506             : #endif
    1507             : 
    1508           4 :   std::set<unsigned> InRegs = DAG->getInRegs();
    1509           2 :   addLiveRegs(InRegs);
    1510             : 
    1511             :   // Increase LiveOutRegsNumUsages for blocks
    1512             :   // producing registers consumed in another
    1513             :   // scheduling region.
    1514          11 :   for (unsigned Reg : DAG->getOutRegs()) {
    1515           6 :     for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1516             :       // Do reverse traversal
    1517           6 :       int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
    1518           6 :       SIScheduleBlock *Block = Blocks[ID];
    1519           3 :       const std::set<unsigned> &OutRegs = Block->getOutRegs();
    1520             : 
    1521           6 :       if (OutRegs.find(Reg) == OutRegs.end())
    1522             :         continue;
    1523             : 
    1524           6 :       ++LiveOutRegsNumUsages[ID][Reg];
    1525           3 :       break;
    1526             :     }
    1527             :   }
    1528             : 
    1529             :   // Fill LiveRegsConsumers for regs that were already
    1530             :   // defined before scheduling.
    1531           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1532          10 :     SIScheduleBlock *Block = Blocks[i];
    1533          23 :     for (unsigned Reg : Block->getInRegs()) {
    1534           8 :       bool Found = false;
    1535          32 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1536          10 :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1537           5 :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1538             : 
    1539           5 :         if (RegPos != PredOutRegs.end()) {
    1540           5 :           Found = true;
    1541           5 :           break;
    1542             :         }
    1543             :       }
    1544             : 
    1545             :       if (!Found)
    1546           3 :         ++LiveRegsConsumers[Reg];
    1547             :     }
    1548             :   }
    1549             : 
    1550           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1551          10 :     SIScheduleBlock *Block = Blocks[i];
    1552          10 :     if (BlockNumPredsLeft[i] == 0) {
    1553           2 :       ReadyBlocks.push_back(Block);
    1554             :     }
    1555             :   }
    1556             : 
    1557           7 :   while (SIScheduleBlock *Block = pickBlock()) {
    1558           5 :     BlocksScheduled.push_back(Block);
    1559           5 :     blockScheduled(Block);
    1560           5 :   }
    1561             : 
    1562             :   DEBUG(
    1563             :     dbgs() << "Block Order:";
    1564             :     for (SIScheduleBlock* Block : BlocksScheduled) {
    1565             :       dbgs() << ' ' << Block->getID();
    1566             :     }
    1567             :     dbgs() << '\n';
    1568             :   );
    1569           2 : }
    1570             : 
    1571           5 : bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
    1572             :                                                    SIBlockSchedCandidate &TryCand) {
    1573           5 :   if (!Cand.isValid()) {
    1574           5 :     TryCand.Reason = NodeOrder;
    1575           5 :     return true;
    1576             :   }
    1577             : 
    1578             :   // Try to hide high latencies.
    1579           0 :   if (tryLess(TryCand.LastPosHighLatParentScheduled,
    1580           0 :               Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    1581             :     return true;
    1582             :   // Schedule high latencies early so you can hide them better.
    1583           0 :   if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
    1584             :                  TryCand, Cand, Latency))
    1585             :     return true;
    1586           0 :   if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height,
    1587             :                                           TryCand, Cand, Depth))
    1588             :     return true;
    1589           0 :   if (tryGreater(TryCand.NumHighLatencySuccessors,
    1590           0 :                  Cand.NumHighLatencySuccessors,
    1591             :                  TryCand, Cand, Successor))
    1592             :     return true;
    1593             :   return false;
    1594             : }
    1595             : 
    1596           0 : bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
    1597             :                                                     SIBlockSchedCandidate &TryCand) {
    1598           0 :   if (!Cand.isValid()) {
    1599           0 :     TryCand.Reason = NodeOrder;
    1600           0 :     return true;
    1601             :   }
    1602             : 
    1603           0 :   if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
    1604             :               TryCand, Cand, RegUsage))
    1605             :     return true;
    1606           0 :   if (tryGreater(TryCand.NumSuccessors > 0,
    1607           0 :                  Cand.NumSuccessors > 0,
    1608             :                  TryCand, Cand, Successor))
    1609             :     return true;
    1610           0 :   if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    1611             :     return true;
    1612           0 :   if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
    1613             :               TryCand, Cand, RegUsage))
    1614             :     return true;
    1615             :   return false;
    1616             : }
    1617             : 
    1618           7 : SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
    1619           7 :   SIBlockSchedCandidate Cand;
    1620           7 :   std::vector<SIScheduleBlock*>::iterator Best;
    1621             :   SIScheduleBlock *Block;
    1622          14 :   if (ReadyBlocks.empty())
    1623             :     return nullptr;
    1624             : 
    1625          15 :   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
    1626             :                         VregCurrentUsage, SregCurrentUsage);
    1627           5 :   if (VregCurrentUsage > maxVregUsage)
    1628           3 :     maxVregUsage = VregCurrentUsage;
    1629           5 :   if (SregCurrentUsage > maxSregUsage)
    1630           2 :     maxSregUsage = SregCurrentUsage;
    1631             :   DEBUG(
    1632             :     dbgs() << "Picking New Blocks\n";
    1633             :     dbgs() << "Available: ";
    1634             :     for (SIScheduleBlock* Block : ReadyBlocks)
    1635             :       dbgs() << Block->getID() << ' ';
    1636             :     dbgs() << "\nCurrent Live:\n";
    1637             :     for (unsigned Reg : LiveRegs)
    1638             :       dbgs() << PrintVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    1639             :     dbgs() << '\n';
    1640             :     dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    1641             :     dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';
    1642             :   );
    1643             : 
    1644           5 :   Cand.Block = nullptr;
    1645          10 :   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
    1646          20 :        E = ReadyBlocks.end(); I != E; ++I) {
    1647           5 :     SIBlockSchedCandidate TryCand;
    1648           5 :     TryCand.Block = *I;
    1649           5 :     TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    1650           5 :     TryCand.VGPRUsageDiff =
    1651          15 :       checkRegUsageImpact(TryCand.Block->getInRegs(),
    1652          15 :                           TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    1653          10 :     TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    1654           5 :     TryCand.NumHighLatencySuccessors =
    1655           5 :       TryCand.Block->getNumHighLatencySuccessors();
    1656           5 :     TryCand.LastPosHighLatParentScheduled =
    1657           5 :       (unsigned int) std::max<int> (0,
    1658          20 :          LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
    1659          20 :            LastPosWaitedHighLatency);
    1660           5 :     TryCand.Height = TryCand.Block->Height;
    1661             :     // Try not to increase VGPR usage too much, else we may spill.
    1662          10 :     if (VregCurrentUsage > 120 ||
    1663           5 :         Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
    1664           0 :       if (!tryCandidateRegUsage(Cand, TryCand) &&
    1665           0 :           Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
    1666           0 :         tryCandidateLatency(Cand, TryCand);
    1667             :     } else {
    1668           5 :       if (!tryCandidateLatency(Cand, TryCand))
    1669           0 :         tryCandidateRegUsage(Cand, TryCand);
    1670             :     }
    1671           5 :     if (TryCand.Reason != NoCand) {
    1672           5 :       Cand.setBest(TryCand);
    1673           5 :       Best = I;
    1674             :       DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
    1675             :                    << getReasonStr(Cand.Reason) << '\n');
    1676             :     }
    1677             :   }
    1678             : 
    1679             :   DEBUG(
    1680             :     dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    1681             :     dbgs() << "Is a block with high latency instruction: "
    1682             :       << (Cand.IsHighLatency ? "yes\n" : "no\n");
    1683             :     dbgs() << "Position of last high latency dependency: "
    1684             :            << Cand.LastPosHighLatParentScheduled << '\n';
    1685             :     dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    1686             :     dbgs() << '\n';
    1687             :   );
    1688             : 
    1689           5 :   Block = Cand.Block;
    1690          15 :   ReadyBlocks.erase(Best);
    1691           5 :   return Block;
    1692             : }
    1693             : 
    1694             : // Tracking of currently alive registers to determine VGPR Usage.
    1695             : 
    1696           7 : void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
    1697          40 :   for (unsigned Reg : Regs) {
    1698             :     // For now only track virtual registers.
    1699          38 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1700           7 :       continue;
    1701             :     // If not already in the live set, then add it.
    1702          12 :     (void) LiveRegs.insert(Reg);
    1703             :   }
    1704           7 : }
    1705             : 
    1706           5 : void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
    1707             :                                        std::set<unsigned> &Regs) {
    1708          23 :   for (unsigned Reg : Regs) {
    1709             :     // For now only track virtual registers.
    1710          16 :     std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    1711             :     assert (Pos != LiveRegs.end() && // Reg must be live.
    1712             :                LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
    1713             :                LiveRegsConsumers[Reg] >= 1);
    1714           8 :     --LiveRegsConsumers[Reg];
    1715           8 :     if (LiveRegsConsumers[Reg] == 0)
    1716           8 :       LiveRegs.erase(Pos);
    1717             :   }
    1718           5 : }
    1719             : 
    1720           5 : void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
    1721          14 :   for (const auto &Block : Parent->getSuccs()) {
    1722           8 :     if (--BlockNumPredsLeft[Block.first->getID()] == 0)
    1723           3 :       ReadyBlocks.push_back(Block.first);
    1724             : 
    1725           6 :     if (Parent->isHighLatencyBlock() &&
    1726           2 :         Block.second == SIScheduleBlockLinkKind::Data)
    1727           2 :       LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
    1728             :   }
    1729           5 : }
    1730             : 
    1731           5 : void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
    1732           5 :   decreaseLiveRegs(Block, Block->getInRegs());
    1733           5 :   addLiveRegs(Block->getOutRegs());
    1734           5 :   releaseBlockSuccs(Block);
    1735             :   for (std::map<unsigned, unsigned>::iterator RegI =
    1736          15 :        LiveOutRegsNumUsages[Block->getID()].begin(),
    1737          10 :        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    1738          16 :     std::pair<unsigned, unsigned> RegP = *RegI;
    1739             :     // We produce this register, thus it must not be previously alive.
    1740             :     assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
    1741             :            LiveRegsConsumers[RegP.first] == 0);
    1742           8 :     LiveRegsConsumers[RegP.first] += RegP.second;
    1743             :   }
    1744          15 :   if (LastPosHighLatencyParentScheduled[Block->getID()] >
    1745           5 :         (unsigned)LastPosWaitedHighLatency)
    1746           0 :     LastPosWaitedHighLatency =
    1747           0 :       LastPosHighLatencyParentScheduled[Block->getID()];
    1748           5 :   ++NumBlockScheduled;
    1749           5 : }
    1750             : 
    1751             : std::vector<int>
    1752           5 : SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
    1753             :                                      std::set<unsigned> &OutRegs) {
    1754           5 :   std::vector<int> DiffSetPressure;
    1755          10 :   DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
    1756             : 
    1757          23 :   for (unsigned Reg : InRegs) {
    1758             :     // For now only track virtual registers.
    1759          16 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1760           0 :       continue;
    1761           8 :     if (LiveRegsConsumers[Reg] > 1)
    1762           0 :       continue;
    1763          16 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1764          34 :     for (; PSetI.isValid(); ++PSetI) {
    1765          39 :       DiffSetPressure[*PSetI] -= PSetI.getWeight();
    1766             :     }
    1767             :   }
    1768             : 
    1769          23 :   for (unsigned Reg : OutRegs) {
    1770             :     // For now only track virtual registers.
    1771           8 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1772           0 :       continue;
    1773          16 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1774          34 :     for (; PSetI.isValid(); ++PSetI) {
    1775          39 :       DiffSetPressure[*PSetI] += PSetI.getWeight();
    1776             :     }
    1777             :   }
    1778             : 
    1779           5 :   return DiffSetPressure;
    1780             : }
    1781             : 
    1782             : // SIScheduler //
    1783             : 
    1784             : struct SIScheduleBlockResult
    1785           2 : SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
    1786             :                              SISchedulerBlockSchedulerVariant ScheduleVariant) {
    1787           4 :   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
    1788           4 :   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
    1789           4 :   std::vector<SIScheduleBlock*> ScheduledBlocks;
    1790           2 :   struct SIScheduleBlockResult Res;
    1791             : 
    1792           6 :   ScheduledBlocks = Scheduler.getBlocks();
    1793             : 
    1794          14 :   for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    1795          10 :     SIScheduleBlock *Block = ScheduledBlocks[b];
    1796          10 :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1797             : 
    1798          36 :     for (SUnit* SU : SUs)
    1799          16 :       Res.SUs.push_back(SU->NodeNum);
    1800             :   }
    1801             : 
    1802           2 :   Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
    1803           2 :   Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
    1804           2 :   return Res;
    1805             : }
    1806             : 
    1807             : // SIScheduleDAGMI //
    1808             : 
    1809           1 : SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
    1810          13 :   ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
    1811           1 :   SITII = static_cast<const SIInstrInfo*>(TII);
    1812           1 :   SITRI = static_cast<const SIRegisterInfo*>(TRI);
    1813             : 
    1814           1 :   VGPRSetID = SITRI->getVGPRPressureSet();
    1815           1 :   SGPRSetID = SITRI->getSGPRPressureSet();
    1816           1 : }
    1817             : 
    1818             : SIScheduleDAGMI::~SIScheduleDAGMI() = default;
    1819             : 
    1820             : // Code adapted from scheduleDAG.cpp
    1821             : // Does a topological sort over the SUs.
    1822             : // Both TopDown and BottomUp
    1823           2 : void SIScheduleDAGMI::topologicalSort() {
    1824           2 :   Topo.InitDAGTopologicalSorting();
    1825             : 
    1826          14 :   TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
    1827          14 :   BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
    1828           2 : }
    1829             : 
    1830             : // Move low latencies further from their user without
    1831             : // increasing SGPR usage (in general)
    1832             : // This is to be replaced by a better pass that would
    1833             : // take into account SGPR usage (based on VGPR Usage
    1834             : // and the corresponding wavefront count), that would
    1835             : // try to merge groups of loads if it make sense, etc
    1836           2 : void SIScheduleDAGMI::moveLowLatencies() {
    1837           4 :    unsigned DAGSize = SUnits.size();
    1838           2 :    int LastLowLatencyUser = -1;
    1839           2 :    int LastLowLatencyPos = -1;
    1840             : 
    1841          20 :    for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    1842          48 :     SUnit *SU = &SUnits[ScheduledSUnits[i]];
    1843          16 :     bool IsLowLatencyUser = false;
    1844          16 :     unsigned MinPos = 0;
    1845             : 
    1846          69 :     for (SDep& PredDep : SU->Preds) {
    1847          21 :       SUnit *Pred = PredDep.getSUnit();
    1848          21 :       if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
    1849           0 :         IsLowLatencyUser = true;
    1850             :       }
    1851          21 :       if (Pred->NodeNum >= DAGSize)
    1852           0 :         continue;
    1853          42 :       unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
    1854          21 :       if (PredPos >= MinPos)
    1855          10 :         MinPos = PredPos + 1;
    1856             :     }
    1857             : 
    1858          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1859           2 :       unsigned BestPos = LastLowLatencyUser + 1;
    1860           2 :       if ((int)BestPos <= LastLowLatencyPos)
    1861           1 :         BestPos = LastLowLatencyPos + 1;
    1862           2 :       if (BestPos < MinPos)
    1863           1 :         BestPos = MinPos;
    1864           2 :       if (BestPos < i) {
    1865          34 :         for (unsigned u = i; u > BestPos; --u) {
    1866          48 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1867          32 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1868             :         }
    1869           4 :         ScheduledSUnits[BestPos] = SU->NodeNum;
    1870           4 :         ScheduledSUnitsInv[SU->NodeNum] = BestPos;
    1871             :       }
    1872           2 :       LastLowLatencyPos = BestPos;
    1873           2 :       if (IsLowLatencyUser)
    1874           0 :         LastLowLatencyUser = BestPos;
    1875          14 :     } else if (IsLowLatencyUser) {
    1876           0 :       LastLowLatencyUser = i;
    1877             :     // Moves COPY instructions on which depends
    1878             :     // the low latency instructions too.
    1879          14 :     } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
    1880           5 :       bool CopyForLowLat = false;
    1881          22 :       for (SDep& SuccDep : SU->Succs) {
    1882           7 :         SUnit *Succ = SuccDep.getSUnit();
    1883           7 :         if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
    1884           2 :           CopyForLowLat = true;
    1885             :         }
    1886             :       }
    1887           5 :       if (!CopyForLowLat)
    1888           3 :         continue;
    1889           2 :       if (MinPos < i) {
    1890          36 :         for (unsigned u = i; u > MinPos; --u) {
    1891          51 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1892          34 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1893             :         }
    1894           4 :         ScheduledSUnits[MinPos] = SU->NodeNum;
    1895           4 :         ScheduledSUnitsInv[SU->NodeNum] = MinPos;
    1896             :       }
    1897             :     }
    1898             :   }
    1899           2 : }
    1900             : 
    1901           2 : void SIScheduleDAGMI::restoreSULinksLeft() {
    1902          20 :   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    1903          32 :     SUnits[i].isScheduled = false;
    1904          48 :     SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    1905          48 :     SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    1906          48 :     SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    1907          48 :     SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
    1908             :   }
    1909           2 : }
    1910             : 
    1911             : // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
    1912             : template<typename _Iterator> void
    1913           5 : SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
    1914             :                                   unsigned &VgprUsage, unsigned &SgprUsage) {
    1915           5 :   VgprUsage = 0;
    1916           5 :   SgprUsage = 0;
    1917          20 :   for (_Iterator RegI = First; RegI != End; ++RegI) {
    1918          10 :     unsigned Reg = *RegI;
    1919             :     // For now only track virtual registers
    1920          10 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1921           0 :       continue;
    1922          20 :     PSetIterator PSetI = MRI.getPressureSets(Reg);
    1923          44 :     for (; PSetI.isValid(); ++PSetI) {
    1924          34 :       if (*PSetI == VGPRSetID)
    1925           6 :         VgprUsage += PSetI.getWeight();
    1926          11 :       else if (*PSetI == SGPRSetID)
    1927           4 :         SgprUsage += PSetI.getWeight();
    1928             :     }
    1929             :   }
    1930           5 : }
    1931             : 
    1932           2 : void SIScheduleDAGMI::schedule()
    1933             : {
    1934           8 :   SmallVector<SUnit*, 8> TopRoots, BotRoots;
    1935           8 :   SIScheduleBlockResult Best, Temp;
    1936             :   DEBUG(dbgs() << "Preparing Scheduling\n");
    1937             : 
    1938           2 :   buildDAGWithRegPressure();
    1939             :   DEBUG(
    1940             :     for(SUnit& SU : SUnits)
    1941             :        SU.dumpAll(this)
    1942             :   );
    1943             : 
    1944           2 :   topologicalSort();
    1945           2 :   findRootsAndBiasEdges(TopRoots, BotRoots);
    1946             :   // We reuse several ScheduleDAGMI and ScheduleDAGMILive
    1947             :   // functions, but to make them happy we must initialize
    1948             :   // the default Scheduler implementation (even if we do not
    1949             :   // run it)
    1950           4 :   SchedImpl->initialize(this);
    1951           6 :   initQueues(TopRoots, BotRoots);
    1952             : 
    1953             :   // Fill some stats to help scheduling.
    1954             : 
    1955           2 :   SUnitsLinksBackup = SUnits;
    1956           4 :   IsLowLatencySU.clear();
    1957           4 :   LowLatencyOffset.clear();
    1958           4 :   IsHighLatencySU.clear();
    1959             : 
    1960           4 :   IsLowLatencySU.resize(SUnits.size(), 0);
    1961           4 :   LowLatencyOffset.resize(SUnits.size(), 0);
    1962           4 :   IsHighLatencySU.resize(SUnits.size(), 0);
    1963             : 
    1964          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    1965          32 :     SUnit *SU = &SUnits[i];
    1966             :     unsigned BaseLatReg;
    1967             :     int64_t OffLatReg;
    1968          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1969           4 :       IsLowLatencySU[i] = 1;
    1970           2 :       if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
    1971             :                                        TRI))
    1972           4 :         LowLatencyOffset[i] = OffLatReg;
    1973          14 :     } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
    1974           2 :       IsHighLatencySU[i] = 1;
    1975             :   }
    1976             : 
    1977           4 :   SIScheduler Scheduler(this);
    1978           6 :   Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
    1979             :                                    SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
    1980             : 
    1981             :   // if VGPR usage is extremely high, try other good performing variants
    1982             :   // which could lead to lower VGPR usage
    1983           2 :   if (Best.MaxVGPRUsage > 180) {
    1984             :     static const std::pair<SISchedulerBlockCreatorVariant,
    1985             :                            SISchedulerBlockSchedulerVariant>
    1986             :         Variants[] = {
    1987             :       { LatenciesAlone, BlockRegUsageLatency },
    1988             : //      { LatenciesAlone, BlockRegUsage },
    1989             :       { LatenciesGrouped, BlockLatencyRegUsage },
    1990             : //      { LatenciesGrouped, BlockRegUsageLatency },
    1991             : //      { LatenciesGrouped, BlockRegUsage },
    1992             :       { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    1993             : //      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    1994             : //      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    1995             :     };
    1996           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    1997           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    1998           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    1999             :         Best = Temp;
    2000             :     }
    2001             :   }
    2002             :   // if VGPR usage is still extremely high, we may spill. Try other variants
    2003             :   // which are less performing, but that could lead to lower VGPR usage.
    2004           2 :   if (Best.MaxVGPRUsage > 200) {
    2005             :     static const std::pair<SISchedulerBlockCreatorVariant,
    2006             :                            SISchedulerBlockSchedulerVariant>
    2007             :         Variants[] = {
    2008             : //      { LatenciesAlone, BlockRegUsageLatency },
    2009             :       { LatenciesAlone, BlockRegUsage },
    2010             : //      { LatenciesGrouped, BlockLatencyRegUsage },
    2011             :       { LatenciesGrouped, BlockRegUsageLatency },
    2012             :       { LatenciesGrouped, BlockRegUsage },
    2013             : //      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    2014             :       { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    2015             :       { LatenciesAlonePlusConsecutive, BlockRegUsage }
    2016             :     };
    2017           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    2018           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    2019           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    2020             :         Best = Temp;
    2021             :     }
    2022             :   }
    2023             : 
    2024           2 :   ScheduledSUnits = Best.SUs;
    2025           4 :   ScheduledSUnitsInv.resize(SUnits.size());
    2026             : 
    2027          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    2028          48 :     ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
    2029             :   }
    2030             : 
    2031           2 :   moveLowLatencies();
    2032             : 
    2033             :   // Tell the outside world about the result of the scheduling.
    2034             : 
    2035             :   assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
    2036           4 :   TopRPTracker.setPos(CurrentTop);
    2037             : 
    2038           4 :   for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
    2039          22 :        E = ScheduledSUnits.end(); I != E; ++I) {
    2040          32 :     SUnit *SU = &SUnits[*I];
    2041             : 
    2042          16 :     scheduleMI(SU, true);
    2043             : 
    2044             :     DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
    2045             :                  << *SU->getInstr());
    2046             :   }
    2047             : 
    2048             :   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
    2049             : 
    2050           2 :   placeDebugValues();
    2051             : 
    2052             :   DEBUG({
    2053             :       unsigned BBNum = begin()->getParent()->getNumber();
    2054             :       dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
    2055             :       dumpSchedule();
    2056             :       dbgs() << '\n';
    2057             :     });
    2058           2 : }

Generated by: LCOV version 1.13