LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIMachineScheduler.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 640 812 78.8 %
Date: 2018-07-13 00:08:38 Functions: 46 52 88.5 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// SI Machine Scheduler interface
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIMachineScheduler.h"
      16             : #include "AMDGPU.h"
      17             : #include "SIInstrInfo.h"
      18             : #include "SIRegisterInfo.h"
      19             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      20             : #include "llvm/ADT/STLExtras.h"
      21             : #include "llvm/ADT/SmallVector.h"
      22             : #include "llvm/CodeGen/LiveInterval.h"
      23             : #include "llvm/CodeGen/LiveIntervals.h"
      24             : #include "llvm/CodeGen/MachineInstr.h"
      25             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      26             : #include "llvm/CodeGen/MachineScheduler.h"
      27             : #include "llvm/CodeGen/RegisterPressure.h"
      28             : #include "llvm/CodeGen/SlotIndexes.h"
      29             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      30             : #include "llvm/Support/Debug.h"
      31             : #include "llvm/Support/ErrorHandling.h"
      32             : #include "llvm/Support/raw_ostream.h"
      33             : #include <algorithm>
      34             : #include <cassert>
      35             : #include <map>
      36             : #include <set>
      37             : #include <utility>
      38             : #include <vector>
      39             : 
      40             : using namespace llvm;
      41             : 
      42             : #define DEBUG_TYPE "machine-scheduler"
      43             : 
      44             : // This scheduler implements a different scheduling algorithm than
      45             : // GenericScheduler.
      46             : //
      47             : // There are several specific architecture behaviours that can't be modelled
      48             : // for GenericScheduler:
      49             : // . When accessing the result of an SGPR load instruction, you have to wait
      50             : // for all the SGPR load instructions before your current instruction to
      51             : // have finished.
      52             : // . When accessing the result of an VGPR load instruction, you have to wait
      53             : // for all the VGPR load instructions previous to the VGPR load instruction
      54             : // you are interested in to finish.
      55             : // . The less the register pressure, the best load latencies are hidden
      56             : //
      57             : // Moreover some specifities (like the fact a lot of instructions in the shader
      58             : // have few dependencies) makes the generic scheduler have some unpredictable
      59             : // behaviours. For example when register pressure becomes high, it can either
      60             : // manage to prevent register pressure from going too high, or it can
      61             : // increase register pressure even more than if it hadn't taken register
      62             : // pressure into account.
      63             : //
      64             : // Also some other bad behaviours are generated, like loading at the beginning
      65             : // of the shader a constant in VGPR you won't need until the end of the shader.
      66             : //
      67             : // The scheduling problem for SI can distinguish three main parts:
      68             : // . Hiding high latencies (texture sampling, etc)
      69             : // . Hiding low latencies (SGPR constant loading, etc)
      70             : // . Keeping register usage low for better latency hiding and general
      71             : //   performance
      72             : //
      73             : // Some other things can also affect performance, but are hard to predict
      74             : // (cache usage, the fact the HW can issue several instructions from different
      75             : // wavefronts if different types, etc)
      76             : //
      77             : // This scheduler tries to solve the scheduling problem by dividing it into
      78             : // simpler sub-problems. It divides the instructions into blocks, schedules
      79             : // locally inside the blocks where it takes care of low latencies, and then
      80             : // chooses the order of the blocks by taking care of high latencies.
      81             : // Dividing the instructions into blocks helps control keeping register
      82             : // usage low.
      83             : //
      84             : // First the instructions are put into blocks.
      85             : //   We want the blocks help control register usage and hide high latencies
      86             : //   later. To help control register usage, we typically want all local
      87             : //   computations, when for example you create a result that can be comsummed
      88             : //   right away, to be contained in a block. Block inputs and outputs would
      89             : //   typically be important results that are needed in several locations of
      90             : //   the shader. Since we do want blocks to help hide high latencies, we want
      91             : //   the instructions inside the block to have a minimal set of dependencies
      92             : //   on high latencies. It will make it easy to pick blocks to hide specific
      93             : //   high latencies.
      94             : //   The block creation algorithm is divided into several steps, and several
      95             : //   variants can be tried during the scheduling process.
      96             : //
      97             : // Second the order of the instructions inside the blocks is chosen.
      98             : //   At that step we do take into account only register usage and hiding
      99             : //   low latency instructions
     100             : //
     101             : // Third the block order is chosen, there we try to hide high latencies
     102             : // and keep register usage low.
     103             : //
     104             : // After the third step, a pass is done to improve the hiding of low
     105             : // latencies.
     106             : //
     107             : // Actually when talking about 'low latency' or 'high latency' it includes
     108             : // both the latency to get the cache (or global mem) data go to the register,
     109             : // and the bandwidth limitations.
     110             : // Increasing the number of active wavefronts helps hide the former, but it
     111             : // doesn't solve the latter, thus why even if wavefront count is high, we have
     112             : // to try have as many instructions hiding high latencies as possible.
     113             : // The OpenCL doc says for example latency of 400 cycles for a global mem access,
     114             : // which is hidden by 10 instructions if the wavefront count is 10.
     115             : 
     116             : // Some figures taken from AMD docs:
     117             : // Both texture and constant L1 caches are 4-way associative with 64 bytes
     118             : // lines.
     119             : // Constant cache is shared with 4 CUs.
     120             : // For texture sampling, the address generation unit receives 4 texture
     121             : // addresses per cycle, thus we could expect texture sampling latency to be
     122             : // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
     123             : // instructions in a wavefront group are executed every 4 cycles),
     124             : // or 16 instructions if the other wavefronts associated to the 3 other VALUs
     125             : // of the CU do texture sampling too. (Don't take these figures too seriously,
     126             : // as I'm not 100% sure of the computation)
     127             : // Data exports should get similar latency.
     128             : // For constant loading, the cache is shader with 4 CUs.
     129             : // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
     130             : // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
     131             : // It means a simple s_buffer_load should take one instruction to hide, as
     132             : // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
     133             : // cache line.
     134             : //
     135             : // As of today the driver doesn't preload the constants in cache, thus the
     136             : // first loads get extra latency. The doc says global memory access can be
     137             : // 300-600 cycles. We do not specially take that into account when scheduling
     138             : // As we expect the driver to be able to preload the constants soon.
     139             : 
     140             : // common code //
     141             : 
     142             : #ifndef NDEBUG
     143             : 
     144             : static const char *getReasonStr(SIScheduleCandReason Reason) {
     145             :   switch (Reason) {
     146             :   case NoCand:         return "NOCAND";
     147             :   case RegUsage:       return "REGUSAGE";
     148             :   case Latency:        return "LATENCY";
     149             :   case Successor:      return "SUCCESSOR";
     150             :   case Depth:          return "DEPTH";
     151             :   case NodeOrder:      return "ORDER";
     152             :   }
     153             :   llvm_unreachable("Unknown reason!");
     154             : }
     155             : 
     156             : #endif
     157             : 
     158             : namespace llvm {
     159             : namespace SISched {
     160             : static bool tryLess(int TryVal, int CandVal,
     161             :                     SISchedulerCandidate &TryCand,
     162             :                     SISchedulerCandidate &Cand,
     163             :                     SIScheduleCandReason Reason) {
     164          17 :   if (TryVal < CandVal) {
     165           2 :     TryCand.Reason = Reason;
     166             :     return true;
     167             :   }
     168          15 :   if (TryVal > CandVal) {
     169           0 :     if (Cand.Reason > Reason)
     170           0 :       Cand.Reason = Reason;
     171             :     return true;
     172             :   }
     173             :   Cand.setRepeat(Reason);
     174             :   return false;
     175             : }
     176             : 
     177             : static bool tryGreater(int TryVal, int CandVal,
     178             :                        SISchedulerCandidate &TryCand,
     179             :                        SISchedulerCandidate &Cand,
     180             :                        SIScheduleCandReason Reason) {
     181           9 :   if (TryVal > CandVal) {
     182           1 :     TryCand.Reason = Reason;
     183             :     return true;
     184             :   }
     185           8 :   if (TryVal < CandVal) {
     186           0 :     if (Cand.Reason > Reason)
     187           0 :       Cand.Reason = Reason;
     188             :     return true;
     189             :   }
     190             :   Cand.setRepeat(Reason);
     191             :   return false;
     192             : }
     193             : } // end namespace SISched
     194             : } // end namespace llvm
     195             : 
     196             : // SIScheduleBlock //
     197             : 
     198          16 : void SIScheduleBlock::addUnit(SUnit *SU) {
     199          32 :   NodeNum2Index[SU->NodeNum] = SUnits.size();
     200          16 :   SUnits.push_back(SU);
     201          16 : }
     202             : 
     203             : #ifndef NDEBUG
     204             : void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
     205             : 
     206             :   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
     207             :   dbgs() << '\n';
     208             : }
     209             : #endif
     210             : 
     211          25 : void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
     212             :                                           SISchedCandidate &TryCand) {
     213             :   // Initialize the candidate if needed.
     214          25 :   if (!Cand.isValid()) {
     215          16 :     TryCand.Reason = NodeOrder;
     216          16 :     return;
     217             :   }
     218             : 
     219           9 :   if (Cand.SGPRUsage > 60 &&
     220           0 :       SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
     221             :                        TryCand, Cand, RegUsage))
     222             :     return;
     223             : 
     224             :   // Schedule low latency instructions as top as possible.
     225             :   // Order of priority is:
     226             :   // . Low latency instructions which do not depend on other low latency
     227             :   //   instructions we haven't waited for
     228             :   // . Other instructions which do not depend on low latency instructions
     229             :   //   we haven't waited for
     230             :   // . Low latencies
     231             :   // . All other instructions
     232             :   // Goal is to get: low latency instructions - independent instructions
     233             :   //     - (eventually some more low latency instructions)
     234             :   //     - instructions that depend on the first low latency instructions.
     235             :   // If in the block there is a lot of constant loads, the SGPR usage
     236             :   // could go quite high, thus above the arbitrary limit of 60 will encourage
     237             :   // use the already loaded constants (in order to release some SGPRs) before
     238             :   // loading more.
     239           9 :   if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
     240           9 :                        Cand.HasLowLatencyNonWaitedParent,
     241             :                        TryCand, Cand, SIScheduleCandReason::Depth))
     242             :     return;
     243             : 
     244           9 :   if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
     245             :                           TryCand, Cand, SIScheduleCandReason::Depth))
     246             :     return;
     247             : 
     248           8 :   if (TryCand.IsLowLatency &&
     249           0 :       SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
     250             :                        TryCand, Cand, SIScheduleCandReason::Depth))
     251             :     return;
     252             : 
     253           8 :   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
     254             :                        TryCand, Cand, RegUsage))
     255             :     return;
     256             : 
     257             :   // Fall through to original instruction order.
     258           6 :   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
     259           0 :     TryCand.Reason = NodeOrder;
     260             :   }
     261             : }
     262             : 
     263          16 : SUnit* SIScheduleBlock::pickNode() {
     264             :   SISchedCandidate TopCand;
     265             : 
     266          41 :   for (SUnit* SU : TopReadySUs) {
     267             :     SISchedCandidate TryCand;
     268             :     std::vector<unsigned> pressure;
     269             :     std::vector<unsigned> MaxPressure;
     270             :     // Predict register usage after this instruction.
     271          25 :     TryCand.SU = SU;
     272          25 :     TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
     273          50 :     TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
     274          50 :     TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
     275          50 :     TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
     276          50 :     TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
     277          25 :     TryCand.HasLowLatencyNonWaitedParent =
     278          50 :       HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
     279          25 :     tryCandidateTopDown(TopCand, TryCand);
     280          25 :     if (TryCand.Reason != NoCand)
     281             :       TopCand.setBest(TryCand);
     282             :   }
     283             : 
     284          16 :   return TopCand.SU;
     285             : }
     286             : 
     287             : 
     288             : // Schedule something valid.
     289           5 : void SIScheduleBlock::fastSchedule() {
     290           5 :   TopReadySUs.clear();
     291           5 :   if (Scheduled)
     292           0 :     undoSchedule();
     293             : 
     294          21 :   for (SUnit* SU : SUnits) {
     295          16 :     if (!SU->NumPredsLeft)
     296          10 :       TopReadySUs.push_back(SU);
     297             :   }
     298             : 
     299          37 :   while (!TopReadySUs.empty()) {
     300          16 :     SUnit *SU = TopReadySUs[0];
     301          16 :     ScheduledSUnits.push_back(SU);
     302          16 :     nodeScheduled(SU);
     303             :   }
     304             : 
     305           5 :   Scheduled = true;
     306           5 : }
     307             : 
     308             : // Returns if the register was set between first and last.
     309           8 : static bool isDefBetween(unsigned Reg,
     310             :                            SlotIndex First, SlotIndex Last,
     311             :                            const MachineRegisterInfo *MRI,
     312             :                            const LiveIntervals *LIS) {
     313             :   for (MachineRegisterInfo::def_instr_iterator
     314           8 :        UI = MRI->def_instr_begin(Reg),
     315           9 :        UE = MRI->def_instr_end(); UI != UE; ++UI) {
     316             :     const MachineInstr* MI = &*UI;
     317           9 :     if (MI->isDebugValue())
     318             :       continue;
     319           9 :     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
     320          18 :     if (InstSlot >= First && InstSlot <= Last)
     321             :       return true;
     322             :   }
     323             :   return false;
     324             : }
     325             : 
     326           5 : void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
     327             :                                       MachineBasicBlock::iterator EndBlock) {
     328             :   IntervalPressure Pressure, BotPressure;
     329           5 :   RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
     330           5 :   LiveIntervals *LIS = DAG->getLIS();
     331           5 :   MachineRegisterInfo *MRI = DAG->getMRI();
     332           5 :   DAG->initRPTracker(TopRPTracker);
     333           5 :   DAG->initRPTracker(BotRPTracker);
     334           5 :   DAG->initRPTracker(RPTracker);
     335             : 
     336             :   // Goes though all SU. RPTracker captures what had to be alive for the SUs
     337             :   // to execute, and what is still alive at the end.
     338          21 :   for (SUnit* SU : ScheduledSUnits) {
     339          16 :     RPTracker.setPos(SU->getInstr());
     340          16 :     RPTracker.advance();
     341             :   }
     342             : 
     343             :   // Close the RPTracker to finalize live ins/outs.
     344           5 :   RPTracker.closeRegion();
     345             : 
     346             :   // Initialize the live ins and live outs.
     347          10 :   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
     348          10 :   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
     349             : 
     350             :   // Do not Track Physical Registers, because it messes up.
     351          40 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
     352          30 :     if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
     353           8 :       LiveInRegs.insert(RegMaskPair.RegUnit);
     354             :   }
     355             :   LiveOutRegs.clear();
     356             :   // There is several possibilities to distinguish:
     357             :   // 1) Reg is not input to any instruction in the block, but is output of one
     358             :   // 2) 1) + read in the block and not needed after it
     359             :   // 3) 1) + read in the block but needed in another block
     360             :   // 4) Reg is input of an instruction but another block will read it too
     361             :   // 5) Reg is input of an instruction and then rewritten in the block.
     362             :   //    result is not read in the block (implies used in another block)
     363             :   // 6) Reg is input of an instruction and then rewritten in the block.
     364             :   //    result is read in the block and not needed in another block
     365             :   // 7) Reg is input of an instruction and then rewritten in the block.
     366             :   //    result is read in the block but also needed in another block
     367             :   // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
     368             :   // We want LiveOutRegs to contain only Regs whose content will be read after
     369             :   // in another block, and whose content was written in the current block,
     370             :   // that is we want it to get 1, 3, 5, 7
     371             :   // Since we made the MIs of a block to be packed all together before
     372             :   // scheduling, then the LiveIntervals were correct, and the RPTracker was
     373             :   // able to correctly handle 5 vs 6, 2 vs 3.
     374             :   // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
     375             :   // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
     376             :   // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
     377             :   // The use of findDefBetween removes the case 4.
     378          26 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
     379           8 :     unsigned Reg = RegMaskPair.RegUnit;
     380          16 :     if (TargetRegisterInfo::isVirtualRegister(Reg) &&
     381          16 :         isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
     382           8 :                      LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
     383             :                      LIS)) {
     384             :       LiveOutRegs.insert(Reg);
     385             :     }
     386             :   }
     387             : 
     388             :   // Pressure = sum_alive_registers register size
     389             :   // Internally llvm will represent some registers as big 128 bits registers
     390             :   // for example, but they actually correspond to 4 actual 32 bits registers.
     391             :   // Thus Pressure is not equal to num_alive_registers * constant.
     392           5 :   LiveInPressure = TopPressure.MaxSetPressure;
     393           5 :   LiveOutPressure = BotPressure.MaxSetPressure;
     394             : 
     395             :   // Prepares TopRPTracker for top down scheduling.
     396           5 :   TopRPTracker.closeTop();
     397           5 : }
     398             : 
     399           5 : void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
     400             :                                MachineBasicBlock::iterator EndBlock) {
     401           5 :   if (!Scheduled)
     402           0 :     fastSchedule();
     403             : 
     404             :   // PreScheduling phase to set LiveIn and LiveOut.
     405           5 :   initRegPressure(BeginBlock, EndBlock);
     406           5 :   undoSchedule();
     407             : 
     408             :   // Schedule for real now.
     409             : 
     410           5 :   TopReadySUs.clear();
     411             : 
     412          21 :   for (SUnit* SU : SUnits) {
     413          16 :     if (!SU->NumPredsLeft)
     414          10 :       TopReadySUs.push_back(SU);
     415             :   }
     416             : 
     417          37 :   while (!TopReadySUs.empty()) {
     418          16 :     SUnit *SU = pickNode();
     419          16 :     ScheduledSUnits.push_back(SU);
     420          16 :     TopRPTracker.setPos(SU->getInstr());
     421          16 :     TopRPTracker.advance();
     422          16 :     nodeScheduled(SU);
     423             :   }
     424             : 
     425             :   // TODO: compute InternalAdditionnalPressure.
     426          10 :   InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
     427             : 
     428             :   // Check everything is right.
     429             : #ifndef NDEBUG
     430             :   assert(SUnits.size() == ScheduledSUnits.size() &&
     431             :             TopReadySUs.empty());
     432             :   for (SUnit* SU : SUnits) {
     433             :     assert(SU->isScheduled &&
     434             :               SU->NumPredsLeft == 0);
     435             :   }
     436             : #endif
     437             : 
     438           5 :   Scheduled = true;
     439           5 : }
     440             : 
     441           5 : void SIScheduleBlock::undoSchedule() {
     442          21 :   for (SUnit* SU : SUnits) {
     443          16 :     SU->isScheduled = false;
     444          64 :     for (SDep& Succ : SU->Succs) {
     445          48 :       if (BC->isSUInBlock(Succ.getSUnit(), ID))
     446          12 :         undoReleaseSucc(SU, &Succ);
     447             :     }
     448             :   }
     449          10 :   HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     450             :   ScheduledSUnits.clear();
     451           5 :   Scheduled = false;
     452           5 : }
     453             : 
     454          12 : void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
     455             :   SUnit *SuccSU = SuccEdge->getSUnit();
     456             : 
     457             :   if (SuccEdge->isWeak()) {
     458           0 :     ++SuccSU->WeakPredsLeft;
     459           0 :     return;
     460             :   }
     461          12 :   ++SuccSU->NumPredsLeft;
     462             : }
     463             : 
     464          33 : void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
     465             :   SUnit *SuccSU = SuccEdge->getSUnit();
     466             : 
     467             :   if (SuccEdge->isWeak()) {
     468           0 :     --SuccSU->WeakPredsLeft;
     469           0 :     return;
     470             :   }
     471             : #ifndef NDEBUG
     472             :   if (SuccSU->NumPredsLeft == 0) {
     473             :     dbgs() << "*** Scheduling failed! ***\n";
     474             :     SuccSU->dump(DAG);
     475             :     dbgs() << " has been released too many times!\n";
     476             :     llvm_unreachable(nullptr);
     477             :   }
     478             : #endif
     479             : 
     480          33 :   --SuccSU->NumPredsLeft;
     481             : }
     482             : 
     483             : /// Release Successors of the SU that are in the block or not.
     484          48 : void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
     485         192 :   for (SDep& Succ : SU->Succs) {
     486          72 :     SUnit *SuccSU = Succ.getSUnit();
     487             : 
     488         144 :     if (SuccSU->NodeNum >= DAG->SUnits.size())
     489          48 :         continue;
     490             : 
     491          63 :     if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
     492          30 :       continue;
     493             : 
     494          33 :     releaseSucc(SU, &Succ);
     495          33 :     if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
     496          12 :       TopReadySUs.push_back(SuccSU);
     497             :   }
     498          48 : }
     499             : 
     500          32 : void SIScheduleBlock::nodeScheduled(SUnit *SU) {
     501             :   // Is in TopReadySUs
     502             :   assert (!SU->NumPredsLeft);
     503             :   std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
     504          32 :   if (I == TopReadySUs.end()) {
     505           0 :     dbgs() << "Data Structure Bug in SI Scheduler\n";
     506           0 :     llvm_unreachable(nullptr);
     507             :   }
     508          32 :   TopReadySUs.erase(I);
     509             : 
     510          32 :   releaseSuccessors(SU, true);
     511             :   // Scheduling this node will trigger a wait,
     512             :   // thus propagate to other instructions that they do not need to wait either.
     513          64 :   if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
     514           0 :     HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     515             : 
     516          64 :   if (DAG->IsLowLatencySU[SU->NodeNum]) {
     517          12 :      for (SDep& Succ : SU->Succs) {
     518             :       std::map<unsigned, unsigned>::iterator I =
     519             :         NodeNum2Index.find(Succ.getSUnit()->NodeNum);
     520           4 :       if (I != NodeNum2Index.end())
     521           0 :         HasLowLatencyNonWaitedParent[I->second] = 1;
     522             :     }
     523             :   }
     524          32 :   SU->isScheduled = true;
     525          32 : }
     526             : 
     527           5 : void SIScheduleBlock::finalizeUnits() {
     528             :   // We remove links from outside blocks to enable scheduling inside the block.
     529          21 :   for (SUnit* SU : SUnits) {
     530          16 :     releaseSuccessors(SU, false);
     531          32 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     532           1 :       HighLatencyBlock = true;
     533             :   }
     534          10 :   HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
     535           5 : }
     536             : 
     537             : // we maintain ascending order of IDs
     538           9 : void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
     539           9 :   unsigned PredID = Pred->getID();
     540             : 
     541             :   // Check if not already predecessor.
     542          10 :   for (SIScheduleBlock* P : Preds) {
     543           6 :     if (PredID == P->getID())
     544             :       return;
     545             :   }
     546           4 :   Preds.push_back(Pred);
     547             : 
     548             :   assert(none_of(Succs,
     549             :                  [=](std::pair<SIScheduleBlock*,
     550             :                      SIScheduleBlockLinkKind> S) {
     551             :                    return PredID == S.first->getID();
     552             :                     }) &&
     553             :          "Loop in the Block Graph!");
     554             : }
     555             : 
     556           9 : void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
     557             :                               SIScheduleBlockLinkKind Kind) {
     558           9 :   unsigned SuccID = Succ->getID();
     559             : 
     560             :   // Check if not already predecessor.
     561          10 :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
     562           6 :     if (SuccID == S.first->getID()) {
     563           5 :       if (S.second == SIScheduleBlockLinkKind::NoData &&
     564             :           Kind == SIScheduleBlockLinkKind::Data)
     565           0 :         S.second = Kind;
     566             :       return;
     567             :     }
     568             :   }
     569           4 :   if (Succ->isHighLatencyBlock())
     570           0 :     ++NumHighLatencySuccessors;
     571           8 :   Succs.push_back(std::make_pair(Succ, Kind));
     572             : 
     573             :   assert(none_of(Preds,
     574             :                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
     575             :          "Loop in the Block Graph!");
     576             : }
     577             : 
     578             : #ifndef NDEBUG
     579             : void SIScheduleBlock::printDebug(bool full) {
     580             :   dbgs() << "Block (" << ID << ")\n";
     581             :   if (!full)
     582             :     return;
     583             : 
     584             :   dbgs() << "\nContains High Latency Instruction: "
     585             :          << HighLatencyBlock << '\n';
     586             :   dbgs() << "\nDepends On:\n";
     587             :   for (SIScheduleBlock* P : Preds) {
     588             :     P->printDebug(false);
     589             :   }
     590             : 
     591             :   dbgs() << "\nSuccessors:\n";
     592             :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
     593             :     if (S.second == SIScheduleBlockLinkKind::Data)
     594             :       dbgs() << "(Data Dep) ";
     595             :     S.first->printDebug(false);
     596             :   }
     597             : 
     598             :   if (Scheduled) {
     599             :     dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
     600             :            << LiveInPressure[DAG->getVGPRSetID()] << '\n';
     601             :     dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
     602             :            << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
     603             :     dbgs() << "LiveIns:\n";
     604             :     for (unsigned Reg : LiveInRegs)
     605             :       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     606             : 
     607             :     dbgs() << "\nLiveOuts:\n";
     608             :     for (unsigned Reg : LiveOutRegs)
     609             :       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     610             :   }
     611             : 
     612             :   dbgs() << "\nInstructions:\n";
     613             :   if (!Scheduled) {
     614             :     for (SUnit* SU : SUnits) {
     615             :       SU->dump(DAG);
     616             :     }
     617             :   } else {
     618             :     for (SUnit* SU : SUnits) {
     619             :       SU->dump(DAG);
     620             :     }
     621             :   }
     622             : 
     623             :   dbgs() << "///////////////////////\n";
     624             : }
     625             : #endif
     626             : 
     627             : // SIScheduleBlockCreator //
     628             : 
     629           2 : SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
     630           2 : DAG(DAG) {
     631           2 : }
     632             : 
     633             : SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
     634             : 
     635             : SIScheduleBlocks
     636           2 : SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
     637             :   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
     638             :     Blocks.find(BlockVariant);
     639           2 :   if (B == Blocks.end()) {
     640           2 :     SIScheduleBlocks Res;
     641           2 :     createBlocksForVariant(BlockVariant);
     642           2 :     topologicalSort();
     643           2 :     scheduleInsideBlocks();
     644           2 :     fillStats();
     645           2 :     Res.Blocks = CurrentBlocks;
     646           2 :     Res.TopDownIndex2Block = TopDownIndex2Block;
     647           2 :     Res.TopDownBlock2Index = TopDownBlock2Index;
     648           2 :     Blocks[BlockVariant] = Res;
     649           2 :     return Res;
     650             :   } else {
     651           0 :     return B->second;
     652             :   }
     653             : }
     654             : 
     655          87 : bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
     656         174 :   if (SU->NodeNum >= DAG->SUnits.size())
     657             :     return false;
     658         252 :   return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
     659             : }
     660             : 
     661           2 : void SIScheduleBlockCreator::colorHighLatenciesAlone() {
     662           4 :   unsigned DAGSize = DAG->SUnits.size();
     663             : 
     664          34 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     665          16 :     SUnit *SU = &DAG->SUnits[i];
     666          32 :     if (DAG->IsHighLatencySU[SU->NodeNum]) {
     667           2 :       CurrentColoring[SU->NodeNum] = NextReservedID++;
     668             :     }
     669             :   }
     670           2 : }
     671             : 
     672             : static bool
     673             : hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
     674           0 :   for (const auto &PredDep : SU.Preds) {
     675           0 :     if (PredDep.getSUnit() == &FromSU &&
     676             :         PredDep.getKind() == llvm::SDep::Data)
     677             :       return true;
     678             :   }
     679             :   return false;
     680             : }
     681             : 
     682           0 : void SIScheduleBlockCreator::colorHighLatenciesGroups() {
     683           0 :   unsigned DAGSize = DAG->SUnits.size();
     684             :   unsigned NumHighLatencies = 0;
     685             :   unsigned GroupSize;
     686           0 :   int Color = NextReservedID;
     687             :   unsigned Count = 0;
     688             :   std::set<unsigned> FormingGroup;
     689             : 
     690           0 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     691           0 :     SUnit *SU = &DAG->SUnits[i];
     692           0 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     693           0 :       ++NumHighLatencies;
     694             :   }
     695             : 
     696           0 :   if (NumHighLatencies == 0)
     697             :     return;
     698             : 
     699           0 :   if (NumHighLatencies <= 6)
     700             :     GroupSize = 2;
     701           0 :   else if (NumHighLatencies <= 12)
     702             :     GroupSize = 3;
     703             :   else
     704             :     GroupSize = 4;
     705             : 
     706           0 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     707           0 :     const SUnit &SU = DAG->SUnits[SUNum];
     708           0 :     if (DAG->IsHighLatencySU[SU.NodeNum]) {
     709             :       unsigned CompatibleGroup = true;
     710             :       int ProposedColor = Color;
     711             :       std::vector<int> AdditionalElements;
     712             : 
     713             :       // We don't want to put in the same block
     714             :       // two high latency instructions that depend
     715             :       // on each other.
     716             :       // One way would be to check canAddEdge
     717             :       // in both directions, but that currently is not
     718             :       // enough because there the high latency order is
     719             :       // enforced (via links).
     720             :       // Instead, look at the dependencies between the
     721             :       // high latency instructions and deduce if it is
     722             :       // a data dependency or not.
     723           0 :       for (unsigned j : FormingGroup) {
     724             :         bool HasSubGraph;
     725             :         std::vector<int> SubGraph;
     726             :         // By construction (topological order), if SU and
     727             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
     728             :         // in the parent graph of SU.
     729             : #ifndef NDEBUG
     730             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
     731             :                                                HasSubGraph);
     732             :         assert(!HasSubGraph);
     733             : #endif
     734           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
     735             :                                                HasSubGraph);
     736           0 :         if (!HasSubGraph)
     737             :           continue; // No dependencies between each other
     738           0 :         else if (SubGraph.size() > 5) {
     739             :           // Too many elements would be required to be added to the block.
     740             :           CompatibleGroup = false;
     741             :           break;
     742             :         }
     743             :         else {
     744             :           // Check the type of dependency
     745           0 :           for (unsigned k : SubGraph) {
     746             :             // If in the path to join the two instructions,
     747             :             // there is another high latency instruction,
     748             :             // or instructions colored for another block
     749             :             // abort the merge.
     750           0 :             if (DAG->IsHighLatencySU[k] ||
     751           0 :                 (CurrentColoring[k] != ProposedColor &&
     752           0 :                  CurrentColoring[k] != 0)) {
     753             :               CompatibleGroup = false;
     754             :               break;
     755             :             }
     756             :             // If one of the SU in the subgraph depends on the result of SU j,
     757             :             // there'll be a data dependency.
     758           0 :             if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
     759             :               CompatibleGroup = false;
     760             :               break;
     761             :             }
     762             :           }
     763           0 :           if (!CompatibleGroup)
     764             :             break;
     765             :           // Same check for the SU
     766           0 :           if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
     767             :             CompatibleGroup = false;
     768             :             break;
     769             :           }
     770             :           // Add all the required instructions to the block
     771             :           // These cannot live in another block (because they
     772             :           // depend (order dependency) on one of the
     773             :           // instruction in the block, and are required for the
     774             :           // high latency instruction we add.
     775           0 :           AdditionalElements.insert(AdditionalElements.end(),
     776           0 :                                     SubGraph.begin(), SubGraph.end());
     777             :         }
     778             :       }
     779           0 :       if (CompatibleGroup) {
     780           0 :         FormingGroup.insert(SU.NodeNum);
     781           0 :         for (unsigned j : AdditionalElements)
     782           0 :           CurrentColoring[j] = ProposedColor;
     783           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     784           0 :         ++Count;
     785             :       }
     786             :       // Found one incompatible instruction,
     787             :       // or has filled a big enough group.
     788             :       // -> start a new one.
     789           0 :       if (!CompatibleGroup) {
     790             :         FormingGroup.clear();
     791           0 :         Color = ++NextReservedID;
     792             :         ProposedColor = Color;
     793           0 :         FormingGroup.insert(SU.NodeNum);
     794           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     795             :         Count = 0;
     796           0 :       } else if (Count == GroupSize) {
     797             :         FormingGroup.clear();
     798           0 :         Color = ++NextReservedID;
     799             :         ProposedColor = Color;
     800             :         Count = 0;
     801             :       }
     802             :     }
     803             :   }
     804             : }
     805             : 
     806           2 : void SIScheduleBlockCreator::colorComputeReservedDependencies() {
     807           4 :   unsigned DAGSize = DAG->SUnits.size();
     808             :   std::map<std::set<unsigned>, unsigned> ColorCombinations;
     809             : 
     810           2 :   CurrentTopDownReservedDependencyColoring.clear();
     811           2 :   CurrentBottomUpReservedDependencyColoring.clear();
     812             : 
     813           2 :   CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
     814           2 :   CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
     815             : 
     816             :   // Traverse TopDown, and give different colors to SUs depending
     817             :   // on which combination of High Latencies they depend on.
     818             : 
     819          20 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     820          16 :     SUnit *SU = &DAG->SUnits[SUNum];
     821             :     std::set<unsigned> SUColors;
     822             : 
     823             :     // Already given.
     824          33 :     if (CurrentColoring[SU->NodeNum]) {
     825           2 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     826             :         CurrentColoring[SU->NodeNum];
     827           1 :       continue;
     828             :     }
     829             : 
     830          57 :    for (SDep& PredDep : SU->Preds) {
     831             :       SUnit *Pred = PredDep.getSUnit();
     832          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
     833           0 :         continue;
     834          42 :       if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
     835          10 :         SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
     836             :     }
     837             :     // Color 0 by default.
     838          15 :     if (SUColors.empty())
     839          12 :       continue;
     840             :     // Same color than parents.
     841           5 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     842           0 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     843           0 :         *SUColors.begin();
     844             :     else {
     845             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     846             :         ColorCombinations.find(SUColors);
     847           3 :       if (Pos != ColorCombinations.end()) {
     848           2 :           CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
     849             :       } else {
     850           4 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     851           2 :           NextNonReservedID;
     852           2 :         ColorCombinations[SUColors] = NextNonReservedID++;
     853             :       }
     854             :     }
     855             :   }
     856             : 
     857             :   ColorCombinations.clear();
     858             : 
     859             :   // Same as before, but BottomUp.
     860             : 
     861          20 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     862          16 :     SUnit *SU = &DAG->SUnits[SUNum];
     863             :     std::set<unsigned> SUColors;
     864             : 
     865             :     // Already given.
     866          33 :     if (CurrentColoring[SU->NodeNum]) {
     867           2 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     868             :         CurrentColoring[SU->NodeNum];
     869           1 :       continue;
     870             :     }
     871             : 
     872          57 :     for (SDep& SuccDep : SU->Succs) {
     873             :       SUnit *Succ = SuccDep.getSUnit();
     874          24 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     875           3 :         continue;
     876          36 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
     877           0 :         SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
     878             :     }
     879             :     // Keep color 0.
     880          15 :     if (SUColors.empty())
     881          15 :       continue;
     882             :     // Same color than parents.
     883           0 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     884           0 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     885           0 :         *SUColors.begin();
     886             :     else {
     887             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     888             :         ColorCombinations.find(SUColors);
     889           0 :       if (Pos != ColorCombinations.end()) {
     890           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
     891             :       } else {
     892           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     893           0 :           NextNonReservedID;
     894           0 :         ColorCombinations[SUColors] = NextNonReservedID++;
     895             :       }
     896             :     }
     897             :   }
     898           2 : }
     899             : 
     900           2 : void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
     901           4 :   unsigned DAGSize = DAG->SUnits.size();
     902             :   std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
     903             : 
     904             :   // Every combination of colors given by the top down
     905             :   // and bottom up Reserved node dependency
     906             : 
     907          34 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     908          16 :     SUnit *SU = &DAG->SUnits[i];
     909          16 :     std::pair<unsigned, unsigned> SUColors;
     910             : 
     911             :     // High latency instructions: already given.
     912          32 :     if (CurrentColoring[SU->NodeNum])
     913           1 :       continue;
     914             : 
     915          30 :     SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
     916          30 :     SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
     917             : 
     918             :     std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
     919             :       ColorCombinations.find(SUColors);
     920          15 :     if (Pos != ColorCombinations.end()) {
     921          24 :       CurrentColoring[SU->NodeNum] = Pos->second;
     922             :     } else {
     923           6 :       CurrentColoring[SU->NodeNum] = NextNonReservedID;
     924           3 :       ColorCombinations[SUColors] = NextNonReservedID++;
     925             :     }
     926             :   }
     927           2 : }
     928             : 
     929           2 : void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
     930           4 :   unsigned DAGSize = DAG->SUnits.size();
     931           2 :   std::vector<int> PendingColoring = CurrentColoring;
     932             : 
     933             :   assert(DAGSize >= 1 &&
     934             :          CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
     935             :          CurrentTopDownReservedDependencyColoring.size() == DAGSize);
     936             :   // If there is no reserved block at all, do nothing. We don't want
     937             :   // everything in one block.
     938           2 :   if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
     939           3 :                         CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
     940           1 :       *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
     941             :                         CurrentTopDownReservedDependencyColoring.end()) == 0)
     942             :     return;
     943             : 
     944           6 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     945           4 :     SUnit *SU = &DAG->SUnits[SUNum];
     946             :     std::set<unsigned> SUColors;
     947             :     std::set<unsigned> SUColorsPending;
     948             : 
     949           8 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     950           1 :       continue;
     951             : 
     952          12 :     if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
     953           6 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
     954           3 :       continue;
     955             : 
     956           0 :     for (SDep& SuccDep : SU->Succs) {
     957             :       SUnit *Succ = SuccDep.getSUnit();
     958           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     959           0 :         continue;
     960           0 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
     961           0 :           CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
     962           0 :         SUColors.insert(CurrentColoring[Succ->NodeNum]);
     963           0 :       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
     964             :     }
     965             :     // If there is only one child/parent block, and that block
     966             :     // is not among the ones we are removing in this path, then
     967             :     // merge the instruction to that block
     968           0 :     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
     969           0 :       PendingColoring[SU->NodeNum] = *SUColors.begin();
     970             :     else // TODO: Attribute new colors depending on color
     971             :          // combination of children.
     972           0 :       PendingColoring[SU->NodeNum] = NextNonReservedID++;
     973             :   }
     974           1 :   CurrentColoring = PendingColoring;
     975             : }
     976             : 
     977             : 
     978           0 : void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
     979           0 :   unsigned DAGSize = DAG->SUnits.size();
     980             :   unsigned PreviousColor;
     981             :   std::set<unsigned> SeenColors;
     982             : 
     983           0 :   if (DAGSize <= 1)
     984             :     return;
     985             : 
     986           0 :   PreviousColor = CurrentColoring[0];
     987             : 
     988           0 :   for (unsigned i = 1, e = DAGSize; i != e; ++i) {
     989           0 :     SUnit *SU = &DAG->SUnits[i];
     990           0 :     unsigned CurrentColor = CurrentColoring[i];
     991           0 :     unsigned PreviousColorSave = PreviousColor;
     992             :     assert(i == SU->NodeNum);
     993             : 
     994           0 :     if (CurrentColor != PreviousColor)
     995             :       SeenColors.insert(PreviousColor);
     996           0 :     PreviousColor = CurrentColor;
     997             : 
     998           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     999           0 :       continue;
    1000             : 
    1001           0 :     if (SeenColors.find(CurrentColor) == SeenColors.end())
    1002           0 :       continue;
    1003             : 
    1004           0 :     if (PreviousColorSave != CurrentColor)
    1005           0 :       CurrentColoring[i] = NextNonReservedID++;
    1006             :     else
    1007           0 :       CurrentColoring[i] = CurrentColoring[i-1];
    1008             :   }
    1009             : }
    1010             : 
    1011           2 : void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
    1012           4 :   unsigned DAGSize = DAG->SUnits.size();
    1013             : 
    1014          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1015          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1016             :     std::set<unsigned> SUColors;
    1017             : 
    1018          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1019           1 :       continue;
    1020             : 
    1021             :     // No predecessor: Vgpr constant loading.
    1022             :     // Low latency instructions usually have a predecessor (the address)
    1023          25 :     if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
    1024           8 :       continue;
    1025             : 
    1026          25 :     for (SDep& SuccDep : SU->Succs) {
    1027             :       SUnit *Succ = SuccDep.getSUnit();
    1028          11 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1029           2 :         continue;
    1030          21 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1031             :     }
    1032           7 :     if (SUColors.size() == 1)
    1033           8 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1034             :   }
    1035           2 : }
    1036             : 
    1037           0 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
    1038           0 :   unsigned DAGSize = DAG->SUnits.size();
    1039             : 
    1040           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1041           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1042             :     std::set<unsigned> SUColors;
    1043             : 
    1044           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1045             :       continue;
    1046             : 
    1047           0 :     for (SDep& SuccDep : SU->Succs) {
    1048             :        SUnit *Succ = SuccDep.getSUnit();
    1049           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1050           0 :         continue;
    1051           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1052             :     }
    1053           0 :     if (SUColors.size() == 1)
    1054           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1055             :   }
    1056           0 : }
    1057             : 
    1058           2 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
    1059           4 :   unsigned DAGSize = DAG->SUnits.size();
    1060             : 
    1061          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1062          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1063             :     std::set<unsigned> SUColors;
    1064             : 
    1065          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1066             :       continue;
    1067             : 
    1068          57 :     for (SDep& SuccDep : SU->Succs) {
    1069             :        SUnit *Succ = SuccDep.getSUnit();
    1070          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1071           3 :         continue;
    1072          54 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1073             :     }
    1074          24 :     if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
    1075           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1076             :   }
    1077           2 : }
    1078             : 
    1079           0 : void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
    1080           0 :   unsigned DAGSize = DAG->SUnits.size();
    1081             :   std::map<unsigned, unsigned> ColorCount;
    1082             : 
    1083           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1084           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1085           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1086           0 :      ++ColorCount[color];
    1087             :   }
    1088             : 
    1089           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1090           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1091           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1092             :     std::set<unsigned> SUColors;
    1093             : 
    1094           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1095           0 :       continue;
    1096             : 
    1097           0 :     if (ColorCount[color] > 1)
    1098           0 :       continue;
    1099             : 
    1100           0 :     for (SDep& SuccDep : SU->Succs) {
    1101             :        SUnit *Succ = SuccDep.getSUnit();
    1102           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1103           0 :         continue;
    1104           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1105             :     }
    1106           0 :     if (SUColors.size() == 1 && *SUColors.begin() != color) {
    1107           0 :       --ColorCount[color];
    1108           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1109           0 :       ++ColorCount[*SUColors.begin()];
    1110             :     }
    1111             :   }
    1112           0 : }
    1113             : 
    1114           0 : void SIScheduleBlockCreator::cutHugeBlocks() {
    1115             :   // TODO
    1116           0 : }
    1117             : 
    1118           2 : void SIScheduleBlockCreator::regroupNoUserInstructions() {
    1119           4 :   unsigned DAGSize = DAG->SUnits.size();
    1120           2 :   int GroupID = NextNonReservedID++;
    1121             : 
    1122          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1123          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1124             :     bool hasSuccessor = false;
    1125             : 
    1126          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1127           1 :       continue;
    1128             : 
    1129          57 :     for (SDep& SuccDep : SU->Succs) {
    1130             :        SUnit *Succ = SuccDep.getSUnit();
    1131          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1132           3 :         continue;
    1133             :       hasSuccessor = true;
    1134             :     }
    1135          15 :     if (!hasSuccessor)
    1136           4 :       CurrentColoring[SU->NodeNum] = GroupID;
    1137             :   }
    1138           2 : }
    1139             : 
    1140           2 : void SIScheduleBlockCreator::colorExports() {
    1141           2 :   unsigned ExportColor = NextNonReservedID++;
    1142             :   SmallVector<unsigned, 8> ExpGroup;
    1143             : 
    1144             :   // Put all exports together in a block.
    1145             :   // The block will naturally end up being scheduled last,
    1146             :   // thus putting exports at the end of the schedule, which
    1147             :   // is better for performance.
    1148             :   // However we must ensure, for safety, the exports can be put
    1149             :   // together in the same block without any other instruction.
    1150             :   // This could happen, for example, when scheduling after regalloc
    1151             :   // if reloading a spilled register from memory using the same
    1152             :   // register than used in a previous export.
    1153             :   // If that happens, do not regroup the exports.
    1154          20 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
    1155          16 :     const SUnit &SU = DAG->SUnits[SUNum];
    1156          32 :     if (SIInstrInfo::isEXP(*SU.getInstr())) {
    1157             :       // Check the EXP can be added to the group safely,
    1158             :       // ie without needing any other instruction.
    1159             :       // The EXP is allowed to depend on other EXP
    1160             :       // (they will be in the same group).
    1161           1 :       for (unsigned j : ExpGroup) {
    1162             :         bool HasSubGraph;
    1163             :         std::vector<int> SubGraph;
    1164             :         // By construction (topological order), if SU and
    1165             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
    1166             :         // in the parent graph of SU.
    1167             : #ifndef NDEBUG
    1168             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
    1169             :                                                HasSubGraph);
    1170             :         assert(!HasSubGraph);
    1171             : #endif
    1172           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
    1173             :                                                HasSubGraph);
    1174           0 :         if (!HasSubGraph)
    1175             :           continue; // No dependencies between each other
    1176             : 
    1177             :         // SubGraph contains all the instructions required
    1178             :         // between EXP SUnits[j] and EXP SU.
    1179           0 :         for (unsigned k : SubGraph) {
    1180           0 :           if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
    1181             :             // Other instructions than EXP would be required in the group.
    1182             :             // Abort the groupping.
    1183             :             return;
    1184             :         }
    1185             :       }
    1186             : 
    1187           1 :       ExpGroup.push_back(SUNum);
    1188             :     }
    1189             :   }
    1190             : 
    1191             :   // The group can be formed. Give the color.
    1192           4 :   for (unsigned j : ExpGroup)
    1193           2 :     CurrentColoring[j] = ExportColor;
    1194             : }
    1195             : 
    1196           2 : void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
    1197           4 :   unsigned DAGSize = DAG->SUnits.size();
    1198             :   std::map<unsigned,unsigned> RealID;
    1199             : 
    1200           2 :   CurrentBlocks.clear();
    1201           2 :   CurrentColoring.clear();
    1202           2 :   CurrentColoring.resize(DAGSize, 0);
    1203           2 :   Node2CurrentBlock.clear();
    1204             : 
    1205             :   // Restore links previous scheduling variant has overridden.
    1206           2 :   DAG->restoreSULinksLeft();
    1207             : 
    1208           2 :   NextReservedID = 1;
    1209           2 :   NextNonReservedID = DAGSize + 1;
    1210             : 
    1211             :   LLVM_DEBUG(dbgs() << "Coloring the graph\n");
    1212             : 
    1213           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    1214           0 :     colorHighLatenciesGroups();
    1215             :   else
    1216           2 :     colorHighLatenciesAlone();
    1217           2 :   colorComputeReservedDependencies();
    1218           2 :   colorAccordingToReservedDependencies();
    1219           2 :   colorEndsAccordingToDependencies();
    1220           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    1221           0 :     colorForceConsecutiveOrderInGroup();
    1222           2 :   regroupNoUserInstructions();
    1223           2 :   colorMergeConstantLoadsNextGroup();
    1224           2 :   colorMergeIfPossibleNextGroupOnlyForReserved();
    1225           2 :   colorExports();
    1226             : 
    1227             :   // Put SUs of same color into same block
    1228           2 :   Node2CurrentBlock.resize(DAGSize, -1);
    1229          34 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1230          16 :     SUnit *SU = &DAG->SUnits[i];
    1231          32 :     unsigned Color = CurrentColoring[SU->NodeNum];
    1232          16 :     if (RealID.find(Color) == RealID.end()) {
    1233           5 :       int ID = CurrentBlocks.size();
    1234          10 :       BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
    1235          10 :       CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
    1236           5 :       RealID[Color] = ID;
    1237             :     }
    1238          32 :     CurrentBlocks[RealID[Color]]->addUnit(SU);
    1239          32 :     Node2CurrentBlock[SU->NodeNum] = RealID[Color];
    1240             :   }
    1241             : 
    1242             :   // Build dependencies between blocks.
    1243          34 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1244          16 :     SUnit *SU = &DAG->SUnits[i];
    1245          32 :     int SUID = Node2CurrentBlock[i];
    1246          64 :      for (SDep& SuccDep : SU->Succs) {
    1247             :        SUnit *Succ = SuccDep.getSUnit();
    1248          24 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1249           3 :         continue;
    1250          42 :       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
    1251          27 :         CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
    1252             :                                      SuccDep.isCtrl() ? NoData : Data);
    1253             :     }
    1254          58 :     for (SDep& PredDep : SU->Preds) {
    1255             :       SUnit *Pred = PredDep.getSUnit();
    1256          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
    1257           0 :         continue;
    1258          42 :       if (Node2CurrentBlock[Pred->NodeNum] != SUID)
    1259          27 :         CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    1260             :     }
    1261             :   }
    1262             : 
    1263             :   // Free root and leafs of all blocks to enable scheduling inside them.
    1264           9 :   for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1265          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1266           5 :     Block->finalizeUnits();
    1267             :   }
    1268             :   LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
    1269             :              for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1270             :                SIScheduleBlock *Block = CurrentBlocks[i];
    1271             :                Block->printDebug(true);
    1272             :              });
    1273           2 : }
    1274             : 
    1275             : // Two functions taken from Codegen/MachineScheduler.cpp
    1276             : 
    1277             : /// Non-const version.
    1278             : static MachineBasicBlock::iterator
    1279          12 : nextIfDebug(MachineBasicBlock::iterator I,
    1280             :             MachineBasicBlock::const_iterator End) {
    1281          12 :   for (; I != End; ++I) {
    1282             :     if (!I->isDebugInstr())
    1283             :       break;
    1284             :   }
    1285          12 :   return I;
    1286             : }
    1287             : 
    1288           2 : void SIScheduleBlockCreator::topologicalSort() {
    1289           4 :   unsigned DAGSize = CurrentBlocks.size();
    1290             :   std::vector<int> WorkList;
    1291             : 
    1292             :   LLVM_DEBUG(dbgs() << "Topological Sort\n");
    1293             : 
    1294           2 :   WorkList.reserve(DAGSize);
    1295           2 :   TopDownIndex2Block.resize(DAGSize);
    1296           2 :   TopDownBlock2Index.resize(DAGSize);
    1297           2 :   BottomUpIndex2Block.resize(DAGSize);
    1298             : 
    1299          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1300          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1301           5 :     unsigned Degree = Block->getSuccs().size();
    1302          10 :     TopDownBlock2Index[i] = Degree;
    1303           5 :     if (Degree == 0) {
    1304           4 :       WorkList.push_back(i);
    1305             :     }
    1306             :   }
    1307             : 
    1308           2 :   int Id = DAGSize;
    1309           7 :   while (!WorkList.empty()) {
    1310           5 :     int i = WorkList.back();
    1311          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1312             :     WorkList.pop_back();
    1313          10 :     TopDownBlock2Index[i] = --Id;
    1314          10 :     TopDownIndex2Block[Id] = i;
    1315           5 :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1316           8 :       if (!--TopDownBlock2Index[Pred->getID()])
    1317           6 :         WorkList.push_back(Pred->getID());
    1318             :     }
    1319             :   }
    1320             : 
    1321             : #ifndef NDEBUG
    1322             :   // Check correctness of the ordering.
    1323             :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1324             :     SIScheduleBlock *Block = CurrentBlocks[i];
    1325             :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1326             :       assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
    1327             :       "Wrong Top Down topological sorting");
    1328             :     }
    1329             :   }
    1330             : #endif
    1331             : 
    1332           2 :   BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
    1333             :                                          TopDownIndex2Block.rend());
    1334           2 : }
    1335             : 
    1336           2 : void SIScheduleBlockCreator::scheduleInsideBlocks() {
    1337           4 :   unsigned DAGSize = CurrentBlocks.size();
    1338             : 
    1339             :   LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
    1340             : 
    1341             :   // We do schedule a valid scheduling such that a Block corresponds
    1342             :   // to a range of instructions.
    1343             :   LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
    1344          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1345          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1346           5 :     Block->fastSchedule();
    1347             :   }
    1348             : 
    1349             :   // Note: the following code, and the part restoring previous position
    1350             :   // is by far the most expensive operation of the Scheduler.
    1351             : 
    1352             :   // Do not update CurrentTop.
    1353           2 :   MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
    1354             :   std::vector<MachineBasicBlock::iterator> PosOld;
    1355             :   std::vector<MachineBasicBlock::iterator> PosNew;
    1356           4 :   PosOld.reserve(DAG->SUnits.size());
    1357           4 :   PosNew.reserve(DAG->SUnits.size());
    1358             : 
    1359          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1360          10 :     int BlockIndice = TopDownIndex2Block[i];
    1361          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1362             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1363             : 
    1364          21 :     for (SUnit* SU : SUs) {
    1365          16 :       MachineInstr *MI = SU->getInstr();
    1366             :       MachineBasicBlock::iterator Pos = MI;
    1367          16 :       PosOld.push_back(Pos);
    1368          16 :       if (&*CurrentTopFastSched == MI) {
    1369          12 :         PosNew.push_back(Pos);
    1370          12 :         CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
    1371          24 :                                           DAG->getCurrentBottom());
    1372             :       } else {
    1373             :         // Update the instruction stream.
    1374           8 :         DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
    1375             : 
    1376             :         // Update LiveIntervals.
    1377             :         // Note: Moving all instructions and calling handleMove every time
    1378             :         // is the most cpu intensive operation of the scheduler.
    1379             :         // It would gain a lot if there was a way to recompute the
    1380             :         // LiveIntervals for the entire scheduling region.
    1381           4 :         DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
    1382           4 :         PosNew.push_back(CurrentTopFastSched);
    1383             :       }
    1384             :     }
    1385             :   }
    1386             : 
    1387             :   // Now we have Block of SUs == Block of MI.
    1388             :   // We do the final schedule for the instructions inside the block.
    1389             :   // The property that all the SUs of the Block are grouped together as MI
    1390             :   // is used for correct reg usage tracking.
    1391          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1392          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1393             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1394          15 :     Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
    1395             :   }
    1396             : 
    1397             :   LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
    1398             :   // Restore old ordering (which prevents a LIS->handleMove bug).
    1399           4 :   for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    1400          32 :     MachineBasicBlock::iterator POld = PosOld[i-1];
    1401          32 :     MachineBasicBlock::iterator PNew = PosNew[i-1];
    1402          16 :     if (PNew != POld) {
    1403             :       // Update the instruction stream.
    1404           4 :       DAG->getBB()->splice(POld, DAG->getBB(), PNew);
    1405             : 
    1406             :       // Update LiveIntervals.
    1407           4 :       DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    1408             :     }
    1409             :   }
    1410             : 
    1411             :   LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1412             :     SIScheduleBlock *Block = CurrentBlocks[i];
    1413             :     Block->printDebug(true);
    1414             :   });
    1415           2 : }
    1416             : 
    1417           2 : void SIScheduleBlockCreator::fillStats() {
    1418           4 :   unsigned DAGSize = CurrentBlocks.size();
    1419             : 
    1420          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1421          10 :     int BlockIndice = TopDownIndex2Block[i];
    1422          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1423           5 :     if (Block->getPreds().empty())
    1424           2 :       Block->Depth = 0;
    1425             :     else {
    1426             :       unsigned Depth = 0;
    1427           7 :       for (SIScheduleBlock *Pred : Block->getPreds()) {
    1428           8 :         if (Depth < Pred->Depth + Pred->getCost())
    1429             :           Depth = Pred->Depth + Pred->getCost();
    1430             :       }
    1431           3 :       Block->Depth = Depth;
    1432             :     }
    1433             :   }
    1434             : 
    1435          12 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1436          10 :     int BlockIndice = BottomUpIndex2Block[i];
    1437          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1438           5 :     if (Block->getSuccs().empty())
    1439           2 :       Block->Height = 0;
    1440             :     else {
    1441           3 :       unsigned Height = 0;
    1442          11 :       for (const auto &Succ : Block->getSuccs())
    1443          12 :         Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
    1444           3 :       Block->Height = Height;
    1445             :     }
    1446             :   }
    1447           2 : }
    1448             : 
    1449             : // SIScheduleBlockScheduler //
    1450             : 
    1451           2 : SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
    1452             :                                                    SISchedulerBlockSchedulerVariant Variant,
    1453           2 :                                                    SIScheduleBlocks  BlocksStruct) :
    1454             :   DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
    1455             :   LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
    1456          14 :   SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
    1457             : 
    1458             :   // Fill the usage of every output
    1459             :   // Warning: while by construction we always have a link between two blocks
    1460             :   // when one needs a result from the other, the number of users of an output
    1461             :   // is not the sum of child blocks having as input the same virtual register.
    1462             :   // Here is an example. A produces x and y. B eats x and produces x'.
    1463             :   // C eats x' and y. The register coalescer may have attributed the same
    1464             :   // virtual register to x and x'.
    1465             :   // To count accurately, we do a topological sort. In case the register is
    1466             :   // found for several parents, we increment the usage of the one with the
    1467             :   // highest topological index.
    1468           4 :   LiveOutRegsNumUsages.resize(Blocks.size());
    1469           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1470          10 :     SIScheduleBlock *Block = Blocks[i];
    1471          13 :     for (unsigned Reg : Block->getInRegs()) {
    1472             :       bool Found = false;
    1473             :       int topoInd = -1;
    1474           8 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1475             :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1476             :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1477             : 
    1478           7 :         if (RegPos != PredOutRegs.end()) {
    1479             :           Found = true;
    1480          10 :           if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
    1481             :             topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
    1482             :           }
    1483             :         }
    1484             :       }
    1485             : 
    1486           8 :       if (!Found)
    1487           3 :         continue;
    1488             : 
    1489          10 :       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
    1490          10 :       ++LiveOutRegsNumUsages[PredID][Reg];
    1491             :     }
    1492             :   }
    1493             : 
    1494           4 :   LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
    1495           4 :   BlockNumPredsLeft.resize(Blocks.size());
    1496           4 :   BlockNumSuccsLeft.resize(Blocks.size());
    1497             : 
    1498           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1499          10 :     SIScheduleBlock *Block = Blocks[i];
    1500          15 :     BlockNumPredsLeft[i] = Block->getPreds().size();
    1501          10 :     BlockNumSuccsLeft[i] = Block->getSuccs().size();
    1502             :   }
    1503             : 
    1504             : #ifndef NDEBUG
    1505             :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1506             :     SIScheduleBlock *Block = Blocks[i];
    1507             :     assert(Block->getID() == i);
    1508             :   }
    1509             : #endif
    1510             : 
    1511             :   std::set<unsigned> InRegs = DAG->getInRegs();
    1512           2 :   addLiveRegs(InRegs);
    1513             : 
    1514             :   // Increase LiveOutRegsNumUsages for blocks
    1515             :   // producing registers consumed in another
    1516             :   // scheduling region.
    1517           5 :   for (unsigned Reg : DAG->getOutRegs()) {
    1518           6 :     for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1519             :       // Do reverse traversal
    1520           6 :       int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
    1521           6 :       SIScheduleBlock *Block = Blocks[ID];
    1522             :       const std::set<unsigned> &OutRegs = Block->getOutRegs();
    1523             : 
    1524           3 :       if (OutRegs.find(Reg) == OutRegs.end())
    1525             :         continue;
    1526             : 
    1527           6 :       ++LiveOutRegsNumUsages[ID][Reg];
    1528           3 :       break;
    1529             :     }
    1530             :   }
    1531             : 
    1532             :   // Fill LiveRegsConsumers for regs that were already
    1533             :   // defined before scheduling.
    1534           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1535          10 :     SIScheduleBlock *Block = Blocks[i];
    1536          13 :     for (unsigned Reg : Block->getInRegs()) {
    1537             :       bool Found = false;
    1538           8 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1539             :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1540             :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1541             : 
    1542           5 :         if (RegPos != PredOutRegs.end()) {
    1543             :           Found = true;
    1544             :           break;
    1545             :         }
    1546             :       }
    1547             : 
    1548             :       if (!Found)
    1549           3 :         ++LiveRegsConsumers[Reg];
    1550             :     }
    1551             :   }
    1552             : 
    1553           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1554          10 :     SIScheduleBlock *Block = Blocks[i];
    1555          10 :     if (BlockNumPredsLeft[i] == 0) {
    1556           2 :       ReadyBlocks.push_back(Block);
    1557             :     }
    1558             :   }
    1559             : 
    1560           7 :   while (SIScheduleBlock *Block = pickBlock()) {
    1561           5 :     BlocksScheduled.push_back(Block);
    1562           5 :     blockScheduled(Block);
    1563           5 :   }
    1564             : 
    1565             :   LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
    1566             :                                             : BlocksScheduled) {
    1567             :     dbgs() << ' ' << Block->getID();
    1568             :   } dbgs() << '\n';);
    1569           2 : }
    1570             : 
    1571           5 : bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
    1572             :                                                    SIBlockSchedCandidate &TryCand) {
    1573           5 :   if (!Cand.isValid()) {
    1574           5 :     TryCand.Reason = NodeOrder;
    1575           5 :     return true;
    1576             :   }
    1577             : 
    1578             :   // Try to hide high latencies.
    1579           0 :   if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
    1580           0 :                  Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    1581             :     return true;
    1582             :   // Schedule high latencies early so you can hide them better.
    1583           0 :   if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
    1584             :                           TryCand, Cand, Latency))
    1585             :     return true;
    1586           0 :   if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
    1587             :                                                    TryCand, Cand, Depth))
    1588             :     return true;
    1589           0 :   if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
    1590           0 :                           Cand.NumHighLatencySuccessors,
    1591             :                           TryCand, Cand, Successor))
    1592             :     return true;
    1593             :   return false;
    1594             : }
    1595             : 
    1596           0 : bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
    1597             :                                                     SIBlockSchedCandidate &TryCand) {
    1598           0 :   if (!Cand.isValid()) {
    1599           0 :     TryCand.Reason = NodeOrder;
    1600           0 :     return true;
    1601             :   }
    1602             : 
    1603           0 :   if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
    1604             :                        TryCand, Cand, RegUsage))
    1605             :     return true;
    1606           0 :   if (SISched::tryGreater(TryCand.NumSuccessors > 0,
    1607           0 :                           Cand.NumSuccessors > 0,
    1608             :                           TryCand, Cand, Successor))
    1609             :     return true;
    1610           0 :   if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    1611             :     return true;
    1612             :   if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
    1613             :                        TryCand, Cand, RegUsage))
    1614             :     return true;
    1615             :   return false;
    1616             : }
    1617             : 
    1618           7 : SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
    1619             :   SIBlockSchedCandidate Cand;
    1620             :   std::vector<SIScheduleBlock*>::iterator Best;
    1621             :   SIScheduleBlock *Block;
    1622           7 :   if (ReadyBlocks.empty())
    1623             :     return nullptr;
    1624             : 
    1625          10 :   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
    1626             :                         VregCurrentUsage, SregCurrentUsage);
    1627           5 :   if (VregCurrentUsage > maxVregUsage)
    1628           3 :     maxVregUsage = VregCurrentUsage;
    1629           5 :   if (SregCurrentUsage > maxSregUsage)
    1630           2 :     maxSregUsage = SregCurrentUsage;
    1631             :   LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
    1632             :              for (SIScheduleBlock *Block
    1633             :                   : ReadyBlocks) dbgs()
    1634             :              << Block->getID() << ' ';
    1635             :              dbgs() << "\nCurrent Live:\n";
    1636             :              for (unsigned Reg
    1637             :                   : LiveRegs) dbgs()
    1638             :              << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    1639             :              dbgs() << '\n';
    1640             :              dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    1641             :              dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
    1642             : 
    1643           5 :   Cand.Block = nullptr;
    1644             :   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
    1645          10 :        E = ReadyBlocks.end(); I != E; ++I) {
    1646             :     SIBlockSchedCandidate TryCand;
    1647           5 :     TryCand.Block = *I;
    1648           5 :     TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    1649           5 :     TryCand.VGPRUsageDiff =
    1650          10 :       checkRegUsageImpact(TryCand.Block->getInRegs(),
    1651          10 :                           TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    1652          10 :     TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    1653           5 :     TryCand.NumHighLatencySuccessors =
    1654           5 :       TryCand.Block->getNumHighLatencySuccessors();
    1655           5 :     TryCand.LastPosHighLatParentScheduled =
    1656           5 :       (unsigned int) std::max<int> (0,
    1657          20 :          LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
    1658          15 :            LastPosWaitedHighLatency);
    1659           5 :     TryCand.Height = TryCand.Block->Height;
    1660             :     // Try not to increase VGPR usage too much, else we may spill.
    1661          10 :     if (VregCurrentUsage > 120 ||
    1662           5 :         Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
    1663           0 :       if (!tryCandidateRegUsage(Cand, TryCand) &&
    1664           0 :           Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
    1665           0 :         tryCandidateLatency(Cand, TryCand);
    1666             :     } else {
    1667           5 :       if (!tryCandidateLatency(Cand, TryCand))
    1668           0 :         tryCandidateRegUsage(Cand, TryCand);
    1669             :     }
    1670           5 :     if (TryCand.Reason != NoCand) {
    1671             :       Cand.setBest(TryCand);
    1672             :       Best = I;
    1673             :       LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
    1674             :                         << getReasonStr(Cand.Reason) << '\n');
    1675             :     }
    1676             :   }
    1677             : 
    1678             :   LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    1679             :              dbgs() << "Is a block with high latency instruction: "
    1680             :                     << (Cand.IsHighLatency ? "yes\n" : "no\n");
    1681             :              dbgs() << "Position of last high latency dependency: "
    1682             :                     << Cand.LastPosHighLatParentScheduled << '\n';
    1683             :              dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    1684             :              dbgs() << '\n';);
    1685             : 
    1686           5 :   Block = Cand.Block;
    1687           5 :   ReadyBlocks.erase(Best);
    1688           5 :   return Block;
    1689             : }
    1690             : 
    1691             : // Tracking of currently alive registers to determine VGPR Usage.
    1692             : 
    1693           7 : void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
    1694          26 :   for (unsigned Reg : Regs) {
    1695             :     // For now only track virtual registers.
    1696          19 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1697           7 :       continue;
    1698             :     // If not already in the live set, then add it.
    1699             :     (void) LiveRegs.insert(Reg);
    1700             :   }
    1701           7 : }
    1702             : 
    1703           5 : void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
    1704             :                                        std::set<unsigned> &Regs) {
    1705          13 :   for (unsigned Reg : Regs) {
    1706             :     // For now only track virtual registers.
    1707             :     std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    1708             :     assert (Pos != LiveRegs.end() && // Reg must be live.
    1709             :                LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
    1710             :                LiveRegsConsumers[Reg] >= 1);
    1711           8 :     --LiveRegsConsumers[Reg];
    1712           8 :     if (LiveRegsConsumers[Reg] == 0)
    1713             :       LiveRegs.erase(Pos);
    1714             :   }
    1715           5 : }
    1716             : 
    1717           5 : void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
    1718          13 :   for (const auto &Block : Parent->getSuccs()) {
    1719           8 :     if (--BlockNumPredsLeft[Block.first->getID()] == 0)
    1720           3 :       ReadyBlocks.push_back(Block.first);
    1721             : 
    1722           6 :     if (Parent->isHighLatencyBlock() &&
    1723           2 :         Block.second == SIScheduleBlockLinkKind::Data)
    1724           2 :       LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
    1725             :   }
    1726           5 : }
    1727             : 
    1728           5 : void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
    1729           5 :   decreaseLiveRegs(Block, Block->getInRegs());
    1730           5 :   addLiveRegs(Block->getOutRegs());
    1731           5 :   releaseBlockSuccs(Block);
    1732             :   for (std::map<unsigned, unsigned>::iterator RegI =
    1733           5 :        LiveOutRegsNumUsages[Block->getID()].begin(),
    1734          13 :        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    1735             :     std::pair<unsigned, unsigned> RegP = *RegI;
    1736             :     // We produce this register, thus it must not be previously alive.
    1737             :     assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
    1738             :            LiveRegsConsumers[RegP.first] == 0);
    1739           8 :     LiveRegsConsumers[RegP.first] += RegP.second;
    1740             :   }
    1741          15 :   if (LastPosHighLatencyParentScheduled[Block->getID()] >
    1742           5 :         (unsigned)LastPosWaitedHighLatency)
    1743           0 :     LastPosWaitedHighLatency =
    1744             :       LastPosHighLatencyParentScheduled[Block->getID()];
    1745           5 :   ++NumBlockScheduled;
    1746           5 : }
    1747             : 
    1748             : std::vector<int>
    1749           5 : SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
    1750             :                                      std::set<unsigned> &OutRegs) {
    1751             :   std::vector<int> DiffSetPressure;
    1752          10 :   DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
    1753             : 
    1754          13 :   for (unsigned Reg : InRegs) {
    1755             :     // For now only track virtual registers.
    1756           8 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1757           0 :       continue;
    1758           8 :     if (LiveRegsConsumers[Reg] > 1)
    1759           0 :       continue;
    1760           8 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1761          20 :     for (; PSetI.isValid(); ++PSetI) {
    1762          24 :       DiffSetPressure[*PSetI] -= PSetI.getWeight();
    1763             :     }
    1764             :   }
    1765             : 
    1766          13 :   for (unsigned Reg : OutRegs) {
    1767             :     // For now only track virtual registers.
    1768           8 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1769             :       continue;
    1770           8 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1771          20 :     for (; PSetI.isValid(); ++PSetI) {
    1772          24 :       DiffSetPressure[*PSetI] += PSetI.getWeight();
    1773             :     }
    1774             :   }
    1775             : 
    1776           5 :   return DiffSetPressure;
    1777             : }
    1778             : 
    1779             : // SIScheduler //
    1780             : 
    1781             : struct SIScheduleBlockResult
    1782           2 : SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
    1783             :                              SISchedulerBlockSchedulerVariant ScheduleVariant) {
    1784           4 :   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
    1785           4 :   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
    1786             :   std::vector<SIScheduleBlock*> ScheduledBlocks;
    1787             :   struct SIScheduleBlockResult Res;
    1788             : 
    1789           2 :   ScheduledBlocks = Scheduler.getBlocks();
    1790             : 
    1791          19 :   for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    1792           5 :     SIScheduleBlock *Block = ScheduledBlocks[b];
    1793             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1794             : 
    1795          21 :     for (SUnit* SU : SUs)
    1796          16 :       Res.SUs.push_back(SU->NodeNum);
    1797             :   }
    1798             : 
    1799           2 :   Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
    1800           2 :   Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
    1801           2 :   return Res;
    1802             : }
    1803             : 
    1804             : // SIScheduleDAGMI //
    1805             : 
    1806           1 : SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
    1807           3 :   ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
    1808           1 :   SITII = static_cast<const SIInstrInfo*>(TII);
    1809           1 :   SITRI = static_cast<const SIRegisterInfo*>(TRI);
    1810             : 
    1811           1 :   VGPRSetID = SITRI->getVGPRPressureSet();
    1812           1 :   SGPRSetID = SITRI->getSGPRPressureSet();
    1813           1 : }
    1814             : 
    1815             : SIScheduleDAGMI::~SIScheduleDAGMI() = default;
    1816             : 
    1817             : // Code adapted from scheduleDAG.cpp
    1818             : // Does a topological sort over the SUs.
    1819             : // Both TopDown and BottomUp
    1820           2 : void SIScheduleDAGMI::topologicalSort() {
    1821           2 :   Topo.InitDAGTopologicalSorting();
    1822             : 
    1823           4 :   TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
    1824           4 :   BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
    1825           2 : }
    1826             : 
    1827             : // Move low latencies further from their user without
    1828             : // increasing SGPR usage (in general)
    1829             : // This is to be replaced by a better pass that would
    1830             : // take into account SGPR usage (based on VGPR Usage
    1831             : // and the corresponding wavefront count), that would
    1832             : // try to merge groups of loads if it make sense, etc
    1833           2 : void SIScheduleDAGMI::moveLowLatencies() {
    1834           4 :    unsigned DAGSize = SUnits.size();
    1835             :    int LastLowLatencyUser = -1;
    1836             :    int LastLowLatencyPos = -1;
    1837             : 
    1838          20 :    for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    1839          32 :     SUnit *SU = &SUnits[ScheduledSUnits[i]];
    1840             :     bool IsLowLatencyUser = false;
    1841             :     unsigned MinPos = 0;
    1842             : 
    1843          58 :     for (SDep& PredDep : SU->Preds) {
    1844             :       SUnit *Pred = PredDep.getSUnit();
    1845          21 :       if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
    1846             :         IsLowLatencyUser = true;
    1847             :       }
    1848          21 :       if (Pred->NodeNum >= DAGSize)
    1849           0 :         continue;
    1850          42 :       unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
    1851          21 :       if (PredPos >= MinPos)
    1852          10 :         MinPos = PredPos + 1;
    1853             :     }
    1854             : 
    1855          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1856           2 :       unsigned BestPos = LastLowLatencyUser + 1;
    1857           2 :       if ((int)BestPos <= LastLowLatencyPos)
    1858           1 :         BestPos = LastLowLatencyPos + 1;
    1859           2 :       if (BestPos < MinPos)
    1860             :         BestPos = MinPos;
    1861           2 :       if (BestPos < i) {
    1862          34 :         for (unsigned u = i; u > BestPos; --u) {
    1863          48 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1864          32 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1865             :         }
    1866           4 :         ScheduledSUnits[BestPos] = SU->NodeNum;
    1867           4 :         ScheduledSUnitsInv[SU->NodeNum] = BestPos;
    1868             :       }
    1869           2 :       LastLowLatencyPos = BestPos;
    1870           2 :       if (IsLowLatencyUser)
    1871             :         LastLowLatencyUser = BestPos;
    1872          14 :     } else if (IsLowLatencyUser) {
    1873           0 :       LastLowLatencyUser = i;
    1874             :     // Moves COPY instructions on which depends
    1875             :     // the low latency instructions too.
    1876          28 :     } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
    1877             :       bool CopyForLowLat = false;
    1878          19 :       for (SDep& SuccDep : SU->Succs) {
    1879             :         SUnit *Succ = SuccDep.getSUnit();
    1880           7 :         if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
    1881             :           CopyForLowLat = true;
    1882             :         }
    1883             :       }
    1884           5 :       if (!CopyForLowLat)
    1885           3 :         continue;
    1886           2 :       if (MinPos < i) {
    1887          36 :         for (unsigned u = i; u > MinPos; --u) {
    1888          51 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1889          34 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1890             :         }
    1891           4 :         ScheduledSUnits[MinPos] = SU->NodeNum;
    1892           4 :         ScheduledSUnitsInv[SU->NodeNum] = MinPos;
    1893             :       }
    1894             :     }
    1895             :   }
    1896           2 : }
    1897             : 
    1898           2 : void SIScheduleDAGMI::restoreSULinksLeft() {
    1899          20 :   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    1900          32 :     SUnits[i].isScheduled = false;
    1901          48 :     SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    1902          16 :     SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    1903          16 :     SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    1904          16 :     SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
    1905             :   }
    1906           2 : }
    1907             : 
    1908             : // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
    1909             : template<typename _Iterator> void
    1910           5 : SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
    1911             :                                   unsigned &VgprUsage, unsigned &SgprUsage) {
    1912           5 :   VgprUsage = 0;
    1913           5 :   SgprUsage = 0;
    1914          20 :   for (_Iterator RegI = First; RegI != End; ++RegI) {
    1915          10 :     unsigned Reg = *RegI;
    1916             :     // For now only track virtual registers
    1917          10 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1918             :       continue;
    1919          10 :     PSetIterator PSetI = MRI.getPressureSets(Reg);
    1920          26 :     for (; PSetI.isValid(); ++PSetI) {
    1921          16 :       if (*PSetI == VGPRSetID)
    1922           6 :         VgprUsage += PSetI.getWeight();
    1923          10 :       else if (*PSetI == SGPRSetID)
    1924           4 :         SgprUsage += PSetI.getWeight();
    1925             :     }
    1926             :   }
    1927           5 : }
    1928             : 
    1929           2 : void SIScheduleDAGMI::schedule()
    1930             : {
    1931             :   SmallVector<SUnit*, 8> TopRoots, BotRoots;
    1932             :   SIScheduleBlockResult Best, Temp;
    1933             :   LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
    1934             : 
    1935           2 :   buildDAGWithRegPressure();
    1936             :   LLVM_DEBUG(for (SUnit &SU : SUnits) SU.dumpAll(this));
    1937             : 
    1938           2 :   topologicalSort();
    1939           2 :   findRootsAndBiasEdges(TopRoots, BotRoots);
    1940             :   // We reuse several ScheduleDAGMI and ScheduleDAGMILive
    1941             :   // functions, but to make them happy we must initialize
    1942             :   // the default Scheduler implementation (even if we do not
    1943             :   // run it)
    1944           2 :   SchedImpl->initialize(this);
    1945           2 :   initQueues(TopRoots, BotRoots);
    1946             : 
    1947             :   // Fill some stats to help scheduling.
    1948             : 
    1949           2 :   SUnitsLinksBackup = SUnits;
    1950           2 :   IsLowLatencySU.clear();
    1951           2 :   LowLatencyOffset.clear();
    1952           2 :   IsHighLatencySU.clear();
    1953             : 
    1954           4 :   IsLowLatencySU.resize(SUnits.size(), 0);
    1955           4 :   LowLatencyOffset.resize(SUnits.size(), 0);
    1956           4 :   IsHighLatencySU.resize(SUnits.size(), 0);
    1957             : 
    1958          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    1959          16 :     SUnit *SU = &SUnits[i];
    1960             :     unsigned BaseLatReg;
    1961             :     int64_t OffLatReg;
    1962          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1963           4 :       IsLowLatencySU[i] = 1;
    1964           2 :       if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
    1965             :                                        TRI))
    1966           4 :         LowLatencyOffset[i] = OffLatReg;
    1967          14 :     } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
    1968           2 :       IsHighLatencySU[i] = 1;
    1969             :   }
    1970             : 
    1971             :   SIScheduler Scheduler(this);
    1972           4 :   Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
    1973             :                                    SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
    1974             : 
    1975             :   // if VGPR usage is extremely high, try other good performing variants
    1976             :   // which could lead to lower VGPR usage
    1977           2 :   if (Best.MaxVGPRUsage > 180) {
    1978             :     static const std::pair<SISchedulerBlockCreatorVariant,
    1979             :                            SISchedulerBlockSchedulerVariant>
    1980             :         Variants[] = {
    1981             :       { LatenciesAlone, BlockRegUsageLatency },
    1982             : //      { LatenciesAlone, BlockRegUsage },
    1983             :       { LatenciesGrouped, BlockLatencyRegUsage },
    1984             : //      { LatenciesGrouped, BlockRegUsageLatency },
    1985             : //      { LatenciesGrouped, BlockRegUsage },
    1986             :       { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    1987             : //      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    1988             : //      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    1989             :     };
    1990           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    1991           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    1992           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    1993             :         Best = Temp;
    1994             :     }
    1995             :   }
    1996             :   // if VGPR usage is still extremely high, we may spill. Try other variants
    1997             :   // which are less performing, but that could lead to lower VGPR usage.
    1998           2 :   if (Best.MaxVGPRUsage > 200) {
    1999             :     static const std::pair<SISchedulerBlockCreatorVariant,
    2000             :                            SISchedulerBlockSchedulerVariant>
    2001             :         Variants[] = {
    2002             : //      { LatenciesAlone, BlockRegUsageLatency },
    2003             :       { LatenciesAlone, BlockRegUsage },
    2004             : //      { LatenciesGrouped, BlockLatencyRegUsage },
    2005             :       { LatenciesGrouped, BlockRegUsageLatency },
    2006             :       { LatenciesGrouped, BlockRegUsage },
    2007             : //      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    2008             :       { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    2009             :       { LatenciesAlonePlusConsecutive, BlockRegUsage }
    2010             :     };
    2011           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    2012           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    2013           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    2014             :         Best = Temp;
    2015             :     }
    2016             :   }
    2017             : 
    2018           2 :   ScheduledSUnits = Best.SUs;
    2019           4 :   ScheduledSUnitsInv.resize(SUnits.size());
    2020             : 
    2021          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    2022          48 :     ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
    2023             :   }
    2024             : 
    2025           2 :   moveLowLatencies();
    2026             : 
    2027             :   // Tell the outside world about the result of the scheduling.
    2028             : 
    2029             :   assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
    2030             :   TopRPTracker.setPos(CurrentTop);
    2031             : 
    2032             :   for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
    2033          18 :        E = ScheduledSUnits.end(); I != E; ++I) {
    2034          16 :     SUnit *SU = &SUnits[*I];
    2035             : 
    2036          16 :     scheduleMI(SU, true);
    2037             : 
    2038             :     LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
    2039             :                       << *SU->getInstr());
    2040             :   }
    2041             : 
    2042             :   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
    2043             : 
    2044           2 :   placeDebugValues();
    2045             : 
    2046             :   LLVM_DEBUG({
    2047             :     dbgs() << "*** Final schedule for "
    2048             :            << printMBBReference(*begin()->getParent()) << " ***\n";
    2049             :     dumpSchedule();
    2050             :     dbgs() << '\n';
    2051             :   });
    2052           2 : }

Generated by: LCOV version 1.13