LCOV - code coverage report
Current view: top level - lib/Target/AMDGPU - SIMachineScheduler.cpp (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 628 809 77.6 %
Date: 2018-10-20 13:21:21 Functions: 46 54 85.2 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : /// \file
      11             : /// SI Machine Scheduler interface
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #include "SIMachineScheduler.h"
      16             : #include "AMDGPU.h"
      17             : #include "SIInstrInfo.h"
      18             : #include "SIRegisterInfo.h"
      19             : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
      20             : #include "llvm/ADT/STLExtras.h"
      21             : #include "llvm/ADT/SmallVector.h"
      22             : #include "llvm/CodeGen/LiveInterval.h"
      23             : #include "llvm/CodeGen/LiveIntervals.h"
      24             : #include "llvm/CodeGen/MachineInstr.h"
      25             : #include "llvm/CodeGen/MachineRegisterInfo.h"
      26             : #include "llvm/CodeGen/MachineScheduler.h"
      27             : #include "llvm/CodeGen/RegisterPressure.h"
      28             : #include "llvm/CodeGen/SlotIndexes.h"
      29             : #include "llvm/CodeGen/TargetRegisterInfo.h"
      30             : #include "llvm/Support/Debug.h"
      31             : #include "llvm/Support/ErrorHandling.h"
      32             : #include "llvm/Support/raw_ostream.h"
      33             : #include <algorithm>
      34             : #include <cassert>
      35             : #include <map>
      36             : #include <set>
      37             : #include <utility>
      38             : #include <vector>
      39             : 
      40             : using namespace llvm;
      41             : 
      42             : #define DEBUG_TYPE "machine-scheduler"
      43             : 
      44             : // This scheduler implements a different scheduling algorithm than
      45             : // GenericScheduler.
      46             : //
      47             : // There are several specific architecture behaviours that can't be modelled
      48             : // for GenericScheduler:
      49             : // . When accessing the result of an SGPR load instruction, you have to wait
      50             : // for all the SGPR load instructions before your current instruction to
      51             : // have finished.
      52             : // . When accessing the result of an VGPR load instruction, you have to wait
      53             : // for all the VGPR load instructions previous to the VGPR load instruction
      54             : // you are interested in to finish.
      55             : // . The less the register pressure, the best load latencies are hidden
      56             : //
      57             : // Moreover some specifities (like the fact a lot of instructions in the shader
      58             : // have few dependencies) makes the generic scheduler have some unpredictable
      59             : // behaviours. For example when register pressure becomes high, it can either
      60             : // manage to prevent register pressure from going too high, or it can
      61             : // increase register pressure even more than if it hadn't taken register
      62             : // pressure into account.
      63             : //
      64             : // Also some other bad behaviours are generated, like loading at the beginning
      65             : // of the shader a constant in VGPR you won't need until the end of the shader.
      66             : //
      67             : // The scheduling problem for SI can distinguish three main parts:
      68             : // . Hiding high latencies (texture sampling, etc)
      69             : // . Hiding low latencies (SGPR constant loading, etc)
      70             : // . Keeping register usage low for better latency hiding and general
      71             : //   performance
      72             : //
      73             : // Some other things can also affect performance, but are hard to predict
      74             : // (cache usage, the fact the HW can issue several instructions from different
      75             : // wavefronts if different types, etc)
      76             : //
      77             : // This scheduler tries to solve the scheduling problem by dividing it into
      78             : // simpler sub-problems. It divides the instructions into blocks, schedules
      79             : // locally inside the blocks where it takes care of low latencies, and then
      80             : // chooses the order of the blocks by taking care of high latencies.
      81             : // Dividing the instructions into blocks helps control keeping register
      82             : // usage low.
      83             : //
      84             : // First the instructions are put into blocks.
      85             : //   We want the blocks help control register usage and hide high latencies
      86             : //   later. To help control register usage, we typically want all local
      87             : //   computations, when for example you create a result that can be comsummed
      88             : //   right away, to be contained in a block. Block inputs and outputs would
      89             : //   typically be important results that are needed in several locations of
      90             : //   the shader. Since we do want blocks to help hide high latencies, we want
      91             : //   the instructions inside the block to have a minimal set of dependencies
      92             : //   on high latencies. It will make it easy to pick blocks to hide specific
      93             : //   high latencies.
      94             : //   The block creation algorithm is divided into several steps, and several
      95             : //   variants can be tried during the scheduling process.
      96             : //
      97             : // Second the order of the instructions inside the blocks is chosen.
      98             : //   At that step we do take into account only register usage and hiding
      99             : //   low latency instructions
     100             : //
     101             : // Third the block order is chosen, there we try to hide high latencies
     102             : // and keep register usage low.
     103             : //
     104             : // After the third step, a pass is done to improve the hiding of low
     105             : // latencies.
     106             : //
     107             : // Actually when talking about 'low latency' or 'high latency' it includes
     108             : // both the latency to get the cache (or global mem) data go to the register,
     109             : // and the bandwidth limitations.
     110             : // Increasing the number of active wavefronts helps hide the former, but it
     111             : // doesn't solve the latter, thus why even if wavefront count is high, we have
     112             : // to try have as many instructions hiding high latencies as possible.
     113             : // The OpenCL doc says for example latency of 400 cycles for a global mem access,
     114             : // which is hidden by 10 instructions if the wavefront count is 10.
     115             : 
     116             : // Some figures taken from AMD docs:
     117             : // Both texture and constant L1 caches are 4-way associative with 64 bytes
     118             : // lines.
     119             : // Constant cache is shared with 4 CUs.
     120             : // For texture sampling, the address generation unit receives 4 texture
     121             : // addresses per cycle, thus we could expect texture sampling latency to be
     122             : // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
     123             : // instructions in a wavefront group are executed every 4 cycles),
     124             : // or 16 instructions if the other wavefronts associated to the 3 other VALUs
     125             : // of the CU do texture sampling too. (Don't take these figures too seriously,
     126             : // as I'm not 100% sure of the computation)
     127             : // Data exports should get similar latency.
     128             : // For constant loading, the cache is shader with 4 CUs.
     129             : // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
     130             : // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
     131             : // It means a simple s_buffer_load should take one instruction to hide, as
     132             : // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
     133             : // cache line.
     134             : //
     135             : // As of today the driver doesn't preload the constants in cache, thus the
     136             : // first loads get extra latency. The doc says global memory access can be
     137             : // 300-600 cycles. We do not specially take that into account when scheduling
     138             : // As we expect the driver to be able to preload the constants soon.
     139             : 
     140             : // common code //
     141             : 
     142             : #ifndef NDEBUG
     143             : 
     144             : static const char *getReasonStr(SIScheduleCandReason Reason) {
     145             :   switch (Reason) {
     146             :   case NoCand:         return "NOCAND";
     147             :   case RegUsage:       return "REGUSAGE";
     148             :   case Latency:        return "LATENCY";
     149             :   case Successor:      return "SUCCESSOR";
     150             :   case Depth:          return "DEPTH";
     151             :   case NodeOrder:      return "ORDER";
     152             :   }
     153             :   llvm_unreachable("Unknown reason!");
     154             : }
     155             : 
     156             : #endif
     157             : 
     158             : namespace llvm {
     159             : namespace SISched {
     160           0 : static bool tryLess(int TryVal, int CandVal,
     161             :                     SISchedulerCandidate &TryCand,
     162             :                     SISchedulerCandidate &Cand,
     163             :                     SIScheduleCandReason Reason) {
     164           0 :   if (TryVal < CandVal) {
     165           2 :     TryCand.Reason = Reason;
     166           0 :     return true;
     167             :   }
     168          15 :   if (TryVal > CandVal) {
     169           0 :     if (Cand.Reason > Reason)
     170           0 :       Cand.Reason = Reason;
     171           0 :     return true;
     172             :   }
     173             :   Cand.setRepeat(Reason);
     174           0 :   return false;
     175             : }
     176             : 
     177           0 : static bool tryGreater(int TryVal, int CandVal,
     178             :                        SISchedulerCandidate &TryCand,
     179             :                        SISchedulerCandidate &Cand,
     180             :                        SIScheduleCandReason Reason) {
     181           0 :   if (TryVal > CandVal) {
     182           1 :     TryCand.Reason = Reason;
     183           0 :     return true;
     184             :   }
     185           8 :   if (TryVal < CandVal) {
     186           0 :     if (Cand.Reason > Reason)
     187           0 :       Cand.Reason = Reason;
     188           0 :     return true;
     189             :   }
     190             :   Cand.setRepeat(Reason);
     191           0 :   return false;
     192             : }
     193             : } // end namespace SISched
     194             : } // end namespace llvm
     195             : 
     196             : // SIScheduleBlock //
     197             : 
     198          16 : void SIScheduleBlock::addUnit(SUnit *SU) {
     199          32 :   NodeNum2Index[SU->NodeNum] = SUnits.size();
     200          16 :   SUnits.push_back(SU);
     201          16 : }
     202             : 
     203             : #ifndef NDEBUG
     204             : void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
     205             : 
     206             :   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
     207             :   dbgs() << '\n';
     208             : }
     209             : #endif
     210             : 
     211          25 : void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
     212             :                                           SISchedCandidate &TryCand) {
     213             :   // Initialize the candidate if needed.
     214          25 :   if (!Cand.isValid()) {
     215          16 :     TryCand.Reason = NodeOrder;
     216          16 :     return;
     217             :   }
     218             : 
     219           9 :   if (Cand.SGPRUsage > 60 &&
     220           0 :       SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
     221             :                        TryCand, Cand, RegUsage))
     222           0 :     return;
     223             : 
     224             :   // Schedule low latency instructions as top as possible.
     225             :   // Order of priority is:
     226             :   // . Low latency instructions which do not depend on other low latency
     227             :   //   instructions we haven't waited for
     228             :   // . Other instructions which do not depend on low latency instructions
     229             :   //   we haven't waited for
     230             :   // . Low latencies
     231             :   // . All other instructions
     232             :   // Goal is to get: low latency instructions - independent instructions
     233             :   //     - (eventually some more low latency instructions)
     234             :   //     - instructions that depend on the first low latency instructions.
     235             :   // If in the block there is a lot of constant loads, the SGPR usage
     236             :   // could go quite high, thus above the arbitrary limit of 60 will encourage
     237             :   // use the already loaded constants (in order to release some SGPRs) before
     238             :   // loading more.
     239           9 :   if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
     240           9 :                        Cand.HasLowLatencyNonWaitedParent,
     241             :                        TryCand, Cand, SIScheduleCandReason::Depth))
     242           0 :     return;
     243             : 
     244           9 :   if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
     245             :                           TryCand, Cand, SIScheduleCandReason::Depth))
     246           1 :     return;
     247             : 
     248           8 :   if (TryCand.IsLowLatency &&
     249           0 :       SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
     250             :                        TryCand, Cand, SIScheduleCandReason::Depth))
     251           0 :     return;
     252             : 
     253           8 :   if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
     254             :                        TryCand, Cand, RegUsage))
     255           2 :     return;
     256             : 
     257             :   // Fall through to original instruction order.
     258           6 :   if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
     259           0 :     TryCand.Reason = NodeOrder;
     260             :   }
     261             : }
     262             : 
     263          16 : SUnit* SIScheduleBlock::pickNode() {
     264             :   SISchedCandidate TopCand;
     265             : 
     266          41 :   for (SUnit* SU : TopReadySUs) {
     267             :     SISchedCandidate TryCand;
     268             :     std::vector<unsigned> pressure;
     269             :     std::vector<unsigned> MaxPressure;
     270             :     // Predict register usage after this instruction.
     271          25 :     TryCand.SU = SU;
     272          25 :     TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
     273          25 :     TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
     274          25 :     TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
     275          25 :     TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
     276          25 :     TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
     277          25 :     TryCand.HasLowLatencyNonWaitedParent =
     278          25 :       HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
     279          25 :     tryCandidateTopDown(TopCand, TryCand);
     280          25 :     if (TryCand.Reason != NoCand)
     281             :       TopCand.setBest(TryCand);
     282             :   }
     283             : 
     284          16 :   return TopCand.SU;
     285             : }
     286             : 
     287             : 
     288             : // Schedule something valid.
     289           5 : void SIScheduleBlock::fastSchedule() {
     290           5 :   TopReadySUs.clear();
     291           5 :   if (Scheduled)
     292           0 :     undoSchedule();
     293             : 
     294          21 :   for (SUnit* SU : SUnits) {
     295          16 :     if (!SU->NumPredsLeft)
     296          10 :       TopReadySUs.push_back(SU);
     297             :   }
     298             : 
     299          21 :   while (!TopReadySUs.empty()) {
     300          16 :     SUnit *SU = TopReadySUs[0];
     301          16 :     ScheduledSUnits.push_back(SU);
     302          16 :     nodeScheduled(SU);
     303             :   }
     304             : 
     305           5 :   Scheduled = true;
     306           5 : }
     307             : 
     308             : // Returns if the register was set between first and last.
     309           8 : static bool isDefBetween(unsigned Reg,
     310             :                            SlotIndex First, SlotIndex Last,
     311             :                            const MachineRegisterInfo *MRI,
     312             :                            const LiveIntervals *LIS) {
     313             :   for (MachineRegisterInfo::def_instr_iterator
     314           8 :        UI = MRI->def_instr_begin(Reg),
     315           9 :        UE = MRI->def_instr_end(); UI != UE; ++UI) {
     316             :     const MachineInstr* MI = &*UI;
     317           9 :     if (MI->isDebugValue())
     318             :       continue;
     319           9 :     SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
     320           9 :     if (InstSlot >= First && InstSlot <= Last)
     321             :       return true;
     322             :   }
     323             :   return false;
     324             : }
     325             : 
     326           5 : void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
     327             :                                       MachineBasicBlock::iterator EndBlock) {
     328             :   IntervalPressure Pressure, BotPressure;
     329           5 :   RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
     330           5 :   LiveIntervals *LIS = DAG->getLIS();
     331           5 :   MachineRegisterInfo *MRI = DAG->getMRI();
     332           5 :   DAG->initRPTracker(TopRPTracker);
     333           5 :   DAG->initRPTracker(BotRPTracker);
     334           5 :   DAG->initRPTracker(RPTracker);
     335             : 
     336             :   // Goes though all SU. RPTracker captures what had to be alive for the SUs
     337             :   // to execute, and what is still alive at the end.
     338          21 :   for (SUnit* SU : ScheduledSUnits) {
     339          16 :     RPTracker.setPos(SU->getInstr());
     340          16 :     RPTracker.advance();
     341             :   }
     342             : 
     343             :   // Close the RPTracker to finalize live ins/outs.
     344           5 :   RPTracker.closeRegion();
     345             : 
     346             :   // Initialize the live ins and live outs.
     347          10 :   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
     348          10 :   BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
     349             : 
     350             :   // Do not Track Physical Registers, because it messes up.
     351          20 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
     352          30 :     if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
     353           8 :       LiveInRegs.insert(RegMaskPair.RegUnit);
     354             :   }
     355             :   LiveOutRegs.clear();
     356             :   // There is several possibilities to distinguish:
     357             :   // 1) Reg is not input to any instruction in the block, but is output of one
     358             :   // 2) 1) + read in the block and not needed after it
     359             :   // 3) 1) + read in the block but needed in another block
     360             :   // 4) Reg is input of an instruction but another block will read it too
     361             :   // 5) Reg is input of an instruction and then rewritten in the block.
     362             :   //    result is not read in the block (implies used in another block)
     363             :   // 6) Reg is input of an instruction and then rewritten in the block.
     364             :   //    result is read in the block and not needed in another block
     365             :   // 7) Reg is input of an instruction and then rewritten in the block.
     366             :   //    result is read in the block but also needed in another block
     367             :   // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
     368             :   // We want LiveOutRegs to contain only Regs whose content will be read after
     369             :   // in another block, and whose content was written in the current block,
     370             :   // that is we want it to get 1, 3, 5, 7
     371             :   // Since we made the MIs of a block to be packed all together before
     372             :   // scheduling, then the LiveIntervals were correct, and the RPTracker was
     373             :   // able to correctly handle 5 vs 6, 2 vs 3.
     374             :   // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
     375             :   // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
     376             :   // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7
     377             :   // The use of findDefBetween removes the case 4.
     378          13 :   for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
     379           8 :     unsigned Reg = RegMaskPair.RegUnit;
     380          16 :     if (TargetRegisterInfo::isVirtualRegister(Reg) &&
     381           8 :         isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
     382           8 :                      LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
     383             :                      LIS)) {
     384             :       LiveOutRegs.insert(Reg);
     385             :     }
     386             :   }
     387             : 
     388             :   // Pressure = sum_alive_registers register size
     389             :   // Internally llvm will represent some registers as big 128 bits registers
     390             :   // for example, but they actually correspond to 4 actual 32 bits registers.
     391             :   // Thus Pressure is not equal to num_alive_registers * constant.
     392           5 :   LiveInPressure = TopPressure.MaxSetPressure;
     393           5 :   LiveOutPressure = BotPressure.MaxSetPressure;
     394             : 
     395             :   // Prepares TopRPTracker for top down scheduling.
     396           5 :   TopRPTracker.closeTop();
     397           5 : }
     398             : 
     399           5 : void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
     400             :                                MachineBasicBlock::iterator EndBlock) {
     401           5 :   if (!Scheduled)
     402           0 :     fastSchedule();
     403             : 
     404             :   // PreScheduling phase to set LiveIn and LiveOut.
     405           5 :   initRegPressure(BeginBlock, EndBlock);
     406           5 :   undoSchedule();
     407             : 
     408             :   // Schedule for real now.
     409             : 
     410           5 :   TopReadySUs.clear();
     411             : 
     412          21 :   for (SUnit* SU : SUnits) {
     413          16 :     if (!SU->NumPredsLeft)
     414          10 :       TopReadySUs.push_back(SU);
     415             :   }
     416             : 
     417          21 :   while (!TopReadySUs.empty()) {
     418          16 :     SUnit *SU = pickNode();
     419          16 :     ScheduledSUnits.push_back(SU);
     420          16 :     TopRPTracker.setPos(SU->getInstr());
     421          16 :     TopRPTracker.advance();
     422          16 :     nodeScheduled(SU);
     423             :   }
     424             : 
     425             :   // TODO: compute InternalAdditionnalPressure.
     426          10 :   InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());
     427             : 
     428             :   // Check everything is right.
     429             : #ifndef NDEBUG
     430             :   assert(SUnits.size() == ScheduledSUnits.size() &&
     431             :             TopReadySUs.empty());
     432             :   for (SUnit* SU : SUnits) {
     433             :     assert(SU->isScheduled &&
     434             :               SU->NumPredsLeft == 0);
     435             :   }
     436             : #endif
     437             : 
     438           5 :   Scheduled = true;
     439           5 : }
     440             : 
     441           5 : void SIScheduleBlock::undoSchedule() {
     442          21 :   for (SUnit* SU : SUnits) {
     443          16 :     SU->isScheduled = false;
     444          40 :     for (SDep& Succ : SU->Succs) {
     445          48 :       if (BC->isSUInBlock(Succ.getSUnit(), ID))
     446          12 :         undoReleaseSucc(SU, &Succ);
     447             :     }
     448             :   }
     449           5 :   HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     450             :   ScheduledSUnits.clear();
     451           5 :   Scheduled = false;
     452           5 : }
     453             : 
     454          12 : void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
     455             :   SUnit *SuccSU = SuccEdge->getSUnit();
     456             : 
     457             :   if (SuccEdge->isWeak()) {
     458           0 :     ++SuccSU->WeakPredsLeft;
     459           0 :     return;
     460             :   }
     461          12 :   ++SuccSU->NumPredsLeft;
     462             : }
     463             : 
     464          33 : void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
     465             :   SUnit *SuccSU = SuccEdge->getSUnit();
     466             : 
     467             :   if (SuccEdge->isWeak()) {
     468           0 :     --SuccSU->WeakPredsLeft;
     469           0 :     return;
     470             :   }
     471             : #ifndef NDEBUG
     472             :   if (SuccSU->NumPredsLeft == 0) {
     473             :     dbgs() << "*** Scheduling failed! ***\n";
     474             :     DAG->dumpNode(*SuccSU);
     475             :     dbgs() << " has been released too many times!\n";
     476             :     llvm_unreachable(nullptr);
     477             :   }
     478             : #endif
     479             : 
     480          33 :   --SuccSU->NumPredsLeft;
     481             : }
     482             : 
     483             : /// Release Successors of the SU that are in the block or not.
     484          48 : void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
     485         120 :   for (SDep& Succ : SU->Succs) {
     486          72 :     SUnit *SuccSU = Succ.getSUnit();
     487             : 
     488         144 :     if (SuccSU->NodeNum >= DAG->SUnits.size())
     489          39 :         continue;
     490             : 
     491          63 :     if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
     492             :       continue;
     493             : 
     494          33 :     releaseSucc(SU, &Succ);
     495          33 :     if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
     496          12 :       TopReadySUs.push_back(SuccSU);
     497             :   }
     498          48 : }
     499             : 
     500          32 : void SIScheduleBlock::nodeScheduled(SUnit *SU) {
     501             :   // Is in TopReadySUs
     502             :   assert (!SU->NumPredsLeft);
     503             :   std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
     504          32 :   if (I == TopReadySUs.end()) {
     505           0 :     dbgs() << "Data Structure Bug in SI Scheduler\n";
     506           0 :     llvm_unreachable(nullptr);
     507             :   }
     508          32 :   TopReadySUs.erase(I);
     509             : 
     510          32 :   releaseSuccessors(SU, true);
     511             :   // Scheduling this node will trigger a wait,
     512             :   // thus propagate to other instructions that they do not need to wait either.
     513          32 :   if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
     514           0 :     HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
     515             : 
     516          64 :   if (DAG->IsLowLatencySU[SU->NodeNum]) {
     517           8 :      for (SDep& Succ : SU->Succs) {
     518             :       std::map<unsigned, unsigned>::iterator I =
     519             :         NodeNum2Index.find(Succ.getSUnit()->NodeNum);
     520           4 :       if (I != NodeNum2Index.end())
     521           0 :         HasLowLatencyNonWaitedParent[I->second] = 1;
     522             :     }
     523             :   }
     524          32 :   SU->isScheduled = true;
     525          32 : }
     526             : 
     527           5 : void SIScheduleBlock::finalizeUnits() {
     528             :   // We remove links from outside blocks to enable scheduling inside the block.
     529          21 :   for (SUnit* SU : SUnits) {
     530          16 :     releaseSuccessors(SU, false);
     531          32 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     532           1 :       HighLatencyBlock = true;
     533             :   }
     534          10 :   HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
     535           5 : }
     536             : 
     537             : // we maintain ascending order of IDs
     538           9 : void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
     539           9 :   unsigned PredID = Pred->getID();
     540             : 
     541             :   // Check if not already predecessor.
     542          10 :   for (SIScheduleBlock* P : Preds) {
     543           6 :     if (PredID == P->getID())
     544             :       return;
     545             :   }
     546           4 :   Preds.push_back(Pred);
     547             : 
     548             :   assert(none_of(Succs,
     549             :                  [=](std::pair<SIScheduleBlock*,
     550             :                      SIScheduleBlockLinkKind> S) {
     551             :                    return PredID == S.first->getID();
     552             :                     }) &&
     553             :          "Loop in the Block Graph!");
     554             : }
     555             : 
     556           9 : void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
     557             :                               SIScheduleBlockLinkKind Kind) {
     558           9 :   unsigned SuccID = Succ->getID();
     559             : 
     560             :   // Check if not already predecessor.
     561          10 :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
     562           6 :     if (SuccID == S.first->getID()) {
     563           5 :       if (S.second == SIScheduleBlockLinkKind::NoData &&
     564             :           Kind == SIScheduleBlockLinkKind::Data)
     565           0 :         S.second = Kind;
     566             :       return;
     567             :     }
     568             :   }
     569           4 :   if (Succ->isHighLatencyBlock())
     570           0 :     ++NumHighLatencySuccessors;
     571           4 :   Succs.push_back(std::make_pair(Succ, Kind));
     572             : 
     573             :   assert(none_of(Preds,
     574             :                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
     575             :          "Loop in the Block Graph!");
     576             : }
     577             : 
     578             : #ifndef NDEBUG
     579             : void SIScheduleBlock::printDebug(bool full) {
     580             :   dbgs() << "Block (" << ID << ")\n";
     581             :   if (!full)
     582             :     return;
     583             : 
     584             :   dbgs() << "\nContains High Latency Instruction: "
     585             :          << HighLatencyBlock << '\n';
     586             :   dbgs() << "\nDepends On:\n";
     587             :   for (SIScheduleBlock* P : Preds) {
     588             :     P->printDebug(false);
     589             :   }
     590             : 
     591             :   dbgs() << "\nSuccessors:\n";
     592             :   for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
     593             :     if (S.second == SIScheduleBlockLinkKind::Data)
     594             :       dbgs() << "(Data Dep) ";
     595             :     S.first->printDebug(false);
     596             :   }
     597             : 
     598             :   if (Scheduled) {
     599             :     dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
     600             :            << LiveInPressure[DAG->getVGPRSetID()] << '\n';
     601             :     dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
     602             :            << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
     603             :     dbgs() << "LiveIns:\n";
     604             :     for (unsigned Reg : LiveInRegs)
     605             :       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     606             : 
     607             :     dbgs() << "\nLiveOuts:\n";
     608             :     for (unsigned Reg : LiveOutRegs)
     609             :       dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
     610             :   }
     611             : 
     612             :   dbgs() << "\nInstructions:\n";
     613             :   if (!Scheduled) {
     614             :     for (const SUnit* SU : SUnits)
     615             :       DAG->dumpNode(*SU);
     616             :   } else {
     617             :     for (const SUnit* SU : SUnits)
     618             :       DAG->dumpNode(*SU);
     619             :   }
     620             : 
     621             :   dbgs() << "///////////////////////\n";
     622             : }
     623             : #endif
     624             : 
     625             : // SIScheduleBlockCreator //
     626             : 
     627           2 : SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
     628           2 : DAG(DAG) {
     629           2 : }
     630             : 
     631             : SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
     632             : 
     633             : SIScheduleBlocks
     634           2 : SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
     635             :   std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
     636             :     Blocks.find(BlockVariant);
     637           2 :   if (B == Blocks.end()) {
     638           2 :     SIScheduleBlocks Res;
     639           2 :     createBlocksForVariant(BlockVariant);
     640           2 :     topologicalSort();
     641           2 :     scheduleInsideBlocks();
     642           2 :     fillStats();
     643           2 :     Res.Blocks = CurrentBlocks;
     644           2 :     Res.TopDownIndex2Block = TopDownIndex2Block;
     645           2 :     Res.TopDownBlock2Index = TopDownBlock2Index;
     646           2 :     Blocks[BlockVariant] = Res;
     647           2 :     return Res;
     648             :   } else {
     649           0 :     return B->second;
     650             :   }
     651             : }
     652             : 
     653          87 : bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
     654         174 :   if (SU->NodeNum >= DAG->SUnits.size())
     655             :     return false;
     656         252 :   return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
     657             : }
     658             : 
     659           2 : void SIScheduleBlockCreator::colorHighLatenciesAlone() {
     660           4 :   unsigned DAGSize = DAG->SUnits.size();
     661             : 
     662          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     663          16 :     SUnit *SU = &DAG->SUnits[i];
     664          32 :     if (DAG->IsHighLatencySU[SU->NodeNum]) {
     665           2 :       CurrentColoring[SU->NodeNum] = NextReservedID++;
     666             :     }
     667             :   }
     668           2 : }
     669             : 
     670             : static bool
     671             : hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
     672           0 :   for (const auto &PredDep : SU.Preds) {
     673           0 :     if (PredDep.getSUnit() == &FromSU &&
     674             :         PredDep.getKind() == llvm::SDep::Data)
     675             :       return true;
     676             :   }
     677             :   return false;
     678             : }
     679             : 
     680           0 : void SIScheduleBlockCreator::colorHighLatenciesGroups() {
     681           0 :   unsigned DAGSize = DAG->SUnits.size();
     682             :   unsigned NumHighLatencies = 0;
     683             :   unsigned GroupSize;
     684           0 :   int Color = NextReservedID;
     685             :   unsigned Count = 0;
     686             :   std::set<unsigned> FormingGroup;
     687             : 
     688           0 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     689           0 :     SUnit *SU = &DAG->SUnits[i];
     690           0 :     if (DAG->IsHighLatencySU[SU->NodeNum])
     691           0 :       ++NumHighLatencies;
     692             :   }
     693             : 
     694           0 :   if (NumHighLatencies == 0)
     695             :     return;
     696             : 
     697           0 :   if (NumHighLatencies <= 6)
     698             :     GroupSize = 2;
     699           0 :   else if (NumHighLatencies <= 12)
     700             :     GroupSize = 3;
     701             :   else
     702             :     GroupSize = 4;
     703             : 
     704           0 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     705           0 :     const SUnit &SU = DAG->SUnits[SUNum];
     706           0 :     if (DAG->IsHighLatencySU[SU.NodeNum]) {
     707             :       unsigned CompatibleGroup = true;
     708             :       int ProposedColor = Color;
     709             :       std::vector<int> AdditionalElements;
     710             : 
     711             :       // We don't want to put in the same block
     712             :       // two high latency instructions that depend
     713             :       // on each other.
     714             :       // One way would be to check canAddEdge
     715             :       // in both directions, but that currently is not
     716             :       // enough because there the high latency order is
     717             :       // enforced (via links).
     718             :       // Instead, look at the dependencies between the
     719             :       // high latency instructions and deduce if it is
     720             :       // a data dependency or not.
     721           0 :       for (unsigned j : FormingGroup) {
     722             :         bool HasSubGraph;
     723             :         std::vector<int> SubGraph;
     724             :         // By construction (topological order), if SU and
     725             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
     726             :         // in the parent graph of SU.
     727             : #ifndef NDEBUG
     728             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
     729             :                                                HasSubGraph);
     730             :         assert(!HasSubGraph);
     731             : #endif
     732           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
     733             :                                                HasSubGraph);
     734           0 :         if (!HasSubGraph)
     735             :           continue; // No dependencies between each other
     736           0 :         else if (SubGraph.size() > 5) {
     737             :           // Too many elements would be required to be added to the block.
     738             :           CompatibleGroup = false;
     739             :           break;
     740             :         }
     741             :         else {
     742             :           // Check the type of dependency
     743           0 :           for (unsigned k : SubGraph) {
     744             :             // If in the path to join the two instructions,
     745             :             // there is another high latency instruction,
     746             :             // or instructions colored for another block
     747             :             // abort the merge.
     748           0 :             if (DAG->IsHighLatencySU[k] ||
     749           0 :                 (CurrentColoring[k] != ProposedColor &&
     750           0 :                  CurrentColoring[k] != 0)) {
     751             :               CompatibleGroup = false;
     752             :               break;
     753             :             }
     754             :             // If one of the SU in the subgraph depends on the result of SU j,
     755             :             // there'll be a data dependency.
     756           0 :             if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
     757             :               CompatibleGroup = false;
     758             :               break;
     759             :             }
     760             :           }
     761           0 :           if (!CompatibleGroup)
     762             :             break;
     763             :           // Same check for the SU
     764           0 :           if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
     765             :             CompatibleGroup = false;
     766             :             break;
     767             :           }
     768             :           // Add all the required instructions to the block
     769             :           // These cannot live in another block (because they
     770             :           // depend (order dependency) on one of the
     771             :           // instruction in the block, and are required for the
     772             :           // high latency instruction we add.
     773             :           AdditionalElements.insert(AdditionalElements.end(),
     774           0 :                                     SubGraph.begin(), SubGraph.end());
     775             :         }
     776             :       }
     777           0 :       if (CompatibleGroup) {
     778           0 :         FormingGroup.insert(SU.NodeNum);
     779           0 :         for (unsigned j : AdditionalElements)
     780           0 :           CurrentColoring[j] = ProposedColor;
     781           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     782           0 :         ++Count;
     783             :       }
     784             :       // Found one incompatible instruction,
     785             :       // or has filled a big enough group.
     786             :       // -> start a new one.
     787           0 :       if (!CompatibleGroup) {
     788             :         FormingGroup.clear();
     789           0 :         Color = ++NextReservedID;
     790             :         ProposedColor = Color;
     791           0 :         FormingGroup.insert(SU.NodeNum);
     792           0 :         CurrentColoring[SU.NodeNum] = ProposedColor;
     793             :         Count = 0;
     794           0 :       } else if (Count == GroupSize) {
     795             :         FormingGroup.clear();
     796           0 :         Color = ++NextReservedID;
     797             :         ProposedColor = Color;
     798             :         Count = 0;
     799             :       }
     800             :     }
     801             :   }
     802             : }
     803             : 
     804           2 : void SIScheduleBlockCreator::colorComputeReservedDependencies() {
     805           4 :   unsigned DAGSize = DAG->SUnits.size();
     806             :   std::map<std::set<unsigned>, unsigned> ColorCombinations;
     807             : 
     808           2 :   CurrentTopDownReservedDependencyColoring.clear();
     809           2 :   CurrentBottomUpReservedDependencyColoring.clear();
     810             : 
     811           2 :   CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
     812           2 :   CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);
     813             : 
     814             :   // Traverse TopDown, and give different colors to SUs depending
     815             :   // on which combination of High Latencies they depend on.
     816             : 
     817          18 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
     818          16 :     SUnit *SU = &DAG->SUnits[SUNum];
     819             :     std::set<unsigned> SUColors;
     820             : 
     821             :     // Already given.
     822          32 :     if (CurrentColoring[SU->NodeNum]) {
     823           1 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     824             :         CurrentColoring[SU->NodeNum];
     825           1 :       continue;
     826             :     }
     827             : 
     828          36 :    for (SDep& PredDep : SU->Preds) {
     829             :       SUnit *Pred = PredDep.getSUnit();
     830          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
     831             :         continue;
     832          42 :       if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
     833           5 :         SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
     834             :     }
     835             :     // Color 0 by default.
     836          15 :     if (SUColors.empty())
     837             :       continue;
     838             :     // Same color than parents.
     839           3 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     840           0 :       CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     841           0 :         *SUColors.begin();
     842             :     else {
     843             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     844             :         ColorCombinations.find(SUColors);
     845           3 :       if (Pos != ColorCombinations.end()) {
     846           2 :           CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
     847             :       } else {
     848           2 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
     849           2 :           NextNonReservedID;
     850           2 :         ColorCombinations[SUColors] = NextNonReservedID++;
     851             :       }
     852             :     }
     853             :   }
     854             : 
     855             :   ColorCombinations.clear();
     856             : 
     857             :   // Same as before, but BottomUp.
     858             : 
     859          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     860          16 :     SUnit *SU = &DAG->SUnits[SUNum];
     861             :     std::set<unsigned> SUColors;
     862             : 
     863             :     // Already given.
     864          32 :     if (CurrentColoring[SU->NodeNum]) {
     865           1 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     866             :         CurrentColoring[SU->NodeNum];
     867           1 :       continue;
     868             :     }
     869             : 
     870          36 :     for (SDep& SuccDep : SU->Succs) {
     871             :       SUnit *Succ = SuccDep.getSUnit();
     872          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     873             :         continue;
     874          36 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
     875           0 :         SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
     876             :     }
     877             :     // Keep color 0.
     878          15 :     if (SUColors.empty())
     879             :       continue;
     880             :     // Same color than parents.
     881           0 :     if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
     882           0 :       CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     883           0 :         *SUColors.begin();
     884             :     else {
     885             :       std::map<std::set<unsigned>, unsigned>::iterator Pos =
     886             :         ColorCombinations.find(SUColors);
     887           0 :       if (Pos != ColorCombinations.end()) {
     888           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
     889             :       } else {
     890           0 :         CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
     891           0 :           NextNonReservedID;
     892           0 :         ColorCombinations[SUColors] = NextNonReservedID++;
     893             :       }
     894             :     }
     895             :   }
     896           2 : }
     897             : 
     898           2 : void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
     899           4 :   unsigned DAGSize = DAG->SUnits.size();
     900             :   std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
     901             : 
     902             :   // Every combination of colors given by the top down
     903             :   // and bottom up Reserved node dependency
     904             : 
     905          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
     906          16 :     SUnit *SU = &DAG->SUnits[i];
     907          16 :     std::pair<unsigned, unsigned> SUColors;
     908             : 
     909             :     // High latency instructions: already given.
     910          32 :     if (CurrentColoring[SU->NodeNum])
     911           1 :       continue;
     912             : 
     913          15 :     SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
     914          30 :     SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
     915             : 
     916             :     std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
     917             :       ColorCombinations.find(SUColors);
     918          15 :     if (Pos != ColorCombinations.end()) {
     919          24 :       CurrentColoring[SU->NodeNum] = Pos->second;
     920             :     } else {
     921           3 :       CurrentColoring[SU->NodeNum] = NextNonReservedID;
     922           3 :       ColorCombinations[SUColors] = NextNonReservedID++;
     923             :     }
     924             :   }
     925           2 : }
     926             : 
     927           2 : void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
     928           2 :   unsigned DAGSize = DAG->SUnits.size();
     929           2 :   std::vector<int> PendingColoring = CurrentColoring;
     930             : 
     931             :   assert(DAGSize >= 1 &&
     932             :          CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
     933             :          CurrentTopDownReservedDependencyColoring.size() == DAGSize);
     934             :   // If there is no reserved block at all, do nothing. We don't want
     935             :   // everything in one block.
     936           2 :   if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
     937           2 :                         CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
     938           1 :       *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
     939             :                         CurrentTopDownReservedDependencyColoring.end()) == 0)
     940             :     return;
     941             : 
     942           5 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     943           4 :     SUnit *SU = &DAG->SUnits[SUNum];
     944             :     std::set<unsigned> SUColors;
     945             :     std::set<unsigned> SUColorsPending;
     946             : 
     947           8 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     948             :       continue;
     949             : 
     950           6 :     if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
     951           6 :         CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
     952             :       continue;
     953             : 
     954           0 :     for (SDep& SuccDep : SU->Succs) {
     955             :       SUnit *Succ = SuccDep.getSUnit();
     956           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
     957             :         continue;
     958           0 :       if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
     959           0 :           CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
     960           0 :         SUColors.insert(CurrentColoring[Succ->NodeNum]);
     961           0 :       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
     962             :     }
     963             :     // If there is only one child/parent block, and that block
     964             :     // is not among the ones we are removing in this path, then
     965             :     // merge the instruction to that block
     966           0 :     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
     967           0 :       PendingColoring[SU->NodeNum] = *SUColors.begin();
     968             :     else // TODO: Attribute new colors depending on color
     969             :          // combination of children.
     970           0 :       PendingColoring[SU->NodeNum] = NextNonReservedID++;
     971             :   }
     972           1 :   CurrentColoring = PendingColoring;
     973             : }
     974             : 
     975             : 
     976           0 : void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
     977           0 :   unsigned DAGSize = DAG->SUnits.size();
     978             :   unsigned PreviousColor;
     979             :   std::set<unsigned> SeenColors;
     980             : 
     981           0 :   if (DAGSize <= 1)
     982             :     return;
     983             : 
     984           0 :   PreviousColor = CurrentColoring[0];
     985             : 
     986           0 :   for (unsigned i = 1, e = DAGSize; i != e; ++i) {
     987           0 :     SUnit *SU = &DAG->SUnits[i];
     988           0 :     unsigned CurrentColor = CurrentColoring[i];
     989           0 :     unsigned PreviousColorSave = PreviousColor;
     990             :     assert(i == SU->NodeNum);
     991             : 
     992           0 :     if (CurrentColor != PreviousColor)
     993             :       SeenColors.insert(PreviousColor);
     994           0 :     PreviousColor = CurrentColor;
     995             : 
     996           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
     997             :       continue;
     998             : 
     999           0 :     if (SeenColors.find(CurrentColor) == SeenColors.end())
    1000             :       continue;
    1001             : 
    1002           0 :     if (PreviousColorSave != CurrentColor)
    1003           0 :       CurrentColoring[i] = NextNonReservedID++;
    1004             :     else
    1005           0 :       CurrentColoring[i] = CurrentColoring[i-1];
    1006             :   }
    1007             : }
    1008             : 
    1009           2 : void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
    1010           4 :   unsigned DAGSize = DAG->SUnits.size();
    1011             : 
    1012          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1013          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1014             :     std::set<unsigned> SUColors;
    1015             : 
    1016          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1017             :       continue;
    1018             : 
    1019             :     // No predecessor: Vgpr constant loading.
    1020             :     // Low latency instructions usually have a predecessor (the address)
    1021          15 :     if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
    1022             :       continue;
    1023             : 
    1024          16 :     for (SDep& SuccDep : SU->Succs) {
    1025             :       SUnit *Succ = SuccDep.getSUnit();
    1026           9 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1027             :         continue;
    1028          14 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1029             :     }
    1030           7 :     if (SUColors.size() == 1)
    1031           8 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1032             :   }
    1033           2 : }
    1034             : 
    1035           0 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
    1036           0 :   unsigned DAGSize = DAG->SUnits.size();
    1037             : 
    1038           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1039           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1040             :     std::set<unsigned> SUColors;
    1041             : 
    1042           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1043             :       continue;
    1044             : 
    1045           0 :     for (SDep& SuccDep : SU->Succs) {
    1046             :        SUnit *Succ = SuccDep.getSUnit();
    1047           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1048             :         continue;
    1049           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1050             :     }
    1051           0 :     if (SUColors.size() == 1)
    1052           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1053             :   }
    1054           0 : }
    1055             : 
    1056           2 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
    1057           4 :   unsigned DAGSize = DAG->SUnits.size();
    1058             : 
    1059          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1060          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1061             :     std::set<unsigned> SUColors;
    1062             : 
    1063          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1064             :       continue;
    1065             : 
    1066          36 :     for (SDep& SuccDep : SU->Succs) {
    1067             :        SUnit *Succ = SuccDep.getSUnit();
    1068          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1069             :         continue;
    1070          36 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1071             :     }
    1072          15 :     if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
    1073           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1074             :   }
    1075           2 : }
    1076             : 
    1077           0 : void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
    1078           0 :   unsigned DAGSize = DAG->SUnits.size();
    1079             :   std::map<unsigned, unsigned> ColorCount;
    1080             : 
    1081           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1082           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1083           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1084           0 :      ++ColorCount[color];
    1085             :   }
    1086             : 
    1087           0 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1088           0 :     SUnit *SU = &DAG->SUnits[SUNum];
    1089           0 :     unsigned color = CurrentColoring[SU->NodeNum];
    1090             :     std::set<unsigned> SUColors;
    1091             : 
    1092           0 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1093             :       continue;
    1094             : 
    1095           0 :     if (ColorCount[color] > 1)
    1096             :       continue;
    1097             : 
    1098           0 :     for (SDep& SuccDep : SU->Succs) {
    1099             :        SUnit *Succ = SuccDep.getSUnit();
    1100           0 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1101             :         continue;
    1102           0 :       SUColors.insert(CurrentColoring[Succ->NodeNum]);
    1103             :     }
    1104           0 :     if (SUColors.size() == 1 && *SUColors.begin() != color) {
    1105           0 :       --ColorCount[color];
    1106           0 :       CurrentColoring[SU->NodeNum] = *SUColors.begin();
    1107           0 :       ++ColorCount[*SUColors.begin()];
    1108             :     }
    1109             :   }
    1110           0 : }
    1111             : 
    1112           0 : void SIScheduleBlockCreator::cutHugeBlocks() {
    1113             :   // TODO
    1114           0 : }
    1115             : 
    1116           2 : void SIScheduleBlockCreator::regroupNoUserInstructions() {
    1117           2 :   unsigned DAGSize = DAG->SUnits.size();
    1118           2 :   int GroupID = NextNonReservedID++;
    1119             : 
    1120          18 :   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    1121          16 :     SUnit *SU = &DAG->SUnits[SUNum];
    1122             :     bool hasSuccessor = false;
    1123             : 
    1124          32 :     if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
    1125             :       continue;
    1126             : 
    1127          36 :     for (SDep& SuccDep : SU->Succs) {
    1128             :        SUnit *Succ = SuccDep.getSUnit();
    1129          21 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1130           3 :         continue;
    1131             :       hasSuccessor = true;
    1132             :     }
    1133          15 :     if (!hasSuccessor)
    1134           4 :       CurrentColoring[SU->NodeNum] = GroupID;
    1135             :   }
    1136           2 : }
    1137             : 
    1138           2 : void SIScheduleBlockCreator::colorExports() {
    1139           2 :   unsigned ExportColor = NextNonReservedID++;
    1140             :   SmallVector<unsigned, 8> ExpGroup;
    1141             : 
    1142             :   // Put all exports together in a block.
    1143             :   // The block will naturally end up being scheduled last,
    1144             :   // thus putting exports at the end of the schedule, which
    1145             :   // is better for performance.
    1146             :   // However we must ensure, for safety, the exports can be put
    1147             :   // together in the same block without any other instruction.
    1148             :   // This could happen, for example, when scheduling after regalloc
    1149             :   // if reloading a spilled register from memory using the same
    1150             :   // register than used in a previous export.
    1151             :   // If that happens, do not regroup the exports.
    1152          18 :   for (unsigned SUNum : DAG->TopDownIndex2SU) {
    1153          16 :     const SUnit &SU = DAG->SUnits[SUNum];
    1154          32 :     if (SIInstrInfo::isEXP(*SU.getInstr())) {
    1155             :       // Check the EXP can be added to the group safely,
    1156             :       // ie without needing any other instruction.
    1157             :       // The EXP is allowed to depend on other EXP
    1158             :       // (they will be in the same group).
    1159           1 :       for (unsigned j : ExpGroup) {
    1160             :         bool HasSubGraph;
    1161             :         std::vector<int> SubGraph;
    1162             :         // By construction (topological order), if SU and
    1163             :         // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
    1164             :         // in the parent graph of SU.
    1165             : #ifndef NDEBUG
    1166             :         SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
    1167             :                                                HasSubGraph);
    1168             :         assert(!HasSubGraph);
    1169             : #endif
    1170           0 :         SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
    1171             :                                                HasSubGraph);
    1172           0 :         if (!HasSubGraph)
    1173             :           continue; // No dependencies between each other
    1174             : 
    1175             :         // SubGraph contains all the instructions required
    1176             :         // between EXP SUnits[j] and EXP SU.
    1177           0 :         for (unsigned k : SubGraph) {
    1178           0 :           if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
    1179             :             // Other instructions than EXP would be required in the group.
    1180             :             // Abort the groupping.
    1181             :             return;
    1182             :         }
    1183             :       }
    1184             : 
    1185           1 :       ExpGroup.push_back(SUNum);
    1186             :     }
    1187             :   }
    1188             : 
    1189             :   // The group can be formed. Give the color.
    1190           3 :   for (unsigned j : ExpGroup)
    1191           2 :     CurrentColoring[j] = ExportColor;
    1192             : }
    1193             : 
    1194           2 : void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
    1195           4 :   unsigned DAGSize = DAG->SUnits.size();
    1196             :   std::map<unsigned,unsigned> RealID;
    1197             : 
    1198           2 :   CurrentBlocks.clear();
    1199           2 :   CurrentColoring.clear();
    1200           2 :   CurrentColoring.resize(DAGSize, 0);
    1201           2 :   Node2CurrentBlock.clear();
    1202             : 
    1203             :   // Restore links previous scheduling variant has overridden.
    1204           2 :   DAG->restoreSULinksLeft();
    1205             : 
    1206           2 :   NextReservedID = 1;
    1207           2 :   NextNonReservedID = DAGSize + 1;
    1208             : 
    1209             :   LLVM_DEBUG(dbgs() << "Coloring the graph\n");
    1210             : 
    1211           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    1212           0 :     colorHighLatenciesGroups();
    1213             :   else
    1214           2 :     colorHighLatenciesAlone();
    1215           2 :   colorComputeReservedDependencies();
    1216           2 :   colorAccordingToReservedDependencies();
    1217           2 :   colorEndsAccordingToDependencies();
    1218           2 :   if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    1219           0 :     colorForceConsecutiveOrderInGroup();
    1220           2 :   regroupNoUserInstructions();
    1221           2 :   colorMergeConstantLoadsNextGroup();
    1222           2 :   colorMergeIfPossibleNextGroupOnlyForReserved();
    1223           2 :   colorExports();
    1224             : 
    1225             :   // Put SUs of same color into same block
    1226           2 :   Node2CurrentBlock.resize(DAGSize, -1);
    1227          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1228          16 :     SUnit *SU = &DAG->SUnits[i];
    1229          32 :     unsigned Color = CurrentColoring[SU->NodeNum];
    1230          16 :     if (RealID.find(Color) == RealID.end()) {
    1231           5 :       int ID = CurrentBlocks.size();
    1232          10 :       BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
    1233           5 :       CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
    1234           5 :       RealID[Color] = ID;
    1235             :     }
    1236          16 :     CurrentBlocks[RealID[Color]]->addUnit(SU);
    1237          16 :     Node2CurrentBlock[SU->NodeNum] = RealID[Color];
    1238             :   }
    1239             : 
    1240             :   // Build dependencies between blocks.
    1241          18 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1242          16 :     SUnit *SU = &DAG->SUnits[i];
    1243          32 :     int SUID = Node2CurrentBlock[i];
    1244          40 :      for (SDep& SuccDep : SU->Succs) {
    1245             :        SUnit *Succ = SuccDep.getSUnit();
    1246          24 :       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
    1247             :         continue;
    1248          42 :       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
    1249          32 :         CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
    1250             :                                      SuccDep.isCtrl() ? NoData : Data);
    1251             :     }
    1252          37 :     for (SDep& PredDep : SU->Preds) {
    1253             :       SUnit *Pred = PredDep.getSUnit();
    1254          21 :       if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
    1255             :         continue;
    1256          42 :       if (Node2CurrentBlock[Pred->NodeNum] != SUID)
    1257          27 :         CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    1258             :     }
    1259             :   }
    1260             : 
    1261             :   // Free root and leafs of all blocks to enable scheduling inside them.
    1262           9 :   for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1263           5 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1264           5 :     Block->finalizeUnits();
    1265             :   }
    1266             :   LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
    1267             :              for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1268             :                SIScheduleBlock *Block = CurrentBlocks[i];
    1269             :                Block->printDebug(true);
    1270             :              });
    1271           2 : }
    1272             : 
    1273             : // Two functions taken from Codegen/MachineScheduler.cpp
    1274             : 
    1275             : /// Non-const version.
    1276             : static MachineBasicBlock::iterator
    1277          12 : nextIfDebug(MachineBasicBlock::iterator I,
    1278             :             MachineBasicBlock::const_iterator End) {
    1279          12 :   for (; I != End; ++I) {
    1280             :     if (!I->isDebugInstr())
    1281             :       break;
    1282             :   }
    1283          12 :   return I;
    1284             : }
    1285             : 
    1286           2 : void SIScheduleBlockCreator::topologicalSort() {
    1287           4 :   unsigned DAGSize = CurrentBlocks.size();
    1288             :   std::vector<int> WorkList;
    1289             : 
    1290             :   LLVM_DEBUG(dbgs() << "Topological Sort\n");
    1291             : 
    1292           2 :   WorkList.reserve(DAGSize);
    1293           2 :   TopDownIndex2Block.resize(DAGSize);
    1294           2 :   TopDownBlock2Index.resize(DAGSize);
    1295           2 :   BottomUpIndex2Block.resize(DAGSize);
    1296             : 
    1297           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1298          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1299           5 :     unsigned Degree = Block->getSuccs().size();
    1300           5 :     TopDownBlock2Index[i] = Degree;
    1301           5 :     if (Degree == 0) {
    1302           2 :       WorkList.push_back(i);
    1303             :     }
    1304             :   }
    1305             : 
    1306           2 :   int Id = DAGSize;
    1307           7 :   while (!WorkList.empty()) {
    1308           5 :     int i = WorkList.back();
    1309          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1310             :     WorkList.pop_back();
    1311           5 :     TopDownBlock2Index[i] = --Id;
    1312           5 :     TopDownIndex2Block[Id] = i;
    1313           9 :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1314           8 :       if (!--TopDownBlock2Index[Pred->getID()])
    1315           3 :         WorkList.push_back(Pred->getID());
    1316             :     }
    1317             :   }
    1318             : 
    1319             : #ifndef NDEBUG
    1320             :   // Check correctness of the ordering.
    1321             :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1322             :     SIScheduleBlock *Block = CurrentBlocks[i];
    1323             :     for (SIScheduleBlock* Pred : Block->getPreds()) {
    1324             :       assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
    1325             :       "Wrong Top Down topological sorting");
    1326             :     }
    1327             :   }
    1328             : #endif
    1329             : 
    1330           2 :   BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
    1331           2 :                                          TopDownIndex2Block.rend());
    1332           2 : }
    1333             : 
    1334           2 : void SIScheduleBlockCreator::scheduleInsideBlocks() {
    1335           4 :   unsigned DAGSize = CurrentBlocks.size();
    1336             : 
    1337             :   LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");
    1338             : 
    1339             :   // We do schedule a valid scheduling such that a Block corresponds
    1340             :   // to a range of instructions.
    1341             :   LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
    1342           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1343           5 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1344           5 :     Block->fastSchedule();
    1345             :   }
    1346             : 
    1347             :   // Note: the following code, and the part restoring previous position
    1348             :   // is by far the most expensive operation of the Scheduler.
    1349             : 
    1350             :   // Do not update CurrentTop.
    1351           2 :   MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
    1352             :   std::vector<MachineBasicBlock::iterator> PosOld;
    1353             :   std::vector<MachineBasicBlock::iterator> PosNew;
    1354           4 :   PosOld.reserve(DAG->SUnits.size());
    1355           4 :   PosNew.reserve(DAG->SUnits.size());
    1356             : 
    1357           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1358           5 :     int BlockIndice = TopDownIndex2Block[i];
    1359          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1360             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1361             : 
    1362          21 :     for (SUnit* SU : SUs) {
    1363          16 :       MachineInstr *MI = SU->getInstr();
    1364             :       MachineBasicBlock::iterator Pos = MI;
    1365          16 :       PosOld.push_back(Pos);
    1366          16 :       if (&*CurrentTopFastSched == MI) {
    1367          12 :         PosNew.push_back(Pos);
    1368             :         CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
    1369          24 :                                           DAG->getCurrentBottom());
    1370             :       } else {
    1371             :         // Update the instruction stream.
    1372           8 :         DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);
    1373             : 
    1374             :         // Update LiveIntervals.
    1375             :         // Note: Moving all instructions and calling handleMove every time
    1376             :         // is the most cpu intensive operation of the scheduler.
    1377             :         // It would gain a lot if there was a way to recompute the
    1378             :         // LiveIntervals for the entire scheduling region.
    1379           4 :         DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
    1380           4 :         PosNew.push_back(CurrentTopFastSched);
    1381             :       }
    1382             :     }
    1383             :   }
    1384             : 
    1385             :   // Now we have Block of SUs == Block of MI.
    1386             :   // We do the final schedule for the instructions inside the block.
    1387             :   // The property that all the SUs of the Block are grouped together as MI
    1388             :   // is used for correct reg usage tracking.
    1389           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1390          10 :     SIScheduleBlock *Block = CurrentBlocks[i];
    1391             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1392          15 :     Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
    1393             :   }
    1394             : 
    1395             :   LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
    1396             :   // Restore old ordering (which prevents a LIS->handleMove bug).
    1397          20 :   for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    1398          16 :     MachineBasicBlock::iterator POld = PosOld[i-1];
    1399          16 :     MachineBasicBlock::iterator PNew = PosNew[i-1];
    1400          16 :     if (PNew != POld) {
    1401             :       // Update the instruction stream.
    1402           4 :       DAG->getBB()->splice(POld, DAG->getBB(), PNew);
    1403             : 
    1404             :       // Update LiveIntervals.
    1405           4 :       DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    1406             :     }
    1407             :   }
    1408             : 
    1409             :   LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    1410             :     SIScheduleBlock *Block = CurrentBlocks[i];
    1411             :     Block->printDebug(true);
    1412             :   });
    1413           2 : }
    1414             : 
    1415           2 : void SIScheduleBlockCreator::fillStats() {
    1416           4 :   unsigned DAGSize = CurrentBlocks.size();
    1417             : 
    1418           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1419           5 :     int BlockIndice = TopDownIndex2Block[i];
    1420          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1421           5 :     if (Block->getPreds().empty())
    1422           2 :       Block->Depth = 0;
    1423             :     else {
    1424             :       unsigned Depth = 0;
    1425           7 :       for (SIScheduleBlock *Pred : Block->getPreds()) {
    1426           8 :         if (Depth < Pred->Depth + Pred->getCost())
    1427             :           Depth = Pred->Depth + Pred->getCost();
    1428             :       }
    1429           3 :       Block->Depth = Depth;
    1430             :     }
    1431             :   }
    1432             : 
    1433           7 :   for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    1434           5 :     int BlockIndice = BottomUpIndex2Block[i];
    1435          10 :     SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    1436           5 :     if (Block->getSuccs().empty())
    1437           2 :       Block->Height = 0;
    1438             :     else {
    1439           3 :       unsigned Height = 0;
    1440           7 :       for (const auto &Succ : Block->getSuccs())
    1441          11 :         Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
    1442           3 :       Block->Height = Height;
    1443             :     }
    1444             :   }
    1445           2 : }
    1446             : 
    1447             : // SIScheduleBlockScheduler //
    1448             : 
    1449           2 : SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
    1450             :                                                    SISchedulerBlockSchedulerVariant Variant,
    1451           2 :                                                    SIScheduleBlocks  BlocksStruct) :
    1452           2 :   DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
    1453             :   LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
    1454           2 :   SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
    1455             : 
    1456             :   // Fill the usage of every output
    1457             :   // Warning: while by construction we always have a link between two blocks
    1458             :   // when one needs a result from the other, the number of users of an output
    1459             :   // is not the sum of child blocks having as input the same virtual register.
    1460             :   // Here is an example. A produces x and y. B eats x and produces x'.
    1461             :   // C eats x' and y. The register coalescer may have attributed the same
    1462             :   // virtual register to x and x'.
    1463             :   // To count accurately, we do a topological sort. In case the register is
    1464             :   // found for several parents, we increment the usage of the one with the
    1465             :   // highest topological index.
    1466           4 :   LiveOutRegsNumUsages.resize(Blocks.size());
    1467           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1468          10 :     SIScheduleBlock *Block = Blocks[i];
    1469          13 :     for (unsigned Reg : Block->getInRegs()) {
    1470             :       bool Found = false;
    1471             :       int topoInd = -1;
    1472          15 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1473             :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1474             :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1475             : 
    1476           7 :         if (RegPos != PredOutRegs.end()) {
    1477             :           Found = true;
    1478          10 :           if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
    1479             :             topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
    1480             :           }
    1481             :         }
    1482             :       }
    1483             : 
    1484           8 :       if (!Found)
    1485             :         continue;
    1486             : 
    1487           5 :       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
    1488          10 :       ++LiveOutRegsNumUsages[PredID][Reg];
    1489             :     }
    1490             :   }
    1491             : 
    1492           4 :   LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
    1493           4 :   BlockNumPredsLeft.resize(Blocks.size());
    1494           4 :   BlockNumSuccsLeft.resize(Blocks.size());
    1495             : 
    1496           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1497           5 :     SIScheduleBlock *Block = Blocks[i];
    1498          15 :     BlockNumPredsLeft[i] = Block->getPreds().size();
    1499          10 :     BlockNumSuccsLeft[i] = Block->getSuccs().size();
    1500             :   }
    1501             : 
    1502             : #ifndef NDEBUG
    1503             :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1504             :     SIScheduleBlock *Block = Blocks[i];
    1505             :     assert(Block->getID() == i);
    1506             :   }
    1507             : #endif
    1508             : 
    1509           2 :   std::set<unsigned> InRegs = DAG->getInRegs();
    1510           2 :   addLiveRegs(InRegs);
    1511             : 
    1512             :   // Increase LiveOutRegsNumUsages for blocks
    1513             :   // producing registers consumed in another
    1514             :   // scheduling region.
    1515           7 :   for (unsigned Reg : DAG->getOutRegs()) {
    1516           6 :     for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1517             :       // Do reverse traversal
    1518           3 :       int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
    1519           6 :       SIScheduleBlock *Block = Blocks[ID];
    1520             :       const std::set<unsigned> &OutRegs = Block->getOutRegs();
    1521             : 
    1522           3 :       if (OutRegs.find(Reg) == OutRegs.end())
    1523             :         continue;
    1524             : 
    1525           6 :       ++LiveOutRegsNumUsages[ID][Reg];
    1526           3 :       break;
    1527             :     }
    1528             :   }
    1529             : 
    1530             :   // Fill LiveRegsConsumers for regs that were already
    1531             :   // defined before scheduling.
    1532           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1533          10 :     SIScheduleBlock *Block = Blocks[i];
    1534          13 :     for (unsigned Reg : Block->getInRegs()) {
    1535             :       bool Found = false;
    1536           8 :       for (SIScheduleBlock* Pred: Block->getPreds()) {
    1537             :         std::set<unsigned> PredOutRegs = Pred->getOutRegs();
    1538             :         std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
    1539             : 
    1540           5 :         if (RegPos != PredOutRegs.end()) {
    1541             :           Found = true;
    1542             :           break;
    1543             :         }
    1544             :       }
    1545             : 
    1546             :       if (!Found)
    1547           3 :         ++LiveRegsConsumers[Reg];
    1548             :     }
    1549             :   }
    1550             : 
    1551           9 :   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    1552           5 :     SIScheduleBlock *Block = Blocks[i];
    1553          10 :     if (BlockNumPredsLeft[i] == 0) {
    1554           2 :       ReadyBlocks.push_back(Block);
    1555             :     }
    1556             :   }
    1557             : 
    1558           7 :   while (SIScheduleBlock *Block = pickBlock()) {
    1559           5 :     BlocksScheduled.push_back(Block);
    1560           5 :     blockScheduled(Block);
    1561           5 :   }
    1562             : 
    1563             :   LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
    1564             :                                             : BlocksScheduled) {
    1565             :     dbgs() << ' ' << Block->getID();
    1566             :   } dbgs() << '\n';);
    1567           2 : }
    1568             : 
    1569           5 : bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
    1570             :                                                    SIBlockSchedCandidate &TryCand) {
    1571           5 :   if (!Cand.isValid()) {
    1572           5 :     TryCand.Reason = NodeOrder;
    1573           5 :     return true;
    1574             :   }
    1575             : 
    1576             :   // Try to hide high latencies.
    1577           0 :   if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
    1578           0 :                  Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    1579           0 :     return true;
    1580             :   // Schedule high latencies early so you can hide them better.
    1581           0 :   if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
    1582             :                           TryCand, Cand, Latency))
    1583           0 :     return true;
    1584           0 :   if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
    1585             :                                                    TryCand, Cand, Depth))
    1586           0 :     return true;
    1587           0 :   if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
    1588           0 :                           Cand.NumHighLatencySuccessors,
    1589             :                           TryCand, Cand, Successor))
    1590           0 :     return true;
    1591             :   return false;
    1592             : }
    1593             : 
    1594           0 : bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
    1595             :                                                     SIBlockSchedCandidate &TryCand) {
    1596           0 :   if (!Cand.isValid()) {
    1597           0 :     TryCand.Reason = NodeOrder;
    1598           0 :     return true;
    1599             :   }
    1600             : 
    1601           0 :   if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
    1602             :                        TryCand, Cand, RegUsage))
    1603           0 :     return true;
    1604           0 :   if (SISched::tryGreater(TryCand.NumSuccessors > 0,
    1605           0 :                           Cand.NumSuccessors > 0,
    1606             :                           TryCand, Cand, Successor))
    1607           0 :     return true;
    1608           0 :   if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    1609           0 :     return true;
    1610           0 :   if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
    1611             :                        TryCand, Cand, RegUsage))
    1612           0 :     return true;
    1613             :   return false;
    1614             : }
    1615             : 
    1616           7 : SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
    1617             :   SIBlockSchedCandidate Cand;
    1618             :   std::vector<SIScheduleBlock*>::iterator Best;
    1619             :   SIScheduleBlock *Block;
    1620           7 :   if (ReadyBlocks.empty())
    1621             :     return nullptr;
    1622             : 
    1623           5 :   DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
    1624           5 :                         VregCurrentUsage, SregCurrentUsage);
    1625           5 :   if (VregCurrentUsage > maxVregUsage)
    1626           3 :     maxVregUsage = VregCurrentUsage;
    1627           5 :   if (SregCurrentUsage > maxSregUsage)
    1628           2 :     maxSregUsage = SregCurrentUsage;
    1629             :   LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
    1630             :              for (SIScheduleBlock *Block
    1631             :                   : ReadyBlocks) dbgs()
    1632             :              << Block->getID() << ' ';
    1633             :              dbgs() << "\nCurrent Live:\n";
    1634             :              for (unsigned Reg
    1635             :                   : LiveRegs) dbgs()
    1636             :              << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
    1637             :              dbgs() << '\n';
    1638             :              dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
    1639             :              dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);
    1640             : 
    1641           5 :   Cand.Block = nullptr;
    1642             :   for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
    1643          10 :        E = ReadyBlocks.end(); I != E; ++I) {
    1644             :     SIBlockSchedCandidate TryCand;
    1645           5 :     TryCand.Block = *I;
    1646           5 :     TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    1647           5 :     TryCand.VGPRUsageDiff =
    1648           5 :       checkRegUsageImpact(TryCand.Block->getInRegs(),
    1649           5 :                           TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    1650           5 :     TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    1651           5 :     TryCand.NumHighLatencySuccessors =
    1652           5 :       TryCand.Block->getNumHighLatencySuccessors();
    1653           5 :     TryCand.LastPosHighLatParentScheduled =
    1654          15 :       (unsigned int) std::max<int> (0,
    1655           5 :          LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
    1656           5 :            LastPosWaitedHighLatency);
    1657           5 :     TryCand.Height = TryCand.Block->Height;
    1658             :     // Try not to increase VGPR usage too much, else we may spill.
    1659           5 :     if (VregCurrentUsage > 120 ||
    1660           5 :         Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
    1661           0 :       if (!tryCandidateRegUsage(Cand, TryCand) &&
    1662           0 :           Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
    1663           0 :         tryCandidateLatency(Cand, TryCand);
    1664             :     } else {
    1665           5 :       if (!tryCandidateLatency(Cand, TryCand))
    1666           0 :         tryCandidateRegUsage(Cand, TryCand);
    1667             :     }
    1668           5 :     if (TryCand.Reason != NoCand) {
    1669             :       Cand.setBest(TryCand);
    1670             :       Best = I;
    1671             :       LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
    1672             :                         << getReasonStr(Cand.Reason) << '\n');
    1673             :     }
    1674             :   }
    1675             : 
    1676             :   LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
    1677             :              dbgs() << "Is a block with high latency instruction: "
    1678             :                     << (Cand.IsHighLatency ? "yes\n" : "no\n");
    1679             :              dbgs() << "Position of last high latency dependency: "
    1680             :                     << Cand.LastPosHighLatParentScheduled << '\n';
    1681             :              dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
    1682             :              dbgs() << '\n';);
    1683             : 
    1684           5 :   Block = Cand.Block;
    1685           5 :   ReadyBlocks.erase(Best);
    1686           5 :   return Block;
    1687             : }
    1688             : 
    1689             : // Tracking of currently alive registers to determine VGPR Usage.
    1690             : 
    1691           7 : void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
    1692          26 :   for (unsigned Reg : Regs) {
    1693             :     // For now only track virtual registers.
    1694          19 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1695             :       continue;
    1696             :     // If not already in the live set, then add it.
    1697             :     (void) LiveRegs.insert(Reg);
    1698             :   }
    1699           7 : }
    1700             : 
    1701           5 : void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
    1702             :                                        std::set<unsigned> &Regs) {
    1703          13 :   for (unsigned Reg : Regs) {
    1704             :     // For now only track virtual registers.
    1705             :     std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
    1706             :     assert (Pos != LiveRegs.end() && // Reg must be live.
    1707             :                LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
    1708             :                LiveRegsConsumers[Reg] >= 1);
    1709           8 :     --LiveRegsConsumers[Reg];
    1710           8 :     if (LiveRegsConsumers[Reg] == 0)
    1711             :       LiveRegs.erase(Pos);
    1712             :   }
    1713           5 : }
    1714             : 
    1715           5 : void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
    1716           9 :   for (const auto &Block : Parent->getSuccs()) {
    1717           8 :     if (--BlockNumPredsLeft[Block.first->getID()] == 0)
    1718           3 :       ReadyBlocks.push_back(Block.first);
    1719             : 
    1720           4 :     if (Parent->isHighLatencyBlock() &&
    1721           2 :         Block.second == SIScheduleBlockLinkKind::Data)
    1722           2 :       LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
    1723             :   }
    1724           5 : }
    1725             : 
    1726           5 : void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
    1727           5 :   decreaseLiveRegs(Block, Block->getInRegs());
    1728           5 :   addLiveRegs(Block->getOutRegs());
    1729           5 :   releaseBlockSuccs(Block);
    1730             :   for (std::map<unsigned, unsigned>::iterator RegI =
    1731           5 :        LiveOutRegsNumUsages[Block->getID()].begin(),
    1732          13 :        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
    1733             :     std::pair<unsigned, unsigned> RegP = *RegI;
    1734             :     // We produce this register, thus it must not be previously alive.
    1735             :     assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
    1736             :            LiveRegsConsumers[RegP.first] == 0);
    1737           8 :     LiveRegsConsumers[RegP.first] += RegP.second;
    1738             :   }
    1739           5 :   if (LastPosHighLatencyParentScheduled[Block->getID()] >
    1740           5 :         (unsigned)LastPosWaitedHighLatency)
    1741           0 :     LastPosWaitedHighLatency =
    1742             :       LastPosHighLatencyParentScheduled[Block->getID()];
    1743           5 :   ++NumBlockScheduled;
    1744           5 : }
    1745             : 
    1746             : std::vector<int>
    1747           5 : SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
    1748             :                                      std::set<unsigned> &OutRegs) {
    1749             :   std::vector<int> DiffSetPressure;
    1750           5 :   DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
    1751             : 
    1752          13 :   for (unsigned Reg : InRegs) {
    1753             :     // For now only track virtual registers.
    1754           8 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1755             :       continue;
    1756           8 :     if (LiveRegsConsumers[Reg] > 1)
    1757             :       continue;
    1758           8 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1759          20 :     for (; PSetI.isValid(); ++PSetI) {
    1760          24 :       DiffSetPressure[*PSetI] -= PSetI.getWeight();
    1761             :     }
    1762             :   }
    1763             : 
    1764          13 :   for (unsigned Reg : OutRegs) {
    1765             :     // For now only track virtual registers.
    1766           8 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1767             :       continue;
    1768           8 :     PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
    1769          20 :     for (; PSetI.isValid(); ++PSetI) {
    1770          24 :       DiffSetPressure[*PSetI] += PSetI.getWeight();
    1771             :     }
    1772             :   }
    1773             : 
    1774           5 :   return DiffSetPressure;
    1775             : }
    1776             : 
    1777             : // SIScheduler //
    1778             : 
    1779             : struct SIScheduleBlockResult
    1780           2 : SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
    1781             :                              SISchedulerBlockSchedulerVariant ScheduleVariant) {
    1782           4 :   SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
    1783           6 :   SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
    1784             :   std::vector<SIScheduleBlock*> ScheduledBlocks;
    1785             :   struct SIScheduleBlockResult Res;
    1786             : 
    1787           2 :   ScheduledBlocks = Scheduler.getBlocks();
    1788             : 
    1789           9 :   for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
    1790           5 :     SIScheduleBlock *Block = ScheduledBlocks[b];
    1791             :     std::vector<SUnit*> SUs = Block->getScheduledUnits();
    1792             : 
    1793          21 :     for (SUnit* SU : SUs)
    1794          16 :       Res.SUs.push_back(SU->NodeNum);
    1795             :   }
    1796             : 
    1797           2 :   Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
    1798           2 :   Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
    1799           2 :   return Res;
    1800             : }
    1801             : 
    1802             : // SIScheduleDAGMI //
    1803             : 
    1804           1 : SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
    1805           2 :   ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
    1806           1 :   SITII = static_cast<const SIInstrInfo*>(TII);
    1807           1 :   SITRI = static_cast<const SIRegisterInfo*>(TRI);
    1808             : 
    1809           1 :   VGPRSetID = SITRI->getVGPRPressureSet();
    1810           1 :   SGPRSetID = SITRI->getSGPRPressureSet();
    1811           1 : }
    1812             : 
    1813             : SIScheduleDAGMI::~SIScheduleDAGMI() = default;
    1814             : 
    1815             : // Code adapted from scheduleDAG.cpp
    1816             : // Does a topological sort over the SUs.
    1817             : // Both TopDown and BottomUp
    1818           2 : void SIScheduleDAGMI::topologicalSort() {
    1819           2 :   Topo.InitDAGTopologicalSorting();
    1820             : 
    1821           4 :   TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
    1822           2 :   BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
    1823           2 : }
    1824             : 
    1825             : // Move low latencies further from their user without
    1826             : // increasing SGPR usage (in general)
    1827             : // This is to be replaced by a better pass that would
    1828             : // take into account SGPR usage (based on VGPR Usage
    1829             : // and the corresponding wavefront count), that would
    1830             : // try to merge groups of loads if it make sense, etc
    1831           2 : void SIScheduleDAGMI::moveLowLatencies() {
    1832           2 :    unsigned DAGSize = SUnits.size();
    1833             :    int LastLowLatencyUser = -1;
    1834             :    int LastLowLatencyPos = -1;
    1835             : 
    1836          20 :    for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    1837          32 :     SUnit *SU = &SUnits[ScheduledSUnits[i]];
    1838             :     bool IsLowLatencyUser = false;
    1839             :     unsigned MinPos = 0;
    1840             : 
    1841          37 :     for (SDep& PredDep : SU->Preds) {
    1842             :       SUnit *Pred = PredDep.getSUnit();
    1843          21 :       if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
    1844             :         IsLowLatencyUser = true;
    1845             :       }
    1846          21 :       if (Pred->NodeNum >= DAGSize)
    1847             :         continue;
    1848          21 :       unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
    1849          21 :       if (PredPos >= MinPos)
    1850          10 :         MinPos = PredPos + 1;
    1851             :     }
    1852             : 
    1853          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1854           2 :       unsigned BestPos = LastLowLatencyUser + 1;
    1855           2 :       if ((int)BestPos <= LastLowLatencyPos)
    1856           1 :         BestPos = LastLowLatencyPos + 1;
    1857           2 :       if (BestPos < MinPos)
    1858             :         BestPos = MinPos;
    1859           2 :       if (BestPos < i) {
    1860          18 :         for (unsigned u = i; u > BestPos; --u) {
    1861          32 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1862          32 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1863             :         }
    1864           2 :         ScheduledSUnits[BestPos] = SU->NodeNum;
    1865           4 :         ScheduledSUnitsInv[SU->NodeNum] = BestPos;
    1866             :       }
    1867           2 :       LastLowLatencyPos = BestPos;
    1868           2 :       if (IsLowLatencyUser)
    1869             :         LastLowLatencyUser = BestPos;
    1870          14 :     } else if (IsLowLatencyUser) {
    1871           0 :       LastLowLatencyUser = i;
    1872             :     // Moves COPY instructions on which depends
    1873             :     // the low latency instructions too.
    1874          28 :     } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
    1875             :       bool CopyForLowLat = false;
    1876          12 :       for (SDep& SuccDep : SU->Succs) {
    1877             :         SUnit *Succ = SuccDep.getSUnit();
    1878           7 :         if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
    1879             :           CopyForLowLat = true;
    1880             :         }
    1881             :       }
    1882           5 :       if (!CopyForLowLat)
    1883             :         continue;
    1884           2 :       if (MinPos < i) {
    1885          19 :         for (unsigned u = i; u > MinPos; --u) {
    1886          34 :           ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
    1887          34 :           ScheduledSUnits[u] = ScheduledSUnits[u-1];
    1888             :         }
    1889           2 :         ScheduledSUnits[MinPos] = SU->NodeNum;
    1890           4 :         ScheduledSUnitsInv[SU->NodeNum] = MinPos;
    1891             :       }
    1892             :     }
    1893             :   }
    1894           2 : }
    1895             : 
    1896           2 : void SIScheduleDAGMI::restoreSULinksLeft() {
    1897          20 :   for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
    1898          16 :     SUnits[i].isScheduled = false;
    1899          32 :     SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
    1900          32 :     SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
    1901          32 :     SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
    1902          48 :     SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
    1903             :   }
    1904           2 : }
    1905             : 
    1906             : // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
    1907             : template<typename _Iterator> void
    1908           5 : SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
    1909             :                                   unsigned &VgprUsage, unsigned &SgprUsage) {
    1910           5 :   VgprUsage = 0;
    1911           5 :   SgprUsage = 0;
    1912          15 :   for (_Iterator RegI = First; RegI != End; ++RegI) {
    1913          10 :     unsigned Reg = *RegI;
    1914             :     // For now only track virtual registers
    1915          10 :     if (!TargetRegisterInfo::isVirtualRegister(Reg))
    1916             :       continue;
    1917          10 :     PSetIterator PSetI = MRI.getPressureSets(Reg);
    1918          26 :     for (; PSetI.isValid(); ++PSetI) {
    1919          16 :       if (*PSetI == VGPRSetID)
    1920           6 :         VgprUsage += PSetI.getWeight();
    1921          10 :       else if (*PSetI == SGPRSetID)
    1922           4 :         SgprUsage += PSetI.getWeight();
    1923             :     }
    1924             :   }
    1925           5 : }
    1926             : 
    1927           2 : void SIScheduleDAGMI::schedule()
    1928             : {
    1929             :   SmallVector<SUnit*, 8> TopRoots, BotRoots;
    1930             :   SIScheduleBlockResult Best, Temp;
    1931             :   LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
    1932             : 
    1933           2 :   buildDAGWithRegPressure();
    1934             :   LLVM_DEBUG(dump());
    1935             : 
    1936           2 :   topologicalSort();
    1937           2 :   findRootsAndBiasEdges(TopRoots, BotRoots);
    1938             :   // We reuse several ScheduleDAGMI and ScheduleDAGMILive
    1939             :   // functions, but to make them happy we must initialize
    1940             :   // the default Scheduler implementation (even if we do not
    1941             :   // run it)
    1942           2 :   SchedImpl->initialize(this);
    1943           2 :   initQueues(TopRoots, BotRoots);
    1944             : 
    1945             :   // Fill some stats to help scheduling.
    1946             : 
    1947           2 :   SUnitsLinksBackup = SUnits;
    1948           2 :   IsLowLatencySU.clear();
    1949           2 :   LowLatencyOffset.clear();
    1950           2 :   IsHighLatencySU.clear();
    1951             : 
    1952           4 :   IsLowLatencySU.resize(SUnits.size(), 0);
    1953           4 :   LowLatencyOffset.resize(SUnits.size(), 0);
    1954           4 :   IsHighLatencySU.resize(SUnits.size(), 0);
    1955             : 
    1956          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    1957          16 :     SUnit *SU = &SUnits[i];
    1958             :     unsigned BaseLatReg;
    1959             :     int64_t OffLatReg;
    1960          16 :     if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
    1961           2 :       IsLowLatencySU[i] = 1;
    1962           2 :       if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
    1963             :                                        TRI))
    1964           4 :         LowLatencyOffset[i] = OffLatReg;
    1965          14 :     } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
    1966           2 :       IsHighLatencySU[i] = 1;
    1967             :   }
    1968             : 
    1969             :   SIScheduler Scheduler(this);
    1970           2 :   Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
    1971             :                                    SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
    1972             : 
    1973             :   // if VGPR usage is extremely high, try other good performing variants
    1974             :   // which could lead to lower VGPR usage
    1975           2 :   if (Best.MaxVGPRUsage > 180) {
    1976             :     static const std::pair<SISchedulerBlockCreatorVariant,
    1977             :                            SISchedulerBlockSchedulerVariant>
    1978             :         Variants[] = {
    1979             :       { LatenciesAlone, BlockRegUsageLatency },
    1980             : //      { LatenciesAlone, BlockRegUsage },
    1981             :       { LatenciesGrouped, BlockLatencyRegUsage },
    1982             : //      { LatenciesGrouped, BlockRegUsageLatency },
    1983             : //      { LatenciesGrouped, BlockRegUsage },
    1984             :       { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    1985             : //      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    1986             : //      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    1987             :     };
    1988           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    1989           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    1990           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    1991             :         Best = Temp;
    1992             :     }
    1993             :   }
    1994             :   // if VGPR usage is still extremely high, we may spill. Try other variants
    1995             :   // which are less performing, but that could lead to lower VGPR usage.
    1996           2 :   if (Best.MaxVGPRUsage > 200) {
    1997             :     static const std::pair<SISchedulerBlockCreatorVariant,
    1998             :                            SISchedulerBlockSchedulerVariant>
    1999             :         Variants[] = {
    2000             : //      { LatenciesAlone, BlockRegUsageLatency },
    2001             :       { LatenciesAlone, BlockRegUsage },
    2002             : //      { LatenciesGrouped, BlockLatencyRegUsage },
    2003             :       { LatenciesGrouped, BlockRegUsageLatency },
    2004             :       { LatenciesGrouped, BlockRegUsage },
    2005             : //      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
    2006             :       { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
    2007             :       { LatenciesAlonePlusConsecutive, BlockRegUsage }
    2008             :     };
    2009           0 :     for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
    2010           0 :       Temp = Scheduler.scheduleVariant(v.first, v.second);
    2011           0 :       if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
    2012             :         Best = Temp;
    2013             :     }
    2014             :   }
    2015             : 
    2016           2 :   ScheduledSUnits = Best.SUs;
    2017           4 :   ScheduledSUnitsInv.resize(SUnits.size());
    2018             : 
    2019          20 :   for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    2020          48 :     ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
    2021             :   }
    2022             : 
    2023           2 :   moveLowLatencies();
    2024             : 
    2025             :   // Tell the outside world about the result of the scheduling.
    2026             : 
    2027             :   assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
    2028             :   TopRPTracker.setPos(CurrentTop);
    2029             : 
    2030             :   for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
    2031          18 :        E = ScheduledSUnits.end(); I != E; ++I) {
    2032          16 :     SUnit *SU = &SUnits[*I];
    2033             : 
    2034          16 :     scheduleMI(SU, true);
    2035             : 
    2036             :     LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
    2037             :                       << *SU->getInstr());
    2038             :   }
    2039             : 
    2040             :   assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
    2041             : 
    2042           2 :   placeDebugValues();
    2043             : 
    2044             :   LLVM_DEBUG({
    2045             :     dbgs() << "*** Final schedule for "
    2046             :            << printMBBReference(*begin()->getParent()) << " ***\n";
    2047             :     dumpSchedule();
    2048             :     dbgs() << '\n';
    2049             :   });
    2050           2 : }

Generated by: LCOV version 1.13