Line data Source code
1 : //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// SI Machine Scheduler interface
12 : //
13 : //===----------------------------------------------------------------------===//
14 :
15 : #include "SIMachineScheduler.h"
16 : #include "AMDGPU.h"
17 : #include "SIInstrInfo.h"
18 : #include "SIRegisterInfo.h"
19 : #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 : #include "llvm/ADT/STLExtras.h"
21 : #include "llvm/ADT/SmallVector.h"
22 : #include "llvm/CodeGen/LiveInterval.h"
23 : #include "llvm/CodeGen/LiveIntervals.h"
24 : #include "llvm/CodeGen/MachineInstr.h"
25 : #include "llvm/CodeGen/MachineRegisterInfo.h"
26 : #include "llvm/CodeGen/MachineScheduler.h"
27 : #include "llvm/CodeGen/RegisterPressure.h"
28 : #include "llvm/CodeGen/SlotIndexes.h"
29 : #include "llvm/CodeGen/TargetRegisterInfo.h"
30 : #include "llvm/Support/Debug.h"
31 : #include "llvm/Support/ErrorHandling.h"
32 : #include "llvm/Support/raw_ostream.h"
33 : #include <algorithm>
34 : #include <cassert>
35 : #include <map>
36 : #include <set>
37 : #include <utility>
38 : #include <vector>
39 :
40 : using namespace llvm;
41 :
42 : #define DEBUG_TYPE "machine-scheduler"
43 :
44 : // This scheduler implements a different scheduling algorithm than
45 : // GenericScheduler.
46 : //
47 : // There are several specific architecture behaviours that can't be modelled
48 : // for GenericScheduler:
49 : // . When accessing the result of an SGPR load instruction, you have to wait
50 : // for all the SGPR load instructions before your current instruction to
51 : // have finished.
52 : // . When accessing the result of an VGPR load instruction, you have to wait
53 : // for all the VGPR load instructions previous to the VGPR load instruction
54 : // you are interested in to finish.
55 : // . The less the register pressure, the best load latencies are hidden
56 : //
// Moreover some specificities (like the fact that a lot of instructions in the
// shader have few dependencies) make the generic scheduler have some unpredictable
59 : // behaviours. For example when register pressure becomes high, it can either
60 : // manage to prevent register pressure from going too high, or it can
61 : // increase register pressure even more than if it hadn't taken register
62 : // pressure into account.
63 : //
64 : // Also some other bad behaviours are generated, like loading at the beginning
65 : // of the shader a constant in VGPR you won't need until the end of the shader.
66 : //
67 : // The scheduling problem for SI can distinguish three main parts:
68 : // . Hiding high latencies (texture sampling, etc)
69 : // . Hiding low latencies (SGPR constant loading, etc)
70 : // . Keeping register usage low for better latency hiding and general
71 : // performance
72 : //
73 : // Some other things can also affect performance, but are hard to predict
74 : // (cache usage, the fact the HW can issue several instructions from different
// wavefronts of different types, etc)
76 : //
77 : // This scheduler tries to solve the scheduling problem by dividing it into
78 : // simpler sub-problems. It divides the instructions into blocks, schedules
79 : // locally inside the blocks where it takes care of low latencies, and then
80 : // chooses the order of the blocks by taking care of high latencies.
81 : // Dividing the instructions into blocks helps control keeping register
82 : // usage low.
83 : //
84 : // First the instructions are put into blocks.
85 : // We want the blocks help control register usage and hide high latencies
86 : // later. To help control register usage, we typically want all local
// computations, when for example you create a result that can be consumed
88 : // right away, to be contained in a block. Block inputs and outputs would
89 : // typically be important results that are needed in several locations of
90 : // the shader. Since we do want blocks to help hide high latencies, we want
91 : // the instructions inside the block to have a minimal set of dependencies
92 : // on high latencies. It will make it easy to pick blocks to hide specific
93 : // high latencies.
94 : // The block creation algorithm is divided into several steps, and several
95 : // variants can be tried during the scheduling process.
96 : //
97 : // Second the order of the instructions inside the blocks is chosen.
98 : // At that step we do take into account only register usage and hiding
99 : // low latency instructions
100 : //
101 : // Third the block order is chosen, there we try to hide high latencies
102 : // and keep register usage low.
103 : //
104 : // After the third step, a pass is done to improve the hiding of low
105 : // latencies.
106 : //
107 : // Actually when talking about 'low latency' or 'high latency' it includes
108 : // both the latency to get the cache (or global mem) data go to the register,
109 : // and the bandwidth limitations.
110 : // Increasing the number of active wavefronts helps hide the former, but it
111 : // doesn't solve the latter, thus why even if wavefront count is high, we have
// to try to have as many instructions hiding high latencies as possible.
113 : // The OpenCL doc says for example latency of 400 cycles for a global mem access,
114 : // which is hidden by 10 instructions if the wavefront count is 10.
115 :
116 : // Some figures taken from AMD docs:
117 : // Both texture and constant L1 caches are 4-way associative with 64 bytes
118 : // lines.
119 : // Constant cache is shared with 4 CUs.
120 : // For texture sampling, the address generation unit receives 4 texture
121 : // addresses per cycle, thus we could expect texture sampling latency to be
122 : // equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
123 : // instructions in a wavefront group are executed every 4 cycles),
124 : // or 16 instructions if the other wavefronts associated to the 3 other VALUs
125 : // of the CU do texture sampling too. (Don't take these figures too seriously,
126 : // as I'm not 100% sure of the computation)
127 : // Data exports should get similar latency.
// For constant loading, the cache is shared with 4 CUs.
129 : // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
130 : // I guess if the other CU don't read the cache, it can go up to 64B/cycle.
131 : // It means a simple s_buffer_load should take one instruction to hide, as
132 : // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
133 : // cache line.
134 : //
135 : // As of today the driver doesn't preload the constants in cache, thus the
136 : // first loads get extra latency. The doc says global memory access can be
137 : // 300-600 cycles. We do not specially take that into account when scheduling
138 : // As we expect the driver to be able to preload the constants soon.
139 :
140 : // common code //
141 :
142 : #ifndef NDEBUG
143 :
144 : static const char *getReasonStr(SIScheduleCandReason Reason) {
145 : switch (Reason) {
146 : case NoCand: return "NOCAND";
147 : case RegUsage: return "REGUSAGE";
148 : case Latency: return "LATENCY";
149 : case Successor: return "SUCCESSOR";
150 : case Depth: return "DEPTH";
151 : case NodeOrder: return "ORDER";
152 : }
153 : llvm_unreachable("Unknown reason!");
154 : }
155 :
156 : #endif
157 :
158 : namespace llvm {
159 : namespace SISched {
160 0 : static bool tryLess(int TryVal, int CandVal,
161 : SISchedulerCandidate &TryCand,
162 : SISchedulerCandidate &Cand,
163 : SIScheduleCandReason Reason) {
164 0 : if (TryVal < CandVal) {
165 2 : TryCand.Reason = Reason;
166 0 : return true;
167 : }
168 15 : if (TryVal > CandVal) {
169 0 : if (Cand.Reason > Reason)
170 0 : Cand.Reason = Reason;
171 0 : return true;
172 : }
173 : Cand.setRepeat(Reason);
174 0 : return false;
175 : }
176 :
177 0 : static bool tryGreater(int TryVal, int CandVal,
178 : SISchedulerCandidate &TryCand,
179 : SISchedulerCandidate &Cand,
180 : SIScheduleCandReason Reason) {
181 0 : if (TryVal > CandVal) {
182 1 : TryCand.Reason = Reason;
183 0 : return true;
184 : }
185 8 : if (TryVal < CandVal) {
186 0 : if (Cand.Reason > Reason)
187 0 : Cand.Reason = Reason;
188 0 : return true;
189 : }
190 : Cand.setRepeat(Reason);
191 0 : return false;
192 : }
193 : } // end namespace SISched
194 : } // end namespace llvm
195 :
196 : // SIScheduleBlock //
197 :
198 16 : void SIScheduleBlock::addUnit(SUnit *SU) {
199 32 : NodeNum2Index[SU->NodeNum] = SUnits.size();
200 16 : SUnits.push_back(SU);
201 16 : }
202 :
203 : #ifndef NDEBUG
// Debug-only: print the chosen candidate's node number and the reason it won.
void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {

  dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
  dbgs() << '\n';
}
209 : #endif
210 :
// Compare TryCand against the current best Cand for top-down in-block
// scheduling. A win is signalled by setting TryCand.Reason; the caller
// (pickNode) then promotes TryCand via setBest. The criteria are tried
// in strict priority order — the first decisive one ends the comparison.
void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
                                          SISchedCandidate &TryCand) {
  // Initialize the candidate if needed.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return;
  }

  // Relieve SGPR pressure first, but only once it is already high (> 60).
  if (Cand.SGPRUsage > 60 &&
      SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Schedule low latency instructions as top as possible.
  // Order of priority is:
  // . Low latency instructions which do not depend on other low latency
  //   instructions we haven't waited for
  // . Other instructions which do not depend on low latency instructions
  //   we haven't waited for
  // . Low latencies
  // . All other instructions
  // Goal is to get: low latency instructions - independent instructions
  //     - (eventually some more low latency instructions)
  //     - instructions that depend on the first low latency instructions.
  // If in the block there is a lot of constant loads, the SGPR usage
  // could go quite high, thus above the arbitrary limit of 60 will encourage
  // use the already loaded constants (in order to release some SGPRs) before
  // loading more.
  if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
                       Cand.HasLowLatencyNonWaitedParent,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
                          TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  // Among low latency instructions, prefer the smaller offset (accesses
  // closer together are more likely to hit the same cache line).
  if (TryCand.IsLowLatency &&
      SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
                       TryCand, Cand, SIScheduleCandReason::Depth))
    return;

  if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
                       TryCand, Cand, RegUsage))
    return;

  // Fall through to original instruction order.
  if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
    TryCand.Reason = NodeOrder;
  }
}
262 :
263 16 : SUnit* SIScheduleBlock::pickNode() {
264 : SISchedCandidate TopCand;
265 :
266 41 : for (SUnit* SU : TopReadySUs) {
267 : SISchedCandidate TryCand;
268 : std::vector<unsigned> pressure;
269 : std::vector<unsigned> MaxPressure;
270 : // Predict register usage after this instruction.
271 25 : TryCand.SU = SU;
272 25 : TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
273 25 : TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
274 25 : TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
275 25 : TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
276 25 : TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
277 25 : TryCand.HasLowLatencyNonWaitedParent =
278 25 : HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
279 25 : tryCandidateTopDown(TopCand, TryCand);
280 25 : if (TryCand.Reason != NoCand)
281 : TopCand.setBest(TryCand);
282 : }
283 :
284 16 : return TopCand.SU;
285 : }
286 :
287 :
288 : // Schedule something valid.
289 5 : void SIScheduleBlock::fastSchedule() {
290 5 : TopReadySUs.clear();
291 5 : if (Scheduled)
292 0 : undoSchedule();
293 :
294 21 : for (SUnit* SU : SUnits) {
295 16 : if (!SU->NumPredsLeft)
296 10 : TopReadySUs.push_back(SU);
297 : }
298 :
299 21 : while (!TopReadySUs.empty()) {
300 16 : SUnit *SU = TopReadySUs[0];
301 16 : ScheduledSUnits.push_back(SU);
302 16 : nodeScheduled(SU);
303 : }
304 :
305 5 : Scheduled = true;
306 5 : }
307 :
308 : // Returns if the register was set between first and last.
309 8 : static bool isDefBetween(unsigned Reg,
310 : SlotIndex First, SlotIndex Last,
311 : const MachineRegisterInfo *MRI,
312 : const LiveIntervals *LIS) {
313 : for (MachineRegisterInfo::def_instr_iterator
314 8 : UI = MRI->def_instr_begin(Reg),
315 9 : UE = MRI->def_instr_end(); UI != UE; ++UI) {
316 : const MachineInstr* MI = &*UI;
317 9 : if (MI->isDebugValue())
318 : continue;
319 9 : SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
320 9 : if (InstSlot >= First && InstSlot <= Last)
321 : return true;
322 : }
323 : return false;
324 : }
325 :
// Compute the block's live-in/live-out virtual registers and pressures by
// replaying the (fast-)scheduled instructions through a RegPressureTracker,
// and prime TopRPTracker for the real top-down scheduling pass.
void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
                                      MachineBasicBlock::iterator EndBlock) {
  IntervalPressure Pressure, BotPressure;
  RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
  LiveIntervals *LIS = DAG->getLIS();
  MachineRegisterInfo *MRI = DAG->getMRI();
  DAG->initRPTracker(TopRPTracker);
  DAG->initRPTracker(BotRPTracker);
  DAG->initRPTracker(RPTracker);

  // Goes through all SU. RPTracker captures what had to be alive for the SUs
  // to execute, and what is still alive at the end.
  for (SUnit* SU : ScheduledSUnits) {
    RPTracker.setPos(SU->getInstr());
    RPTracker.advance();
  }

  // Close the RPTracker to finalize live ins/outs.
  RPTracker.closeRegion();

  // Initialize the live ins and live outs.
  TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
  BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);

  // Do not Track Physical Registers, because it messes up.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
    if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
      LiveInRegs.insert(RegMaskPair.RegUnit);
  }
  LiveOutRegs.clear();
  // There are several possibilities to distinguish:
  // 1) Reg is not input to any instruction in the block, but is output of one
  // 2) 1) + read in the block and not needed after it
  // 3) 1) + read in the block but needed in another block
  // 4) Reg is input of an instruction but another block will read it too
  // 5) Reg is input of an instruction and then rewritten in the block.
  //    result is not read in the block (implies used in another block)
  // 6) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block and not needed in another block
  // 7) Reg is input of an instruction and then rewritten in the block.
  //    result is read in the block but also needed in another block
  // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
  // We want LiveOutRegs to contain only Regs whose content will be read after
  // in another block, and whose content was written in the current block,
  // that is we want it to get 1, 3, 5, 7
  // Since we made the MIs of a block to be packed all together before
  // scheduling, then the LiveIntervals were correct, and the RPTracker was
  // able to correctly handle 5 vs 6, 2 vs 3.
  // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
  // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
  // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
  // The use of findDefBetween removes the case 4.
  for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
    unsigned Reg = RegMaskPair.RegUnit;
    if (TargetRegisterInfo::isVirtualRegister(Reg) &&
        isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
                     LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
                     LIS)) {
      LiveOutRegs.insert(Reg);
    }
  }

  // Pressure = sum_alive_registers register size
  // Internally llvm will represent some registers as big 128 bits registers
  // for example, but they actually correspond to 4 actual 32 bits registers.
  // Thus Pressure is not equal to num_alive_registers * constant.
  LiveInPressure = TopPressure.MaxSetPressure;
  LiveOutPressure = BotPressure.MaxSetPressure;

  // Prepares TopRPTracker for top down scheduling.
  TopRPTracker.closeTop();
}
398 :
// Schedule the block for real: first fastSchedule/initRegPressure to get
// live-ins/outs, then undo and re-schedule picking nodes heuristically
// with pickNode() while tracking register pressure top-down.
void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
                               MachineBasicBlock::iterator EndBlock) {
  if (!Scheduled)
    fastSchedule();

  // PreScheduling phase to set LiveIn and LiveOut.
  initRegPressure(BeginBlock, EndBlock);
  undoSchedule();

  // Schedule for real now.

  TopReadySUs.clear();

  // Seed the ready list with nodes that have no remaining predecessors.
  for (SUnit* SU : SUnits) {
    if (!SU->NumPredsLeft)
      TopReadySUs.push_back(SU);
  }

  while (!TopReadySUs.empty()) {
    SUnit *SU = pickNode();
    ScheduledSUnits.push_back(SU);
    // Advance the pressure tracker past the chosen instruction.
    TopRPTracker.setPos(SU->getInstr());
    TopRPTracker.advance();
    nodeScheduled(SU);
  }

  // TODO: compute InternalAdditionnalPressure.
  InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size());

  // Check everything is right.
#ifndef NDEBUG
  assert(SUnits.size() == ScheduledSUnits.size() &&
         TopReadySUs.empty());
  for (SUnit* SU : SUnits) {
    assert(SU->isScheduled &&
           SU->NumPredsLeft == 0);
  }
#endif

  Scheduled = true;
}
440 :
441 5 : void SIScheduleBlock::undoSchedule() {
442 21 : for (SUnit* SU : SUnits) {
443 16 : SU->isScheduled = false;
444 40 : for (SDep& Succ : SU->Succs) {
445 48 : if (BC->isSUInBlock(Succ.getSUnit(), ID))
446 12 : undoReleaseSucc(SU, &Succ);
447 : }
448 : }
449 5 : HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
450 : ScheduledSUnits.clear();
451 5 : Scheduled = false;
452 5 : }
453 :
454 12 : void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
455 : SUnit *SuccSU = SuccEdge->getSUnit();
456 :
457 : if (SuccEdge->isWeak()) {
458 0 : ++SuccSU->WeakPredsLeft;
459 0 : return;
460 : }
461 12 : ++SuccSU->NumPredsLeft;
462 : }
463 :
// Decrement the predecessor count of the successor on edge SuccEdge
// (weak edges use WeakPredsLeft instead). Releasing a node whose count
// is already zero is a scheduling bug and aborts in debug builds.
void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
  SUnit *SuccSU = SuccEdge->getSUnit();

  if (SuccEdge->isWeak()) {
    --SuccSU->WeakPredsLeft;
    return;
  }
#ifndef NDEBUG
  if (SuccSU->NumPredsLeft == 0) {
    dbgs() << "*** Scheduling failed! ***\n";
    DAG->dumpNode(*SuccSU);
    dbgs() << " has been released too many times!\n";
    llvm_unreachable(nullptr);
  }
#endif

  --SuccSU->NumPredsLeft;
}
482 :
483 : /// Release Successors of the SU that are in the block or not.
484 48 : void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
485 120 : for (SDep& Succ : SU->Succs) {
486 72 : SUnit *SuccSU = Succ.getSUnit();
487 :
488 144 : if (SuccSU->NodeNum >= DAG->SUnits.size())
489 39 : continue;
490 :
491 63 : if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
492 : continue;
493 :
494 33 : releaseSucc(SU, &Succ);
495 33 : if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
496 12 : TopReadySUs.push_back(SuccSU);
497 : }
498 48 : }
499 :
500 32 : void SIScheduleBlock::nodeScheduled(SUnit *SU) {
501 : // Is in TopReadySUs
502 : assert (!SU->NumPredsLeft);
503 : std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
504 32 : if (I == TopReadySUs.end()) {
505 0 : dbgs() << "Data Structure Bug in SI Scheduler\n";
506 0 : llvm_unreachable(nullptr);
507 : }
508 32 : TopReadySUs.erase(I);
509 :
510 32 : releaseSuccessors(SU, true);
511 : // Scheduling this node will trigger a wait,
512 : // thus propagate to other instructions that they do not need to wait either.
513 32 : if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
514 0 : HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
515 :
516 64 : if (DAG->IsLowLatencySU[SU->NodeNum]) {
517 8 : for (SDep& Succ : SU->Succs) {
518 : std::map<unsigned, unsigned>::iterator I =
519 : NodeNum2Index.find(Succ.getSUnit()->NodeNum);
520 4 : if (I != NodeNum2Index.end())
521 0 : HasLowLatencyNonWaitedParent[I->second] = 1;
522 : }
523 : }
524 32 : SU->isScheduled = true;
525 32 : }
526 :
527 5 : void SIScheduleBlock::finalizeUnits() {
528 : // We remove links from outside blocks to enable scheduling inside the block.
529 21 : for (SUnit* SU : SUnits) {
530 16 : releaseSuccessors(SU, false);
531 32 : if (DAG->IsHighLatencySU[SU->NodeNum])
532 1 : HighLatencyBlock = true;
533 : }
534 10 : HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
535 5 : }
536 :
537 : // we maintain ascending order of IDs
538 9 : void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
539 9 : unsigned PredID = Pred->getID();
540 :
541 : // Check if not already predecessor.
542 10 : for (SIScheduleBlock* P : Preds) {
543 6 : if (PredID == P->getID())
544 : return;
545 : }
546 4 : Preds.push_back(Pred);
547 :
548 : assert(none_of(Succs,
549 : [=](std::pair<SIScheduleBlock*,
550 : SIScheduleBlockLinkKind> S) {
551 : return PredID == S.first->getID();
552 : }) &&
553 : "Loop in the Block Graph!");
554 : }
555 :
556 9 : void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
557 : SIScheduleBlockLinkKind Kind) {
558 9 : unsigned SuccID = Succ->getID();
559 :
560 : // Check if not already predecessor.
561 10 : for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
562 6 : if (SuccID == S.first->getID()) {
563 5 : if (S.second == SIScheduleBlockLinkKind::NoData &&
564 : Kind == SIScheduleBlockLinkKind::Data)
565 0 : S.second = Kind;
566 : return;
567 : }
568 : }
569 4 : if (Succ->isHighLatencyBlock())
570 0 : ++NumHighLatencySuccessors;
571 4 : Succs.push_back(std::make_pair(Succ, Kind));
572 :
573 : assert(none_of(Preds,
574 : [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
575 : "Loop in the Block Graph!");
576 : }
577 :
578 : #ifndef NDEBUG
579 : void SIScheduleBlock::printDebug(bool full) {
580 : dbgs() << "Block (" << ID << ")\n";
581 : if (!full)
582 : return;
583 :
584 : dbgs() << "\nContains High Latency Instruction: "
585 : << HighLatencyBlock << '\n';
586 : dbgs() << "\nDepends On:\n";
587 : for (SIScheduleBlock* P : Preds) {
588 : P->printDebug(false);
589 : }
590 :
591 : dbgs() << "\nSuccessors:\n";
592 : for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
593 : if (S.second == SIScheduleBlockLinkKind::Data)
594 : dbgs() << "(Data Dep) ";
595 : S.first->printDebug(false);
596 : }
597 :
598 : if (Scheduled) {
599 : dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
600 : << LiveInPressure[DAG->getVGPRSetID()] << '\n';
601 : dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
602 : << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
603 : dbgs() << "LiveIns:\n";
604 : for (unsigned Reg : LiveInRegs)
605 : dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
606 :
607 : dbgs() << "\nLiveOuts:\n";
608 : for (unsigned Reg : LiveOutRegs)
609 : dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
610 : }
611 :
612 : dbgs() << "\nInstructions:\n";
613 : if (!Scheduled) {
614 : for (const SUnit* SU : SUnits)
615 : DAG->dumpNode(*SU);
616 : } else {
617 : for (const SUnit* SU : SUnits)
618 : DAG->dumpNode(*SU);
619 : }
620 :
621 : dbgs() << "///////////////////////\n";
622 : }
623 : #endif
624 :
625 : // SIScheduleBlockCreator //
626 :
// The creator only keeps a back-pointer to the DAG; block variants are
// built lazily in getBlocks().
SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) :
  DAG(DAG) {
}

SIScheduleBlockCreator::~SIScheduleBlockCreator() = default;
632 :
633 : SIScheduleBlocks
634 2 : SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
635 : std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
636 : Blocks.find(BlockVariant);
637 2 : if (B == Blocks.end()) {
638 2 : SIScheduleBlocks Res;
639 2 : createBlocksForVariant(BlockVariant);
640 2 : topologicalSort();
641 2 : scheduleInsideBlocks();
642 2 : fillStats();
643 2 : Res.Blocks = CurrentBlocks;
644 2 : Res.TopDownIndex2Block = TopDownIndex2Block;
645 2 : Res.TopDownBlock2Index = TopDownBlock2Index;
646 2 : Blocks[BlockVariant] = Res;
647 2 : return Res;
648 : } else {
649 0 : return B->second;
650 : }
651 : }
652 :
653 87 : bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
654 174 : if (SU->NodeNum >= DAG->SUnits.size())
655 : return false;
656 252 : return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
657 : }
658 :
659 2 : void SIScheduleBlockCreator::colorHighLatenciesAlone() {
660 4 : unsigned DAGSize = DAG->SUnits.size();
661 :
662 18 : for (unsigned i = 0, e = DAGSize; i != e; ++i) {
663 16 : SUnit *SU = &DAG->SUnits[i];
664 32 : if (DAG->IsHighLatencySU[SU->NodeNum]) {
665 2 : CurrentColoring[SU->NodeNum] = NextReservedID++;
666 : }
667 : }
668 2 : }
669 :
670 : static bool
671 : hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
672 0 : for (const auto &PredDep : SU.Preds) {
673 0 : if (PredDep.getSUnit() == &FromSU &&
674 : PredDep.getKind() == llvm::SDep::Data)
675 : return true;
676 : }
677 : return false;
678 : }
679 :
680 0 : void SIScheduleBlockCreator::colorHighLatenciesGroups() {
681 0 : unsigned DAGSize = DAG->SUnits.size();
682 : unsigned NumHighLatencies = 0;
683 : unsigned GroupSize;
684 0 : int Color = NextReservedID;
685 : unsigned Count = 0;
686 : std::set<unsigned> FormingGroup;
687 :
688 0 : for (unsigned i = 0, e = DAGSize; i != e; ++i) {
689 0 : SUnit *SU = &DAG->SUnits[i];
690 0 : if (DAG->IsHighLatencySU[SU->NodeNum])
691 0 : ++NumHighLatencies;
692 : }
693 :
694 0 : if (NumHighLatencies == 0)
695 : return;
696 :
697 0 : if (NumHighLatencies <= 6)
698 : GroupSize = 2;
699 0 : else if (NumHighLatencies <= 12)
700 : GroupSize = 3;
701 : else
702 : GroupSize = 4;
703 :
704 0 : for (unsigned SUNum : DAG->TopDownIndex2SU) {
705 0 : const SUnit &SU = DAG->SUnits[SUNum];
706 0 : if (DAG->IsHighLatencySU[SU.NodeNum]) {
707 : unsigned CompatibleGroup = true;
708 : int ProposedColor = Color;
709 : std::vector<int> AdditionalElements;
710 :
711 : // We don't want to put in the same block
712 : // two high latency instructions that depend
713 : // on each other.
714 : // One way would be to check canAddEdge
715 : // in both directions, but that currently is not
716 : // enough because there the high latency order is
717 : // enforced (via links).
718 : // Instead, look at the dependencies between the
719 : // high latency instructions and deduce if it is
720 : // a data dependency or not.
721 0 : for (unsigned j : FormingGroup) {
722 : bool HasSubGraph;
723 : std::vector<int> SubGraph;
724 : // By construction (topological order), if SU and
725 : // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
726 : // in the parent graph of SU.
727 : #ifndef NDEBUG
728 : SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
729 : HasSubGraph);
730 : assert(!HasSubGraph);
731 : #endif
732 0 : SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
733 : HasSubGraph);
734 0 : if (!HasSubGraph)
735 : continue; // No dependencies between each other
736 0 : else if (SubGraph.size() > 5) {
737 : // Too many elements would be required to be added to the block.
738 : CompatibleGroup = false;
739 : break;
740 : }
741 : else {
742 : // Check the type of dependency
743 0 : for (unsigned k : SubGraph) {
744 : // If in the path to join the two instructions,
745 : // there is another high latency instruction,
746 : // or instructions colored for another block
747 : // abort the merge.
748 0 : if (DAG->IsHighLatencySU[k] ||
749 0 : (CurrentColoring[k] != ProposedColor &&
750 0 : CurrentColoring[k] != 0)) {
751 : CompatibleGroup = false;
752 : break;
753 : }
754 : // If one of the SU in the subgraph depends on the result of SU j,
755 : // there'll be a data dependency.
756 0 : if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
757 : CompatibleGroup = false;
758 : break;
759 : }
760 : }
761 0 : if (!CompatibleGroup)
762 : break;
763 : // Same check for the SU
764 0 : if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
765 : CompatibleGroup = false;
766 : break;
767 : }
768 : // Add all the required instructions to the block
769 : // These cannot live in another block (because they
770 : // depend (order dependency) on one of the
771 : // instruction in the block, and are required for the
772 : // high latency instruction we add.
773 : AdditionalElements.insert(AdditionalElements.end(),
774 0 : SubGraph.begin(), SubGraph.end());
775 : }
776 : }
777 0 : if (CompatibleGroup) {
778 0 : FormingGroup.insert(SU.NodeNum);
779 0 : for (unsigned j : AdditionalElements)
780 0 : CurrentColoring[j] = ProposedColor;
781 0 : CurrentColoring[SU.NodeNum] = ProposedColor;
782 0 : ++Count;
783 : }
784 : // Found one incompatible instruction,
785 : // or has filled a big enough group.
786 : // -> start a new one.
787 0 : if (!CompatibleGroup) {
788 : FormingGroup.clear();
789 0 : Color = ++NextReservedID;
790 : ProposedColor = Color;
791 0 : FormingGroup.insert(SU.NodeNum);
792 0 : CurrentColoring[SU.NodeNum] = ProposedColor;
793 : Count = 0;
794 0 : } else if (Count == GroupSize) {
795 : FormingGroup.clear();
796 0 : Color = ++NextReservedID;
797 : ProposedColor = Color;
798 : Count = 0;
799 : }
800 : }
801 : }
802 : }
803 :
// Color SUnits according to the combination of reserved (high latency)
// colors they transitively depend on, in both directions: top-down
// (predecessors) into CurrentTopDownReservedDependencyColoring, and
// bottom-up (successors) into CurrentBottomUpReservedDependencyColoring.
// Color 0 means "no reserved dependency"; each distinct set of inherited
// colors gets a fresh NextNonReservedID.
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved colors inherited from (strong) predecessors.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
          CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved colors inherited from (strong) successors.
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
897 :
898 2 : void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
899 4 : unsigned DAGSize = DAG->SUnits.size();
900 : std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
901 :
902 : // Every combination of colors given by the top down
903 : // and bottom up Reserved node dependency
904 :
905 18 : for (unsigned i = 0, e = DAGSize; i != e; ++i) {
906 16 : SUnit *SU = &DAG->SUnits[i];
907 16 : std::pair<unsigned, unsigned> SUColors;
908 :
909 : // High latency instructions: already given.
910 32 : if (CurrentColoring[SU->NodeNum])
911 1 : continue;
912 :
913 15 : SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum];
914 30 : SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum];
915 :
916 : std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
917 : ColorCombinations.find(SUColors);
918 15 : if (Pos != ColorCombinations.end()) {
919 24 : CurrentColoring[SU->NodeNum] = Pos->second;
920 : } else {
921 3 : CurrentColoring[SU->NodeNum] = NextNonReservedID;
922 3 : ColorCombinations[SUColors] = NextNonReservedID++;
923 : }
924 : }
925 2 : }
926 :
// Recolor the "ends" of the graph: SUs that neither depend on nor feed a
// reserved instruction are either merged into their unique successor
// block or pushed into a fresh color, using a pending copy of the
// coloring so the traversal itself is not affected by the changes.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Work on a copy so that decisions in this pass do not influence the
  // successor colors read later in the same pass.
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;
    std::set<unsigned> SUColorsPending;

    // Colors <= DAGSize are reserved; leave those SUs alone.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only SUs with no reserved dependency in either direction are
    // candidates for recoloring.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // SUColors: current colors of successors that do touch reserved
      // instructions; SUColorsPending: pending colors of all successors.
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  CurrentColoring = PendingColoring;
}
974 :
975 :
// Force each non-reserved color to occupy one consecutive run of SUs in
// NodeNum order: when a color reappears after other colors were seen in
// between, the reappearing SU is either merged with its immediate
// predecessor's color or given a brand-new color.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  // Colors whose (first) run has already ended.
  std::set<unsigned> SeenColors;

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Save the predecessor's color before it is overwritten below; the
    // merge decision at the bottom needs the pre-update value.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color change ends the previous color's run.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never rewritten.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only act when this color's run already ended earlier (i.e. the
    // color is non-consecutive).
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
1008 :
1009 2 : void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
1010 4 : unsigned DAGSize = DAG->SUnits.size();
1011 :
1012 18 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1013 16 : SUnit *SU = &DAG->SUnits[SUNum];
1014 : std::set<unsigned> SUColors;
1015 :
1016 32 : if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1017 : continue;
1018 :
1019 : // No predecessor: Vgpr constant loading.
1020 : // Low latency instructions usually have a predecessor (the address)
1021 15 : if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
1022 : continue;
1023 :
1024 16 : for (SDep& SuccDep : SU->Succs) {
1025 : SUnit *Succ = SuccDep.getSUnit();
1026 9 : if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1027 : continue;
1028 14 : SUColors.insert(CurrentColoring[Succ->NodeNum]);
1029 : }
1030 7 : if (SUColors.size() == 1)
1031 8 : CurrentColoring[SU->NodeNum] = *SUColors.begin();
1032 : }
1033 2 : }
1034 :
1035 0 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1036 0 : unsigned DAGSize = DAG->SUnits.size();
1037 :
1038 0 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1039 0 : SUnit *SU = &DAG->SUnits[SUNum];
1040 : std::set<unsigned> SUColors;
1041 :
1042 0 : if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1043 : continue;
1044 :
1045 0 : for (SDep& SuccDep : SU->Succs) {
1046 : SUnit *Succ = SuccDep.getSUnit();
1047 0 : if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1048 : continue;
1049 0 : SUColors.insert(CurrentColoring[Succ->NodeNum]);
1050 : }
1051 0 : if (SUColors.size() == 1)
1052 0 : CurrentColoring[SU->NodeNum] = *SUColors.begin();
1053 : }
1054 0 : }
1055 :
1056 2 : void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1057 4 : unsigned DAGSize = DAG->SUnits.size();
1058 :
1059 18 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1060 16 : SUnit *SU = &DAG->SUnits[SUNum];
1061 : std::set<unsigned> SUColors;
1062 :
1063 32 : if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1064 : continue;
1065 :
1066 36 : for (SDep& SuccDep : SU->Succs) {
1067 : SUnit *Succ = SuccDep.getSUnit();
1068 21 : if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1069 : continue;
1070 36 : SUColors.insert(CurrentColoring[Succ->NodeNum]);
1071 : }
1072 15 : if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
1073 0 : CurrentColoring[SU->NodeNum] = *SUColors.begin();
1074 : }
1075 2 : }
1076 :
1077 0 : void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1078 0 : unsigned DAGSize = DAG->SUnits.size();
1079 : std::map<unsigned, unsigned> ColorCount;
1080 :
1081 0 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1082 0 : SUnit *SU = &DAG->SUnits[SUNum];
1083 0 : unsigned color = CurrentColoring[SU->NodeNum];
1084 0 : ++ColorCount[color];
1085 : }
1086 :
1087 0 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1088 0 : SUnit *SU = &DAG->SUnits[SUNum];
1089 0 : unsigned color = CurrentColoring[SU->NodeNum];
1090 : std::set<unsigned> SUColors;
1091 :
1092 0 : if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1093 : continue;
1094 :
1095 0 : if (ColorCount[color] > 1)
1096 : continue;
1097 :
1098 0 : for (SDep& SuccDep : SU->Succs) {
1099 : SUnit *Succ = SuccDep.getSUnit();
1100 0 : if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1101 : continue;
1102 0 : SUColors.insert(CurrentColoring[Succ->NodeNum]);
1103 : }
1104 0 : if (SUColors.size() == 1 && *SUColors.begin() != color) {
1105 0 : --ColorCount[color];
1106 0 : CurrentColoring[SU->NodeNum] = *SUColors.begin();
1107 0 : ++ColorCount[*SUColors.begin()];
1108 : }
1109 : }
1110 0 : }
1111 :
// Placeholder: splitting overly large blocks into smaller ones is not
// implemented yet.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
1115 :
1116 2 : void SIScheduleBlockCreator::regroupNoUserInstructions() {
1117 2 : unsigned DAGSize = DAG->SUnits.size();
1118 2 : int GroupID = NextNonReservedID++;
1119 :
1120 18 : for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1121 16 : SUnit *SU = &DAG->SUnits[SUNum];
1122 : bool hasSuccessor = false;
1123 :
1124 32 : if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1125 : continue;
1126 :
1127 36 : for (SDep& SuccDep : SU->Succs) {
1128 : SUnit *Succ = SuccDep.getSUnit();
1129 21 : if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1130 3 : continue;
1131 : hasSuccessor = true;
1132 : }
1133 15 : if (!hasSuccessor)
1134 4 : CurrentColoring[SU->NodeNum] = GroupID;
1135 : }
1136 2 : }
1137 :
// Group all EXP instructions into a single color so they form one block,
// but only if no non-EXP instruction would have to be pulled into that
// block to satisfy dependencies; otherwise leave all exports uncolored.
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register than used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // Check the EXP can be added to the group safely,
      // ie without needing any other instruction.
      // The EXP is allowed to depend on other EXP
      // (they will be in the same group).
      for (unsigned j : ExpGroup) {
        bool HasSubGraph;
        std::vector<int> SubGraph;
        // By construction (topological order), if SU and
        // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary
        // in the parent graph of SU.
#ifndef NDEBUG
        // Sanity check: the reverse direction must never have a path,
        // since j was visited before SU in topological order.
        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
                                               HasSubGraph);
        assert(!HasSubGraph);
#endif
        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
                                               HasSubGraph);
        if (!HasSubGraph)
          continue; // No dependencies between each other

        // SubGraph contains all the instructions required
        // between EXP SUnits[j] and EXP SU.
        for (unsigned k : SubGraph) {
          if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr()))
            // Other instructions than EXP would be required in the group.
            // Abort the groupping.
            return;
        }
      }

      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1193 :
// Entry point of block creation: run the coloring passes selected by
// \p BlockVariant, then turn each resulting color into a SIScheduleBlock,
// wire up inter-block dependencies, and finalize the blocks.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a color to the ID of the block created for it.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved colors are [1, DAGSize]; non-reserved start after DAGSize.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    // First time we see this color: create its block.
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only cross-block edges become block successors; control-only
      // edges are marked NoData.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->finalizeUnits();
  }
  LLVM_DEBUG(dbgs() << "Blocks created:\n\n";
             for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
               SIScheduleBlock *Block = CurrentBlocks[i];
               Block->printDebug(true);
             });
}
1272 :
1273 : // Two functions taken from Codegen/MachineScheduler.cpp
1274 :
1275 : /// Non-const version.
1276 : static MachineBasicBlock::iterator
1277 12 : nextIfDebug(MachineBasicBlock::iterator I,
1278 : MachineBasicBlock::const_iterator End) {
1279 12 : for (; I != End; ++I) {
1280 : if (!I->isDebugInstr())
1281 : break;
1282 : }
1283 12 : return I;
1284 : }
1285 :
// Topologically sort the blocks with a Kahn-style worklist, processing
// leaves (blocks with no successors) first and assigning decreasing
// indices, so TopDownIndex2Block/TopDownBlock2Index give a valid
// top-down order and BottomUpIndex2Block its reverse.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the number of unprocessed
  // successors of each block; it is overwritten with the final index
  // when the block is popped from the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Indices are handed out from DAGSize-1 downwards, so leaves get the
  // highest top-down indices.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      // A predecessor becomes ready once all its successors are placed.
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  // Bottom-up order is simply the reverse of the top-down order.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1333 :
// Schedule the instructions inside every block: first a fast schedule to
// make each block a contiguous range of MIs (moving instructions and
// updating LiveIntervals), then the real per-block schedule, and finally
// restore the original MI order in the basic block.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // PosOld/PosNew record, per moved SU, where it was and where it went,
  // so the original order can be restored below.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  // Lay out the blocks in top-down topological order so each block's
  // instructions become contiguous in the instruction stream.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance past it (and any debug instrs).
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Walk the recorded positions in reverse so each move lands correctly.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->printDebug(true);
  });
}
1414 :
1415 2 : void SIScheduleBlockCreator::fillStats() {
1416 4 : unsigned DAGSize = CurrentBlocks.size();
1417 :
1418 7 : for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1419 5 : int BlockIndice = TopDownIndex2Block[i];
1420 10 : SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1421 5 : if (Block->getPreds().empty())
1422 2 : Block->Depth = 0;
1423 : else {
1424 : unsigned Depth = 0;
1425 7 : for (SIScheduleBlock *Pred : Block->getPreds()) {
1426 8 : if (Depth < Pred->Depth + Pred->getCost())
1427 : Depth = Pred->Depth + Pred->getCost();
1428 : }
1429 3 : Block->Depth = Depth;
1430 : }
1431 : }
1432 :
1433 7 : for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1434 5 : int BlockIndice = BottomUpIndex2Block[i];
1435 10 : SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1436 5 : if (Block->getSuccs().empty())
1437 2 : Block->Height = 0;
1438 : else {
1439 3 : unsigned Height = 0;
1440 7 : for (const auto &Succ : Block->getSuccs())
1441 11 : Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1442 3 : Block->Height = Height;
1443 : }
1444 : }
1445 2 : }
1446 :
1447 : // SIScheduleBlockScheduler //
1448 :
// Build the block scheduler state (live registers, per-block output
// usage counts, ready list) and immediately schedule all blocks by
// repeatedly picking the best ready block.
SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
                                                   SISchedulerBlockSchedulerVariant Variant,
                                                   SIScheduleBlocks  BlocksStruct) :
  DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
  LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
  SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {

  // Fill the usage of every output
  // Warning: while by construction we always have a link between two blocks
  // when one needs a result from the other, the number of users of an output
  // is not the sum of child blocks having as input the same virtual register.
  // Here is an example. A produces x and y. B eats x and produces x'.
  // C eats x' and y. The register coalescer may have attributed the same
  // virtual register to x and x'.
  // To count accurately, we do a topological sort. In case the register is
  // found for several parents, we increment the usage of the one with the
  // highest topological index.
  LiveOutRegsNumUsages.resize(Blocks.size());
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      int topoInd = -1;
      // Find the producing predecessor with the highest topological
      // index (the "latest" producer of Reg).
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
            topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
          }
        }
      }

      // No predecessor produces Reg: it is live-in to the region and
      // handled separately below.
      if (!Found)
        continue;

      int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
      ++LiveOutRegsNumUsages[PredID][Reg];
    }
  }

  LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
  BlockNumPredsLeft.resize(Blocks.size());
  BlockNumSuccsLeft.resize(Blocks.size());

  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    BlockNumPredsLeft[i] = Block->getPreds().size();
    BlockNumSuccsLeft[i] = Block->getSuccs().size();
  }

#ifndef NDEBUG
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    assert(Block->getID() == i);
  }
#endif

  // Registers live-in to the scheduling region are live from the start.
  std::set<unsigned> InRegs = DAG->getInRegs();
  addLiveRegs(InRegs);

  // Increase LiveOutRegsNumUsages for blocks
  // producing registers consumed in another
  // scheduling region.
  for (unsigned Reg : DAG->getOutRegs()) {
    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
      // Do reverse traversal
      // (so the last producer, in topological order, gets the count).
      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
      SIScheduleBlock *Block = Blocks[ID];
      const std::set<unsigned> &OutRegs = Block->getOutRegs();

      if (OutRegs.find(Reg) == OutRegs.end())
        continue;

      ++LiveOutRegsNumUsages[ID][Reg];
      break;
    }
  }

  // Fill LiveRegsConsumers for regs that were already
  // defined before scheduling.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    for (unsigned Reg : Block->getInRegs()) {
      bool Found = false;
      for (SIScheduleBlock* Pred: Block->getPreds()) {
        std::set<unsigned> PredOutRegs = Pred->getOutRegs();
        std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);

        if (RegPos != PredOutRegs.end()) {
          Found = true;
          break;
        }
      }

      if (!Found)
        ++LiveRegsConsumers[Reg];
    }
  }

  // Seed the ready list with blocks that have no predecessors.
  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
    SIScheduleBlock *Block = Blocks[i];
    if (BlockNumPredsLeft[i] == 0) {
      ReadyBlocks.push_back(Block);
    }
  }

  // Main loop: pick and schedule blocks until none are ready.
  while (SIScheduleBlock *Block = pickBlock()) {
    BlocksScheduled.push_back(Block);
    blockScheduled(Block);
  }

  LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Block
                                            : BlocksScheduled) {
    dbgs() << ' ' << Block->getID();
  } dbgs() << '\n';);
}
1568 :
// Latency-oriented candidate comparison. Returns true if \p TryCand is
// preferred over \p Cand (setting TryCand.Reason). The checks are
// ordered by priority; the first decisive one wins.
bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
                                                   SIBlockSchedCandidate &TryCand) {
  // No current candidate: take TryCand by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // Try to hide high latencies.
  if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
                       Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
    return true;
  // Schedule high latencies early so you can hide them better.
  if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
                          TryCand, Cand, Latency))
    return true;
  // Among high latency candidates, prefer the one with the longer
  // remaining path (Height).
  if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
                                                   TryCand, Cand, Depth))
    return true;
  // Prefer blocks unlocking more high latency successors.
  if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
                          Cand.NumHighLatencySuccessors,
                          TryCand, Cand, Successor))
    return true;
  return false;
}
1593 :
// Register-pressure-oriented candidate comparison. Returns true if
// \p TryCand is preferred over \p Cand (setting TryCand.Reason).
// Checks are ordered by priority; the first decisive one wins.
bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
                                                    SIBlockSchedCandidate &TryCand) {
  // No current candidate: take TryCand by default.
  if (!Cand.isValid()) {
    TryCand.Reason = NodeOrder;
    return true;
  }

  // First, prefer candidates that do not increase VGPR usage at all.
  if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
                       TryCand, Cand, RegUsage))
    return true;
  // Prefer blocks that have successors (their outputs get consumed).
  if (SISched::tryGreater(TryCand.NumSuccessors > 0,
                          Cand.NumSuccessors > 0,
                          TryCand, Cand, Successor))
    return true;
  // Prefer the longer remaining path.
  if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
    return true;
  // Finally, prefer the smaller VGPR usage increase.
  if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
                       TryCand, Cand, RegUsage))
    return true;
  return false;
}
1615 :
// Pick the next block to schedule from ReadyBlocks, or return nullptr
// when none is left. Updates the current/max VGPR and SGPR usage
// statistics and removes the chosen block from the ready list.
SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
  SIBlockSchedCandidate Cand;
  std::vector<SIScheduleBlock*>::iterator Best;
  SIScheduleBlock *Block;
  if (ReadyBlocks.empty())
    return nullptr;

  // Recompute current register usage from the live register set.
  DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
                        VregCurrentUsage, SregCurrentUsage);
  if (VregCurrentUsage > maxVregUsage)
    maxVregUsage = VregCurrentUsage;
  if (SregCurrentUsage > maxSregUsage)
    maxSregUsage = SregCurrentUsage;
  LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";
             for (SIScheduleBlock *Block
                  : ReadyBlocks) dbgs()
             << Block->getID() << ' ';
             dbgs() << "\nCurrent Live:\n";
             for (unsigned Reg
                  : LiveRegs) dbgs()
             << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
             dbgs() << '\n';
             dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';
             dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';);

  Cand.Block = nullptr;
  // Evaluate every ready block and keep the best candidate.
  for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
       E = ReadyBlocks.end(); I != E; ++I) {
    SIBlockSchedCandidate TryCand;
    TryCand.Block = *I;
    TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
    TryCand.VGPRUsageDiff =
      checkRegUsageImpact(TryCand.Block->getInRegs(),
                          TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
    TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
    TryCand.NumHighLatencySuccessors =
      TryCand.Block->getNumHighLatencySuccessors();
    // How many blocks ago the last (unwaited) high latency parent of
    // this block was scheduled; clamped at 0.
    TryCand.LastPosHighLatParentScheduled =
      (unsigned int) std::max<int> (0,
         LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
           LastPosWaitedHighLatency);
    TryCand.Height = TryCand.Block->Height;
    // Try not to increase VGPR usage too much, else we may spill.
    if (VregCurrentUsage > 120 ||
        Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
      // Register pressure first; latency only as a tie-breaker.
      if (!tryCandidateRegUsage(Cand, TryCand) &&
          Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
        tryCandidateLatency(Cand, TryCand);
    } else {
      // Latency first; register pressure as a tie-breaker.
      if (!tryCandidateLatency(Cand, TryCand))
        tryCandidateRegUsage(Cand, TryCand);
    }
    if (TryCand.Reason != NoCand) {
      Cand.setBest(TryCand);
      Best = I;
      LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' '
                        << getReasonStr(Cand.Reason) << '\n');
    }
  }

  LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';
             dbgs() << "Is a block with high latency instruction: "
                    << (Cand.IsHighLatency ? "yes\n" : "no\n");
             dbgs() << "Position of last high latency dependency: "
                    << Cand.LastPosHighLatParentScheduled << '\n';
             dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';
             dbgs() << '\n';);

  Block = Cand.Block;
  ReadyBlocks.erase(Best);
  return Block;
}
1688 :
1689 : // Tracking of currently alive registers to determine VGPR Usage.
1690 :
1691 7 : void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1692 26 : for (unsigned Reg : Regs) {
1693 : // For now only track virtual registers.
1694 19 : if (!TargetRegisterInfo::isVirtualRegister(Reg))
1695 : continue;
1696 : // If not already in the live set, then add it.
1697 : (void) LiveRegs.insert(Reg);
1698 : }
1699 7 : }
1700 :
1701 5 : void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1702 : std::set<unsigned> &Regs) {
1703 13 : for (unsigned Reg : Regs) {
1704 : // For now only track virtual registers.
1705 : std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1706 : assert (Pos != LiveRegs.end() && // Reg must be live.
1707 : LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&
1708 : LiveRegsConsumers[Reg] >= 1);
1709 8 : --LiveRegsConsumers[Reg];
1710 8 : if (LiveRegsConsumers[Reg] == 0)
1711 : LiveRegs.erase(Pos);
1712 : }
1713 5 : }
1714 :
1715 5 : void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1716 9 : for (const auto &Block : Parent->getSuccs()) {
1717 8 : if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1718 3 : ReadyBlocks.push_back(Block.first);
1719 :
1720 4 : if (Parent->isHighLatencyBlock() &&
1721 2 : Block.second == SIScheduleBlockLinkKind::Data)
1722 2 : LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1723 : }
1724 5 : }
1725 :
1726 5 : void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1727 5 : decreaseLiveRegs(Block, Block->getInRegs());
1728 5 : addLiveRegs(Block->getOutRegs());
1729 5 : releaseBlockSuccs(Block);
1730 : for (std::map<unsigned, unsigned>::iterator RegI =
1731 5 : LiveOutRegsNumUsages[Block->getID()].begin(),
1732 13 : E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
1733 : std::pair<unsigned, unsigned> RegP = *RegI;
1734 : // We produce this register, thus it must not be previously alive.
1735 : assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
1736 : LiveRegsConsumers[RegP.first] == 0);
1737 8 : LiveRegsConsumers[RegP.first] += RegP.second;
1738 : }
1739 5 : if (LastPosHighLatencyParentScheduled[Block->getID()] >
1740 5 : (unsigned)LastPosWaitedHighLatency)
1741 0 : LastPosWaitedHighLatency =
1742 : LastPosHighLatencyParentScheduled[Block->getID()];
1743 5 : ++NumBlockScheduled;
1744 5 : }
1745 :
1746 : std::vector<int>
1747 5 : SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1748 : std::set<unsigned> &OutRegs) {
1749 : std::vector<int> DiffSetPressure;
1750 5 : DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1751 :
1752 13 : for (unsigned Reg : InRegs) {
1753 : // For now only track virtual registers.
1754 8 : if (!TargetRegisterInfo::isVirtualRegister(Reg))
1755 : continue;
1756 8 : if (LiveRegsConsumers[Reg] > 1)
1757 : continue;
1758 8 : PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1759 20 : for (; PSetI.isValid(); ++PSetI) {
1760 24 : DiffSetPressure[*PSetI] -= PSetI.getWeight();
1761 : }
1762 : }
1763 :
1764 13 : for (unsigned Reg : OutRegs) {
1765 : // For now only track virtual registers.
1766 8 : if (!TargetRegisterInfo::isVirtualRegister(Reg))
1767 : continue;
1768 8 : PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1769 20 : for (; PSetI.isValid(); ++PSetI) {
1770 24 : DiffSetPressure[*PSetI] += PSetI.getWeight();
1771 : }
1772 : }
1773 :
1774 5 : return DiffSetPressure;
1775 : }
1776 :
1777 : // SIScheduler //
1778 :
1779 : struct SIScheduleBlockResult
1780 2 : SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1781 : SISchedulerBlockSchedulerVariant ScheduleVariant) {
1782 4 : SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1783 6 : SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1784 : std::vector<SIScheduleBlock*> ScheduledBlocks;
1785 : struct SIScheduleBlockResult Res;
1786 :
1787 2 : ScheduledBlocks = Scheduler.getBlocks();
1788 :
1789 9 : for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) {
1790 5 : SIScheduleBlock *Block = ScheduledBlocks[b];
1791 : std::vector<SUnit*> SUs = Block->getScheduledUnits();
1792 :
1793 21 : for (SUnit* SU : SUs)
1794 16 : Res.SUs.push_back(SU->NodeNum);
1795 : }
1796 :
1797 2 : Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1798 2 : Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1799 2 : return Res;
1800 : }
1801 :
1802 : // SIScheduleDAGMI //
1803 :
// Build the SI scheduling DAG. A default GenericScheduler is created
// because the inherited ScheduleDAGMILive machinery expects one, even
// though this pass drives the scheduling itself (see schedule()).
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
  ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
  // Cache the target-specific instruction/register info views of the
  // base-class TII/TRI pointers to avoid repeated casts later.
  SITII = static_cast<const SIInstrInfo*>(TII);
  SITRI = static_cast<const SIRegisterInfo*>(TRI);

  // Pressure-set IDs used to split register usage into VGPR/SGPR parts.
  VGPRSetID = SITRI->getVGPRPressureSet();
  SGPRSetID = SITRI->getSGPRPressureSet();
}
1812 :
// Defaulted out of line (kept in the .cpp rather than the header).
SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1814 :
1815 : // Code adapted from scheduleDAG.cpp
1816 : // Does a topological sort over the SUs.
1817 : // Both TopDown and BottomUp
1818 2 : void SIScheduleDAGMI::topologicalSort() {
1819 2 : Topo.InitDAGTopologicalSorting();
1820 :
1821 4 : TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1822 2 : BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1823 2 : }
1824 :
1825 : // Move low latencies further from their user without
1826 : // increasing SGPR usage (in general)
1827 : // This is to be replaced by a better pass that would
1828 : // take into account SGPR usage (based on VGPR Usage
1829 : // and the corresponding wavefront count), that would
1830 : // try to merge groups of loads if it make sense, etc
// Move low latencies further from their user without
// increasing SGPR usage (in general)
// This is to be replaced by a better pass that would
// take into account SGPR usage (based on VGPR Usage
// and the corresponding wavefront count), that would
// try to merge groups of loads if it make sense, etc
void SIScheduleDAGMI::moveLowLatencies() {
  unsigned DAGSize = SUnits.size();
  // Position of the last instruction seen that consumes a low latency
  // result, and of the last low latency instruction placed.
  int LastLowLatencyUser = -1;
  int LastLowLatencyPos = -1;

  // Walk the current schedule order; ScheduledSUnits maps position ->
  // NodeNum and ScheduledSUnitsInv is its inverse. Both are kept in sync
  // whenever an instruction is moved.
  for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[ScheduledSUnits[i]];
    bool IsLowLatencyUser = false;
    // Earliest legal position for SU: one past its latest predecessor.
    unsigned MinPos = 0;

    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
        IsLowLatencyUser = true;
      }
      // Boundary/EntrySU-style nodes have NodeNum >= DAGSize and no slot
      // in ScheduledSUnitsInv; skip them.
      if (Pred->NodeNum >= DAGSize)
        continue;
      unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
      if (PredPos >= MinPos)
        MinPos = PredPos + 1;
    }

    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      // Hoist the low latency instruction as early as legal: after the
      // last user of a low latency result, after the previous low latency
      // instruction, and never before its own predecessors.
      unsigned BestPos = LastLowLatencyUser + 1;
      if ((int)BestPos <= LastLowLatencyPos)
        BestPos = LastLowLatencyPos + 1;
      if (BestPos < MinPos)
        BestPos = MinPos;
      if (BestPos < i) {
        // Rotate [BestPos, i): shift every instruction one slot later and
        // fix up the inverse map, then drop SU into the freed slot.
        for (unsigned u = i; u > BestPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[BestPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = BestPos;
      }
      LastLowLatencyPos = BestPos;
      if (IsLowLatencyUser)
        LastLowLatencyUser = BestPos;
    } else if (IsLowLatencyUser) {
      LastLowLatencyUser = i;
    // Moves COPY instructions on which depends
    // the low latency instructions too.
    } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
      bool CopyForLowLat = false;
      for (SDep& SuccDep : SU->Succs) {
        SUnit *Succ = SuccDep.getSUnit();
        if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
          CopyForLowLat = true;
        }
      }
      if (!CopyForLowLat)
        continue;
      if (MinPos < i) {
        // Same rotation as above, but only up to the earliest legal slot.
        for (unsigned u = i; u > MinPos; --u) {
          ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
          ScheduledSUnits[u] = ScheduledSUnits[u-1];
        }
        ScheduledSUnits[MinPos] = SU->NodeNum;
        ScheduledSUnitsInv[SU->NodeNum] = MinPos;
      }
    }
  }
}
1895 :
1896 2 : void SIScheduleDAGMI::restoreSULinksLeft() {
1897 20 : for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1898 16 : SUnits[i].isScheduled = false;
1899 32 : SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1900 32 : SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1901 32 : SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1902 48 : SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1903 : }
1904 2 : }
1905 :
1906 : // Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1907 : template<typename _Iterator> void
1908 5 : SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1909 : unsigned &VgprUsage, unsigned &SgprUsage) {
1910 5 : VgprUsage = 0;
1911 5 : SgprUsage = 0;
1912 15 : for (_Iterator RegI = First; RegI != End; ++RegI) {
1913 10 : unsigned Reg = *RegI;
1914 : // For now only track virtual registers
1915 10 : if (!TargetRegisterInfo::isVirtualRegister(Reg))
1916 : continue;
1917 10 : PSetIterator PSetI = MRI.getPressureSets(Reg);
1918 26 : for (; PSetI.isValid(); ++PSetI) {
1919 16 : if (*PSetI == VGPRSetID)
1920 6 : VgprUsage += PSetI.getWeight();
1921 10 : else if (*PSetI == SGPRSetID)
1922 4 : SgprUsage += PSetI.getWeight();
1923 : }
1924 : }
1925 5 : }
1926 :
// Main entry point: build the DAG, try one or more block-scheduling
// variants, pick the best result (lowest VGPR usage when pressure is
// high) and emit the chosen instruction order.
void SIScheduleDAGMI::schedule()
{
  SmallVector<SUnit*, 8> TopRoots, BotRoots;
  SIScheduleBlockResult Best, Temp;
  LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");

  buildDAGWithRegPressure();
  LLVM_DEBUG(dump());

  topologicalSort();
  findRootsAndBiasEdges(TopRoots, BotRoots);
  // We reuse several ScheduleDAGMI and ScheduleDAGMILive
  // functions, but to make them happy we must initialize
  // the default Scheduler implementation (even if we do not
  // run it)
  SchedImpl->initialize(this);
  initQueues(TopRoots, BotRoots);

  // Fill some stats to help scheduling.

  // Backup of the pred/succ counters, restored between variants
  // (see restoreSULinksLeft).
  SUnitsLinksBackup = SUnits;
  IsLowLatencySU.clear();
  LowLatencyOffset.clear();
  IsHighLatencySU.clear();

  IsLowLatencySU.resize(SUnits.size(), 0);
  LowLatencyOffset.resize(SUnits.size(), 0);
  IsHighLatencySU.resize(SUnits.size(), 0);

  // Classify each SU as low/high latency and record the memory offset of
  // low latency loads when it can be determined.
  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    SUnit *SU = &SUnits[i];
    unsigned BaseLatReg;
    int64_t OffLatReg;
    if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
      IsLowLatencySU[i] = 1;
      if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg,
                                       TRI))
        LowLatencyOffset[i] = OffLatReg;
    } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
      IsHighLatencySU[i] = 1;
  }

  SIScheduler Scheduler(this);
  // Start with the variant that usually gives the best performance.
  Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
                                   SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);

  // if VGPR usage is extremely high, try other good performing variants
  // which could lead to lower VGPR usage
  if (Best.MaxVGPRUsage > 180) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
      { LatenciesAlone, BlockRegUsageLatency },
//      { LatenciesAlone, BlockRegUsage },
      { LatenciesGrouped, BlockLatencyRegUsage },
//      { LatenciesGrouped, BlockRegUsageLatency },
//      { LatenciesGrouped, BlockRegUsage },
      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
//      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }
  // if VGPR usage is still extremely high, we may spill. Try other variants
  // which are less performing, but that could lead to lower VGPR usage.
  if (Best.MaxVGPRUsage > 200) {
    static const std::pair<SISchedulerBlockCreatorVariant,
                           SISchedulerBlockSchedulerVariant>
        Variants[] = {
//      { LatenciesAlone, BlockRegUsageLatency },
      { LatenciesAlone, BlockRegUsage },
//      { LatenciesGrouped, BlockLatencyRegUsage },
      { LatenciesGrouped, BlockRegUsageLatency },
      { LatenciesGrouped, BlockRegUsage },
//      { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
      { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
      { LatenciesAlonePlusConsecutive, BlockRegUsage }
    };
    for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
      Temp = Scheduler.scheduleVariant(v.first, v.second);
      if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
        Best = Temp;
    }
  }

  // Materialize the winning order and its inverse (position of each SU).
  ScheduledSUnits = Best.SUs;
  ScheduledSUnitsInv.resize(SUnits.size());

  for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
    ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
  }

  // Post-pass: hoist low latency instructions away from their users.
  moveLowLatencies();

  // Tell the outside world about the result of the scheduling.

  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
  TopRPTracker.setPos(CurrentTop);

  // Emit the instructions in the final chosen order.
  for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(),
       E = ScheduledSUnits.end(); I != E; ++I) {
    SUnit *SU = &SUnits[*I];

    scheduleMI(SU, true);

    LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "
                      << *SU->getInstr());
  }

  assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");

  placeDebugValues();

  LLVM_DEBUG({
    dbgs() << "*** Final schedule for "
           << printMBBReference(*begin()->getParent()) << " ***\n";
    dumpSchedule();
    dbgs() << '\n';
  });
}
|