Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
Warning: line 770, column 9
Value stored to 'ProposedColor' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name SIMachineScheduler.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem 
/usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
1//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Machine Scheduler interface
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIMachineScheduler.h"
15#include "SIInstrInfo.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "llvm/CodeGen/LiveIntervals.h"
18#include "llvm/CodeGen/MachineRegisterInfo.h"
19
20using namespace llvm;
21
// Debug tag used by LLVM_DEBUG/dbgs() filtering for this pass.
// (The report rendering had expanded the macro into its own definition,
// duplicating the string literal; the real definition is a single literal.)
#define DEBUG_TYPE "machine-scheduler"
23
24// This scheduler implements a different scheduling algorithm than
25// GenericScheduler.
26//
27// There are several specific architecture behaviours that can't be modelled
28// for GenericScheduler:
29// . When accessing the result of an SGPR load instruction, you have to wait
30// for all the SGPR load instructions before your current instruction to
31// have finished.
 32// . When accessing the result of a VGPR load instruction, you have to wait
33// for all the VGPR load instructions previous to the VGPR load instruction
34// you are interested in to finish.
 35// . The lower the register pressure, the better load latencies are hidden
36//
 37// Moreover some specificities (like the fact a lot of instructions in the shader
38// have few dependencies) makes the generic scheduler have some unpredictable
39// behaviours. For example when register pressure becomes high, it can either
40// manage to prevent register pressure from going too high, or it can
41// increase register pressure even more than if it hadn't taken register
42// pressure into account.
43//
44// Also some other bad behaviours are generated, like loading at the beginning
45// of the shader a constant in VGPR you won't need until the end of the shader.
46//
47// The scheduling problem for SI can distinguish three main parts:
48// . Hiding high latencies (texture sampling, etc)
49// . Hiding low latencies (SGPR constant loading, etc)
50// . Keeping register usage low for better latency hiding and general
51// performance
52//
53// Some other things can also affect performance, but are hard to predict
54// (cache usage, the fact the HW can issue several instructions from different
55// wavefronts if different types, etc)
56//
57// This scheduler tries to solve the scheduling problem by dividing it into
58// simpler sub-problems. It divides the instructions into blocks, schedules
59// locally inside the blocks where it takes care of low latencies, and then
60// chooses the order of the blocks by taking care of high latencies.
61// Dividing the instructions into blocks helps control keeping register
62// usage low.
63//
64// First the instructions are put into blocks.
65// We want the blocks help control register usage and hide high latencies
66// later. To help control register usage, we typically want all local
67// computations, when for example you create a result that can be consumed
68// right away, to be contained in a block. Block inputs and outputs would
69// typically be important results that are needed in several locations of
70// the shader. Since we do want blocks to help hide high latencies, we want
71// the instructions inside the block to have a minimal set of dependencies
72// on high latencies. It will make it easy to pick blocks to hide specific
73// high latencies.
74// The block creation algorithm is divided into several steps, and several
75// variants can be tried during the scheduling process.
76//
77// Second the order of the instructions inside the blocks is chosen.
78// At that step we do take into account only register usage and hiding
79// low latency instructions
80//
81// Third the block order is chosen, there we try to hide high latencies
82// and keep register usage low.
83//
84// After the third step, a pass is done to improve the hiding of low
85// latencies.
86//
87// Actually when talking about 'low latency' or 'high latency' it includes
88// both the latency to get the cache (or global mem) data go to the register,
89// and the bandwidth limitations.
90// Increasing the number of active wavefronts helps hide the former, but it
91// doesn't solve the latter, thus why even if wavefront count is high, we have
92// to try have as many instructions hiding high latencies as possible.
93// The OpenCL doc says for example latency of 400 cycles for a global mem
94// access, which is hidden by 10 instructions if the wavefront count is 10.
95
96// Some figures taken from AMD docs:
97// Both texture and constant L1 caches are 4-way associative with 64 bytes
98// lines.
99// Constant cache is shared with 4 CUs.
100// For texture sampling, the address generation unit receives 4 texture
101// addresses per cycle, thus we could expect texture sampling latency to be
102// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
103// instructions in a wavefront group are executed every 4 cycles),
104// or 16 instructions if the other wavefronts associated to the 3 other VALUs
105// of the CU do texture sampling too. (Don't take these figures too seriously,
106// as I'm not 100% sure of the computation)
107// Data exports should get similar latency.
 108// For constant loading, the cache is shared with 4 CUs.
109// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
110// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
111// It means a simple s_buffer_load should take one instruction to hide, as
112// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
113// cache line.
114//
115// As of today the driver doesn't preload the constants in cache, thus the
116// first loads get extra latency. The doc says global memory access can be
117// 300-600 cycles. We do not specially take that into account when scheduling
118// As we expect the driver to be able to preload the constants soon.
119
120// common code //
121
122#ifndef NDEBUG
123
124static const char *getReasonStr(SIScheduleCandReason Reason) {
125 switch (Reason) {
126 case NoCand: return "NOCAND";
127 case RegUsage: return "REGUSAGE";
128 case Latency: return "LATENCY";
129 case Successor: return "SUCCESSOR";
130 case Depth: return "DEPTH";
131 case NodeOrder: return "ORDER";
132 }
133 llvm_unreachable("Unknown reason!")::llvm::llvm_unreachable_internal("Unknown reason!", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 133)
;
134}
135
136#endif
137
138namespace llvm {
139namespace SISched {
140static bool tryLess(int TryVal, int CandVal,
141 SISchedulerCandidate &TryCand,
142 SISchedulerCandidate &Cand,
143 SIScheduleCandReason Reason) {
144 if (TryVal < CandVal) {
145 TryCand.Reason = Reason;
146 return true;
147 }
148 if (TryVal > CandVal) {
149 if (Cand.Reason > Reason)
150 Cand.Reason = Reason;
151 return true;
152 }
153 Cand.setRepeat(Reason);
154 return false;
155}
156
157static bool tryGreater(int TryVal, int CandVal,
158 SISchedulerCandidate &TryCand,
159 SISchedulerCandidate &Cand,
160 SIScheduleCandReason Reason) {
161 if (TryVal > CandVal) {
162 TryCand.Reason = Reason;
163 return true;
164 }
165 if (TryVal < CandVal) {
166 if (Cand.Reason > Reason)
167 Cand.Reason = Reason;
168 return true;
169 }
170 Cand.setRepeat(Reason);
171 return false;
172}
173} // end namespace SISched
174} // end namespace llvm
175
176// SIScheduleBlock //
177
178void SIScheduleBlock::addUnit(SUnit *SU) {
179 NodeNum2Index[SU->NodeNum] = SUnits.size();
180 SUnits.push_back(SU);
181}
182
183#ifndef NDEBUG
184void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
185
186 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
187 dbgs() << '\n';
188}
189#endif
190
191void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
192 SISchedCandidate &TryCand) {
193 // Initialize the candidate if needed.
194 if (!Cand.isValid()) {
195 TryCand.Reason = NodeOrder;
196 return;
197 }
198
199 if (Cand.SGPRUsage > 60 &&
200 SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
201 TryCand, Cand, RegUsage))
202 return;
203
204 // Schedule low latency instructions as top as possible.
205 // Order of priority is:
206 // . Low latency instructions which do not depend on other low latency
207 // instructions we haven't waited for
208 // . Other instructions which do not depend on low latency instructions
209 // we haven't waited for
210 // . Low latencies
211 // . All other instructions
212 // Goal is to get: low latency instructions - independent instructions
213 // - (eventually some more low latency instructions)
214 // - instructions that depend on the first low latency instructions.
215 // If in the block there is a lot of constant loads, the SGPR usage
216 // could go quite high, thus above the arbitrary limit of 60 will encourage
217 // use the already loaded constants (in order to release some SGPRs) before
218 // loading more.
219 if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
220 Cand.HasLowLatencyNonWaitedParent,
221 TryCand, Cand, SIScheduleCandReason::Depth))
222 return;
223
224 if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
225 TryCand, Cand, SIScheduleCandReason::Depth))
226 return;
227
228 if (TryCand.IsLowLatency &&
229 SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
230 TryCand, Cand, SIScheduleCandReason::Depth))
231 return;
232
233 if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
234 TryCand, Cand, RegUsage))
235 return;
236
237 // Fall through to original instruction order.
238 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
239 TryCand.Reason = NodeOrder;
240 }
241}
242
243SUnit* SIScheduleBlock::pickNode() {
244 SISchedCandidate TopCand;
245
246 for (SUnit* SU : TopReadySUs) {
247 SISchedCandidate TryCand;
248 std::vector<unsigned> pressure;
249 std::vector<unsigned> MaxPressure;
250 // Predict register usage after this instruction.
251 TryCand.SU = SU;
252 TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
253 TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
254 TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
255 TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
256 TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
257 TryCand.HasLowLatencyNonWaitedParent =
258 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
259 tryCandidateTopDown(TopCand, TryCand);
260 if (TryCand.Reason != NoCand)
261 TopCand.setBest(TryCand);
262 }
263
264 return TopCand.SU;
265}
266
267
268// Schedule something valid.
269void SIScheduleBlock::fastSchedule() {
270 TopReadySUs.clear();
271 if (Scheduled)
272 undoSchedule();
273
274 for (SUnit* SU : SUnits) {
275 if (!SU->NumPredsLeft)
276 TopReadySUs.push_back(SU);
277 }
278
279 while (!TopReadySUs.empty()) {
280 SUnit *SU = TopReadySUs[0];
281 ScheduledSUnits.push_back(SU);
282 nodeScheduled(SU);
283 }
284
285 Scheduled = true;
286}
287
288// Returns if the register was set between first and last.
289static bool isDefBetween(unsigned Reg,
290 SlotIndex First, SlotIndex Last,
291 const MachineRegisterInfo *MRI,
292 const LiveIntervals *LIS) {
293 for (MachineRegisterInfo::def_instr_iterator
294 UI = MRI->def_instr_begin(Reg),
295 UE = MRI->def_instr_end(); UI != UE; ++UI) {
296 const MachineInstr* MI = &*UI;
297 if (MI->isDebugValue())
298 continue;
299 SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
300 if (InstSlot >= First && InstSlot <= Last)
301 return true;
302 }
303 return false;
304}
305
306void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
307 MachineBasicBlock::iterator EndBlock) {
308 IntervalPressure Pressure, BotPressure;
309 RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
310 LiveIntervals *LIS = DAG->getLIS();
311 MachineRegisterInfo *MRI = DAG->getMRI();
312 DAG->initRPTracker(TopRPTracker);
313 DAG->initRPTracker(BotRPTracker);
314 DAG->initRPTracker(RPTracker);
315
316 // Goes though all SU. RPTracker captures what had to be alive for the SUs
317 // to execute, and what is still alive at the end.
318 for (SUnit* SU : ScheduledSUnits) {
319 RPTracker.setPos(SU->getInstr());
320 RPTracker.advance();
321 }
322
323 // Close the RPTracker to finalize live ins/outs.
324 RPTracker.closeRegion();
325
326 // Initialize the live ins and live outs.
327 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
328 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
329
330 // Do not Track Physical Registers, because it messes up.
331 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
332 if (RegMaskPair.RegUnit.isVirtual())
333 LiveInRegs.insert(RegMaskPair.RegUnit);
334 }
335 LiveOutRegs.clear();
336 // There is several possibilities to distinguish:
337 // 1) Reg is not input to any instruction in the block, but is output of one
338 // 2) 1) + read in the block and not needed after it
339 // 3) 1) + read in the block but needed in another block
340 // 4) Reg is input of an instruction but another block will read it too
341 // 5) Reg is input of an instruction and then rewritten in the block.
342 // result is not read in the block (implies used in another block)
343 // 6) Reg is input of an instruction and then rewritten in the block.
344 // result is read in the block and not needed in another block
345 // 7) Reg is input of an instruction and then rewritten in the block.
346 // result is read in the block but also needed in another block
347 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
348 // We want LiveOutRegs to contain only Regs whose content will be read after
349 // in another block, and whose content was written in the current block,
350 // that is we want it to get 1, 3, 5, 7
351 // Since we made the MIs of a block to be packed all together before
352 // scheduling, then the LiveIntervals were correct, and the RPTracker was
353 // able to correctly handle 5 vs 6, 2 vs 3.
354 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
355 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
356 // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
357 // The use of findDefBetween removes the case 4.
358 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
359 Register Reg = RegMaskPair.RegUnit;
360 if (Reg.isVirtual() &&
361 isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
362 LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
363 LIS)) {
364 LiveOutRegs.insert(Reg);
365 }
366 }
367
368 // Pressure = sum_alive_registers register size
369 // Internally llvm will represent some registers as big 128 bits registers
370 // for example, but they actually correspond to 4 actual 32 bits registers.
371 // Thus Pressure is not equal to num_alive_registers * constant.
372 LiveInPressure = TopPressure.MaxSetPressure;
373 LiveOutPressure = BotPressure.MaxSetPressure;
374
375 // Prepares TopRPTracker for top down scheduling.
376 TopRPTracker.closeTop();
377}
378
379void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
380 MachineBasicBlock::iterator EndBlock) {
381 if (!Scheduled)
382 fastSchedule();
383
384 // PreScheduling phase to set LiveIn and LiveOut.
385 initRegPressure(BeginBlock, EndBlock);
386 undoSchedule();
387
388 // Schedule for real now.
389
390 TopReadySUs.clear();
391
392 for (SUnit* SU : SUnits) {
393 if (!SU->NumPredsLeft)
394 TopReadySUs.push_back(SU);
395 }
396
397 while (!TopReadySUs.empty()) {
398 SUnit *SU = pickNode();
399 ScheduledSUnits.push_back(SU);
400 TopRPTracker.setPos(SU->getInstr());
401 TopRPTracker.advance();
402 nodeScheduled(SU);
403 }
404
405 // TODO: compute InternalAdditionalPressure.
406 InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());
407
408 // Check everything is right.
409#ifndef NDEBUG
410 assert(SUnits.size() == ScheduledSUnits.size() &&(static_cast <bool> (SUnits.size() == ScheduledSUnits.size
() && TopReadySUs.empty()) ? void (0) : __assert_fail
("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 411, __extension__
__PRETTY_FUNCTION__))
411 TopReadySUs.empty())(static_cast <bool> (SUnits.size() == ScheduledSUnits.size
() && TopReadySUs.empty()) ? void (0) : __assert_fail
("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 411, __extension__
__PRETTY_FUNCTION__))
;
412 for (SUnit* SU : SUnits) {
413 assert(SU->isScheduled &&(static_cast <bool> (SU->isScheduled && SU->
NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 414, __extension__
__PRETTY_FUNCTION__))
414 SU->NumPredsLeft == 0)(static_cast <bool> (SU->isScheduled && SU->
NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 414, __extension__
__PRETTY_FUNCTION__))
;
415 }
416#endif
417
418 Scheduled = true;
419}
420
421void SIScheduleBlock::undoSchedule() {
422 for (SUnit* SU : SUnits) {
423 SU->isScheduled = false;
424 for (SDep& Succ : SU->Succs) {
425 if (BC->isSUInBlock(Succ.getSUnit(), ID))
426 undoReleaseSucc(SU, &Succ);
427 }
428 }
429 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
430 ScheduledSUnits.clear();
431 Scheduled = false;
432}
433
434void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
435 SUnit *SuccSU = SuccEdge->getSUnit();
436
437 if (SuccEdge->isWeak()) {
438 ++SuccSU->WeakPredsLeft;
439 return;
440 }
441 ++SuccSU->NumPredsLeft;
442}
443
444void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
445 SUnit *SuccSU = SuccEdge->getSUnit();
446
447 if (SuccEdge->isWeak()) {
448 --SuccSU->WeakPredsLeft;
449 return;
450 }
451#ifndef NDEBUG
452 if (SuccSU->NumPredsLeft == 0) {
453 dbgs() << "*** Scheduling failed! ***\n";
454 DAG->dumpNode(*SuccSU);
455 dbgs() << " has been released too many times!\n";
456 llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 456)
;
457 }
458#endif
459
460 --SuccSU->NumPredsLeft;
461}
462
463/// Release Successors of the SU that are in the block or not.
464void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
465 for (SDep& Succ : SU->Succs) {
466 SUnit *SuccSU = Succ.getSUnit();
467
468 if (SuccSU->NodeNum >= DAG->SUnits.size())
469 continue;
470
471 if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
472 continue;
473
474 releaseSucc(SU, &Succ);
475 if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
476 TopReadySUs.push_back(SuccSU);
477 }
478}
479
480void SIScheduleBlock::nodeScheduled(SUnit *SU) {
481 // Is in TopReadySUs
482 assert (!SU->NumPredsLeft)(static_cast <bool> (!SU->NumPredsLeft) ? void (0) :
__assert_fail ("!SU->NumPredsLeft", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 482, __extension__ __PRETTY_FUNCTION__))
;
483 std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
484 if (I == TopReadySUs.end()) {
485 dbgs() << "Data Structure Bug in SI Scheduler\n";
486 llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 486)
;
487 }
488 TopReadySUs.erase(I);
489
490 releaseSuccessors(SU, true);
491 // Scheduling this node will trigger a wait,
492 // thus propagate to other instructions that they do not need to wait either.
493 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
494 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
495
496 if (DAG->IsLowLatencySU[SU->NodeNum]) {
497 for (SDep& Succ : SU->Succs) {
498 std::map<unsigned, unsigned>::iterator I =
499 NodeNum2Index.find(Succ.getSUnit()->NodeNum);
500 if (I != NodeNum2Index.end())
501 HasLowLatencyNonWaitedParent[I->second] = 1;
502 }
503 }
504 SU->isScheduled = true;
505}
506
507void SIScheduleBlock::finalizeUnits() {
508 // We remove links from outside blocks to enable scheduling inside the block.
509 for (SUnit* SU : SUnits) {
510 releaseSuccessors(SU, false);
511 if (DAG->IsHighLatencySU[SU->NodeNum])
512 HighLatencyBlock = true;
513 }
514 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
515}
516
517// we maintain ascending order of IDs
518void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
519 unsigned PredID = Pred->getID();
520
521 // Check if not already predecessor.
522 for (SIScheduleBlock* P : Preds) {
523 if (PredID == P->getID())
524 return;
525 }
526 Preds.push_back(Pred);
527
528 assert(none_of(Succs,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
529 [=](std::pair<SIScheduleBlock*,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
530 SIScheduleBlockLinkKind> S) {(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
531 return PredID == S.first->getID();(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
532 }) &&(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
533 "Loop in the Block Graph!")(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
;
534}
535
536void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
537 SIScheduleBlockLinkKind Kind) {
538 unsigned SuccID = Succ->getID();
539
540 // Check if not already predecessor.
541 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
542 if (SuccID == S.first->getID()) {
543 if (S.second == SIScheduleBlockLinkKind::NoData &&
544 Kind == SIScheduleBlockLinkKind::Data)
545 S.second = Kind;
546 return;
547 }
548 }
549 if (Succ->isHighLatencyBlock())
550 ++NumHighLatencySuccessors;
551 Succs.push_back(std::pair(Succ, Kind));
552
553 assert(none_of(Preds,(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
554 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
555 "Loop in the Block Graph!")(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
;
556}
557
558#ifndef NDEBUG
559void SIScheduleBlock::printDebug(bool full) {
560 dbgs() << "Block (" << ID << ")\n";
561 if (!full)
562 return;
563
564 dbgs() << "\nContains High Latency Instruction: "
565 << HighLatencyBlock << '\n';
566 dbgs() << "\nDepends On:\n";
567 for (SIScheduleBlock* P : Preds) {
568 P->printDebug(false);
569 }
570
571 dbgs() << "\nSuccessors:\n";
572 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
573 if (S.second == SIScheduleBlockLinkKind::Data)
574 dbgs() << "(Data Dep) ";
575 S.first->printDebug(false);
576 }
577
578 if (Scheduled) {
579 dbgs() << "LiveInPressure "
580 << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
581 << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
582 dbgs() << "LiveOutPressure "
583 << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
584 << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
585 dbgs() << "LiveIns:\n";
586 for (unsigned Reg : LiveInRegs)
587 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
588
589 dbgs() << "\nLiveOuts:\n";
590 for (unsigned Reg : LiveOutRegs)
591 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
592 }
593
594 dbgs() << "\nInstructions:\n";
595 for (const SUnit* SU : SUnits)
596 DAG->dumpNode(*SU);
597
598 dbgs() << "///////////////////////\n";
599}
600#endif
601
602// SIScheduleBlockCreator //
603
604SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
605 : DAG(DAG) {}
606
607SIScheduleBlocks
608SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
609 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
610 Blocks.find(BlockVariant);
611 if (B == Blocks.end()) {
612 SIScheduleBlocks Res;
613 createBlocksForVariant(BlockVariant);
614 topologicalSort();
615 scheduleInsideBlocks();
616 fillStats();
617 Res.Blocks = CurrentBlocks;
618 Res.TopDownIndex2Block = TopDownIndex2Block;
619 Res.TopDownBlock2Index = TopDownBlock2Index;
620 Blocks[BlockVariant] = Res;
621 return Res;
622 } else {
623 return B->second;
624 }
625}
626
627bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
628 if (SU->NodeNum >= DAG->SUnits.size())
629 return false;
630 return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
631}
632
633void SIScheduleBlockCreator::colorHighLatenciesAlone() {
634 unsigned DAGSize = DAG->SUnits.size();
635
636 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
637 SUnit *SU = &DAG->SUnits[i];
638 if (DAG->IsHighLatencySU[SU->NodeNum]) {
639 CurrentColoring[SU->NodeNum] = NextReservedID++;
640 }
641 }
642}
643
644static bool
645hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
646 for (const auto &PredDep : SU.Preds) {
647 if (PredDep.getSUnit() == &FromSU &&
648 PredDep.getKind() == llvm::SDep::Data)
649 return true;
650 }
651 return false;
652}
653
654void SIScheduleBlockCreator::colorHighLatenciesGroups() {
655 unsigned DAGSize = DAG->SUnits.size();
656 unsigned NumHighLatencies = 0;
657 unsigned GroupSize;
658 int Color = NextReservedID;
659 unsigned Count = 0;
660 std::set<unsigned> FormingGroup;
661
662 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
663 SUnit *SU = &DAG->SUnits[i];
664 if (DAG->IsHighLatencySU[SU->NodeNum])
665 ++NumHighLatencies;
666 }
667
668 if (NumHighLatencies == 0)
669 return;
670
671 if (NumHighLatencies <= 6)
672 GroupSize = 2;
673 else if (NumHighLatencies <= 12)
674 GroupSize = 3;
675 else
676 GroupSize = 4;
677
678 for (unsigned SUNum : DAG->TopDownIndex2SU) {
679 const SUnit &SU = DAG->SUnits[SUNum];
680 if (DAG->IsHighLatencySU[SU.NodeNum]) {
681 unsigned CompatibleGroup = true;
682 int ProposedColor = Color;
683 std::vector<int> AdditionalElements;
684
685 // We don't want to put in the same block
686 // two high latency instructions that depend
687 // on each other.
688 // One way would be to check canAddEdge
689 // in both directions, but that currently is not
690 // enough because there the high latency order is
691 // enforced (via links).
692 // Instead, look at the dependencies between the
693 // high latency instructions and deduce if it is
694 // a data dependency or not.
695 for (unsigned j : FormingGroup) {
696 bool HasSubGraph;
697 std::vector<int> SubGraph;
698 // By construction (topological order), if SU and
699 // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
700 // in the parent graph of SU.
701#ifndef NDEBUG
702 SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
703 HasSubGraph);
704 assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail
("!HasSubGraph", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 704, __extension__ __PRETTY_FUNCTION__))
;
705#endif
706 SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
707 HasSubGraph);
708 if (!HasSubGraph)
709 continue; // No dependencies between each other
710 else if (SubGraph.size() > 5) {
711 // Too many elements would be required to be added to the block.
712 CompatibleGroup = false;
713 break;
714 }
715 else {
716 // Check the type of dependency
717 for (unsigned k : SubGraph) {
718 // If in the path to join the two instructions,
719 // there is another high latency instruction,
720 // or instructions colored for another block
721 // abort the merge.
722 if (DAG->IsHighLatencySU[k] ||
723 (CurrentColoring[k] != ProposedColor &&
724 CurrentColoring[k] != 0)) {
725 CompatibleGroup = false;
726 break;
727 }
728 // If one of the SU in the subgraph depends on the result of SU j,
729 // there'll be a data dependency.
730 if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
731 CompatibleGroup = false;
732 break;
733 }
734 }
735 if (!CompatibleGroup)
736 break;
737 // Same check for the SU
738 if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
739 CompatibleGroup = false;
740 break;
741 }
742 // Add all the required instructions to the block
743 // These cannot live in another block (because they
744 // depend (order dependency) on one of the
745 // instruction in the block, and are required for the
746 // high latency instruction we add.
747 llvm::append_range(AdditionalElements, SubGraph);
748 }
749 }
750 if (CompatibleGroup) {
751 FormingGroup.insert(SU.NodeNum);
752 for (unsigned j : AdditionalElements)
753 CurrentColoring[j] = ProposedColor;
754 CurrentColoring[SU.NodeNum] = ProposedColor;
755 ++Count;
756 }
757 // Found one incompatible instruction,
758 // or has filled a big enough group.
759 // -> start a new one.
760 if (!CompatibleGroup) {
761 FormingGroup.clear();
762 Color = ++NextReservedID;
763 ProposedColor = Color;
764 FormingGroup.insert(SU.NodeNum);
765 CurrentColoring[SU.NodeNum] = ProposedColor;
766 Count = 0;
767 } else if (Count == GroupSize) {
768 FormingGroup.clear();
769 Color = ++NextReservedID;
770 ProposedColor = Color;
Value stored to 'ProposedColor' is never read
771 Count = 0;
772 }
773 }
774 }
775}
776
// Attribute a "reserved dependency" color to every SU in two passes:
// top-down, SUs get the same color when they (transitively) depend on the
// same combination of reserved (already-colored) instructions; bottom-up,
// symmetrically for the instructions that depend on them. Results are
// stored in CurrentTopDownReservedDependencyColoring and
// CurrentBottomUpReservedDependencyColoring; color 0 means "no reserved
// dependency".
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a set of parent (resp. child) colors to the single color
  // attributed to that combination.
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all strong predecessors
    // that belong to this scheduling region.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // Combination of several parent colors: reuse the color already
      // given to this exact combination, or allocate a fresh one.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  // Combinations are per-direction; start fresh for the bottom-up pass.
  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
870
871void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
872 std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
873
874 // Every combination of colors given by the top down
875 // and bottom up Reserved node dependency
876
877 for (const SUnit &SU : DAG->SUnits) {
878 std::pair<unsigned, unsigned> SUColors;
879
880 // High latency instructions: already given.
881 if (CurrentColoring[SU.NodeNum])
882 continue;
883
884 SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
885 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];
886
887 std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
888 ColorCombinations.find(SUColors);
889 if (Pos != ColorCombinations.end()) {
890 CurrentColoring[SU.NodeNum] = Pos->second;
891 } else {
892 CurrentColoring[SU.NodeNum] = NextNonReservedID;
893 ColorCombinations[SUColors] = NextNonReservedID++;
894 }
895 }
896}
897
// Post-process the coloring bottom-up: instructions that carry no
// reserved dependency at all are either merged into their unique
// child block or pulled out into a fresh color, so the "ends" of the
// dependency graph don't all collapse into one block.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Work on a copy so decisions in this pass see the original colors of
  // successors, not colors already rewritten during the same pass.
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;        // Current colors of reserved-dep successors.
    std::set<unsigned> SUColorsPending; // Pending colors of all successors.

    // Reserved colors (<= DAGSize) are left untouched.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only instructions with no reserved dependency in either direction
    // are candidates for this rewrite.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  // Commit the pass's decisions all at once.
  CurrentColoring = PendingColoring;
}
945
946
// Force every non-reserved color to form a single consecutive run in
// NodeNum order: when a color shows up again after its run already
// ended (another color was seen in between), the instruction is either
// attached to the immediately preceding instruction's (possibly
// rewritten) color, or given a brand new one.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors; // Colors whose consecutive run has ended.

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Original color of the previous instruction, saved before updating
    // PreviousColor for the next iteration.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color change closes the previous color's run.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never rewritten.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // The run of this color has not ended yet: nothing to fix.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    // Color reappeared after its run ended.
    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
979
980void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
981 unsigned DAGSize = DAG->SUnits.size();
982
983 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
984 SUnit *SU = &DAG->SUnits[SUNum];
985 std::set<unsigned> SUColors;
986
987 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
988 continue;
989
990 // No predecessor: Vgpr constant loading.
991 // Low latency instructions usually have a predecessor (the address)
992 if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
993 continue;
994
995 for (SDep& SuccDep : SU->Succs) {
996 SUnit *Succ = SuccDep.getSUnit();
997 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
998 continue;
999 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1000 }
1001 if (SUColors.size() == 1)
1002 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1003 }
1004}
1005
1006void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1007 unsigned DAGSize = DAG->SUnits.size();
1008
1009 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1010 SUnit *SU = &DAG->SUnits[SUNum];
1011 std::set<unsigned> SUColors;
1012
1013 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1014 continue;
1015
1016 for (SDep& SuccDep : SU->Succs) {
1017 SUnit *Succ = SuccDep.getSUnit();
1018 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1019 continue;
1020 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1021 }
1022 if (SUColors.size() == 1)
1023 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1024 }
1025}
1026
1027void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1028 unsigned DAGSize = DAG->SUnits.size();
1029
1030 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1031 SUnit *SU = &DAG->SUnits[SUNum];
1032 std::set<unsigned> SUColors;
1033
1034 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1035 continue;
1036
1037 for (SDep& SuccDep : SU->Succs) {
1038 SUnit *Succ = SuccDep.getSUnit();
1039 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1040 continue;
1041 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1042 }
1043 if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
1044 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1045 }
1046}
1047
1048void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1049 unsigned DAGSize = DAG->SUnits.size();
1050 std::map<unsigned, unsigned> ColorCount;
1051
1052 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1053 SUnit *SU = &DAG->SUnits[SUNum];
1054 unsigned color = CurrentColoring[SU->NodeNum];
1055 ++ColorCount[color];
1056 }
1057
1058 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1059 SUnit *SU = &DAG->SUnits[SUNum];
1060 unsigned color = CurrentColoring[SU->NodeNum];
1061 std::set<unsigned> SUColors;
1062
1063 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1064 continue;
1065
1066 if (ColorCount[color] > 1)
1067 continue;
1068
1069 for (SDep& SuccDep : SU->Succs) {
1070 SUnit *Succ = SuccDep.getSUnit();
1071 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1072 continue;
1073 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1074 }
1075 if (SUColors.size() == 1 && *SUColors.begin() != color) {
1076 --ColorCount[color];
1077 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1078 ++ColorCount[*SUColors.begin()];
1079 }
1080 }
1081}
1082
// Split oversized blocks into smaller ones. Not implemented yet: this
// is currently a deliberate no-op placeholder in the block-creation
// pipeline.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
1086
1087void SIScheduleBlockCreator::regroupNoUserInstructions() {
1088 unsigned DAGSize = DAG->SUnits.size();
1089 int GroupID = NextNonReservedID++;
1090
1091 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1092 SUnit *SU = &DAG->SUnits[SUNum];
1093 bool hasSuccessor = false;
1094
1095 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1096 continue;
1097
1098 for (SDep& SuccDep : SU->Succs) {
1099 SUnit *Succ = SuccDep.getSUnit();
1100 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1101 continue;
1102 hasSuccessor = true;
1103 }
1104 if (!hasSuccessor)
1105 CurrentColoring[SU->NodeNum] = GroupID;
1106 }
1107}
1108
// Group all export instructions into one dedicated color so that the
// resulting block is naturally scheduled last. If any export has a
// non-export successor inside the region, abandon the grouping
// entirely (the partial alternative is discussed below).
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register than used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all other
          // exports that are not depended on by non-exports, directly or
          // indirectly. Simply skipping this particular export but grouping all
          // others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1154
// Run the full coloring pipeline for the requested variant, then turn
// the final coloring into SIScheduleBlocks: one block per color, with
// inter-block Pred/Succ edges derived from the SU dependency edges.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a final color to the index of its block in CurrentBlocks.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved IDs (1..DAGSize) are for high latency groups; everything
  // above DAGSize is a non-reserved color.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    // First time this color is seen: create its block.
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only cross-block edges become block successors; control
      // dependencies carry no data.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}
1231
1232// Two functions taken from Codegen/MachineScheduler.cpp
1233
1234/// Non-const version.
1235static MachineBasicBlock::iterator
1236nextIfDebug(MachineBasicBlock::iterator I,
1237 MachineBasicBlock::const_iterator End) {
1238 for (; I != End; ++I) {
1239 if (!I->isDebugInstr())
1240 break;
1241 }
1242 return I;
1243}
1244
// Topologically sort the blocks (Kahn's algorithm driven from the
// leaves), filling TopDownIndex2Block / TopDownBlock2Index and deriving
// BottomUpIndex2Block as the reverse ordering.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the number of unprocessed
  // successors of each block; blocks with none seed the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Indices are handed out from the back (DAGSize-1 down to 0), since
  // we start from the blocks with no successors.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      // A predecessor becomes ready once all its successors got an index.
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  // Bottom-up order is simply the top-down order reversed.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1292
// Schedule the instructions inside each block. A fast schedule is done
// first so every block corresponds to a contiguous range of
// instructions; the instruction stream and LiveIntervals are then
// temporarily reordered to match, the real per-block scheduling (with
// register usage tracking) is performed, and finally the original
// instruction positions are restored.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // Old and new positions of every moved instruction, recorded so the
  // original order can be restored afterwards.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  // Move instructions block by block, in block topological order.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance the insertion point.
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Undo the moves in reverse recording order.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}
1373
1374void SIScheduleBlockCreator::fillStats() {
1375 unsigned DAGSize = CurrentBlocks.size();
1376
1377 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1378 int BlockIndice = TopDownIndex2Block[i];
1379 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1380 if (Block->getPreds().empty())
1381 Block->Depth = 0;
1382 else {
1383 unsigned Depth = 0;
1384 for (SIScheduleBlock *Pred : Block->getPreds()) {
1385 if (Depth < Pred->Depth + Pred->getCost())
1386 Depth = Pred->Depth + Pred->getCost();
1387 }
1388 Block->Depth = Depth;
1389 }
1390 }
1391
1392 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1393 int BlockIndice = BottomUpIndex2Block[i];
1394 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1395 if (Block->getSuccs().empty())
1396 Block->Height = 0;
1397 else {
1398 unsigned Height = 0;
1399 for (const auto &Succ : Block->getSuccs())
1400 Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1401 Block->Height = Height;
1402 }
1403 }
1404}
1405
1406// SIScheduleBlockScheduler //
1407
1408SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1409 SISchedulerBlockSchedulerVariant Variant,
1410 SIScheduleBlocks BlocksStruct) :
1411 DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1412 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1413 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1414
1415 // Fill the usage of every output
1416 // Warning: while by construction we always have a link between two blocks
1417 // when one needs a result from the other, the number of users of an output
1418 // is not the sum of child blocks having as input the same virtual register.
1419 // Here is an example. A produces x and y. B eats x and produces x'.
1420 // C eats x' and y. The register coalescer may have attributed the same
1421 // virtual register to x and x'.
1422 // To count accurately, we do a topological sort. In case the register is
1423 // found for several parents, we increment the usage of the one with the
1424 // highest topological index.
1425 LiveOutRegsNumUsages.resize(Blocks.size());
1426 for (SIScheduleBlock *Block : Blocks) {
1427 for (unsigned Reg : Block->getInRegs()) {
1428 bool Found = false;
1429 int topoInd = -1;
1430 for (SIScheduleBlock* Pred: Block->getPreds()) {
1431 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1432 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1433
1434 if (RegPos != PredOutRegs.end()) {
1435 Found = true;
1436 if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1437 topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1438 }
1439 }
1440 }
1441
1442 if (!Found)
1443 continue;
1444
1445 int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1446 ++LiveOutRegsNumUsages[PredID][Reg];
1447 }
1448 }
1449
1450 LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1451 BlockNumPredsLeft.resize(Blocks.size());
1452 BlockNumSuccsLeft.resize(Blocks.size());
1453
1454 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1455 SIScheduleBlock *Block = Blocks[i];
1456 BlockNumPredsLeft[i] = Block->getPreds().size();
1457 BlockNumSuccsLeft[i] = Block->getSuccs().size();
1458 }
1459
1460#ifndef NDEBUG
1461 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1462 SIScheduleBlock *Block = Blocks[i];
1463 assert(Block->getID() == i)(static_cast <bool> (Block->getID() == i) ? void (0)
: __assert_fail ("Block->getID() == i", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 1463, __extension__ __PRETTY_FUNCTION__))
;
1464 }
1465#endif
1466
1467 std::set<unsigned> InRegs = DAG->getInRegs();
1468 addLiveRegs(InRegs);
1469
1470 // Increase LiveOutRegsNumUsages for blocks
1471 // producing registers consumed in another
1472 // scheduling region.
1473 for (unsigned Reg : DAG->getOutRegs()) {
1474 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1475 // Do reverse traversal
1476 int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
1477 SIScheduleBlock *Block = Blocks[ID];
1478 const std::set<unsigned> &OutRegs = Block->getOutRegs();
1479
1480 if (OutRegs.find(Reg) == OutRegs.end())
1481 continue;
1482
1483 ++LiveOutRegsNumUsages[ID][Reg];
1484 break;
1485 }
1486 }
1487
1488 // Fill LiveRegsConsumers for regs that were already
1489 // defined before scheduling.
1490 for (SIScheduleBlock *Block : Blocks) {
1491 for (unsigned Reg : Block->getInRegs()) {
1492 bool Found = false;
1493 for (SIScheduleBlock* Pred: Block->getPreds()) {
1494 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1495 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1496
1497 if (RegPos != PredOutRegs.end()) {
1498 Found = true;
1499 break;
1500 }
1501 }
1502
1503 if (!Found)
1504 ++LiveRegsConsumers[Reg];
1505 }
1506 }
1507
1508 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1509 SIScheduleBlock *Block = Blocks[i];
1510 if (BlockNumPredsLeft[i] == 0) {
1511 ReadyBlocks.push_back(Block);
1512 }
1513 }
1514
1515 while (SIScheduleBlock *Block = pickBlock()) {
1516 BlocksScheduled.push_back(Block);
1517 blockScheduled(Block);
1518 }
1519
1520 LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1521 : BlocksScheduled) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1522 dbgs() << ' ' << Block->getID();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1523 } dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
;
1524}
1525
1526bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1527 SIBlockSchedCandidate &TryCand) {
1528 if (!Cand.isValid()) {
1529 TryCand.Reason = NodeOrder;
1530 return true;
1531 }
1532
1533 // Try to hide high latencies.
1534 if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
1535 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1536 return true;
1537 // Schedule high latencies early so you can hide them better.
1538 if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1539 TryCand, Cand, Latency))
1540 return true;
1541 if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
1542 TryCand, Cand, Depth))
1543 return true;
1544 if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
1545 Cand.NumHighLatencySuccessors,
1546 TryCand, Cand, Successor))
1547 return true;
1548 return false;
1549}
1550
1551bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1552 SIBlockSchedCandidate &TryCand) {
1553 if (!Cand.isValid()) {
1554 TryCand.Reason = NodeOrder;
1555 return true;
1556 }
1557
1558 if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1559 TryCand, Cand, RegUsage))
1560 return true;
1561 if (SISched::tryGreater(TryCand.NumSuccessors > 0,
1562 Cand.NumSuccessors > 0,
1563 TryCand, Cand, Successor))
1564 return true;
1565 if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1566 return true;
1567 if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1568 TryCand, Cand, RegUsage))
1569 return true;
1570 return false;
1571}
1572
1573SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1574 SIBlockSchedCandidate Cand;
1575 std::vector<SIScheduleBlock*>::iterator Best;
1576 SIScheduleBlock *Block;
1577 if (ReadyBlocks.empty())
1578 return nullptr;
1579
1580 DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1581 VregCurrentUsage, SregCurrentUsage);
1582 if (VregCurrentUsage > maxVregUsage)
1583 maxVregUsage = VregCurrentUsage;
1584 if (SregCurrentUsage > maxSregUsage)
1585 maxSregUsage = SregCurrentUsage;
1586 LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1587 for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1588 : ReadyBlocks) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1589 << Block->getID() << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1590 dbgs() << "\nCurrent Live:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1591 for (unsigned Regdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1592 : LiveRegs) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1593 << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1594 dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1595 dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1596 dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
;
1597
1598 Cand.Block = nullptr;
1599 for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1600 E = ReadyBlocks.end(); I != E; ++I) {
1601 SIBlockSchedCandidate TryCand;
1602 TryCand.Block = *I;
1603 TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1604 TryCand.VGPRUsageDiff =
1605 checkRegUsageImpact(TryCand.Block->getInRegs(),
1606 TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
1607 TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1608 TryCand.NumHighLatencySuccessors =
1609 TryCand.Block->getNumHighLatencySuccessors();
1610 TryCand.LastPosHighLatParentScheduled =
1611 (unsigned int) std::max<int> (0,
1612 LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1613 LastPosWaitedHighLatency);
1614 TryCand.Height = TryCand.Block->Height;
1615 // Try not to increase VGPR usage too much, else we may spill.
1616 if (VregCurrentUsage > 120 ||
1617 Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1618 if (!tryCandidateRegUsage(Cand, TryCand) &&
1619 Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1620 tryCandidateLatency(Cand, TryCand);
1621 } else {
1622 if (!tryCandidateLatency(Cand, TryCand))
1623 tryCandidateRegUsage(Cand, TryCand);
1624 }
1625 if (TryCand.Reason != NoCand) {
1626 Cand.setBest(TryCand);
1627 Best = I;
1628 LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' 'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Best Current Choice: "
<< Cand.Block->getID() << ' ' << getReasonStr
(Cand.Reason) << '\n'; } } while (false)
1629 << getReasonStr(Cand.Reason) << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Best Current Choice: "
<< Cand.Block->getID() << ' ' << getReasonStr
(Cand.Reason) << '\n'; } } while (false)
;
1630 }
1631 }
1632
1633 LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1634 dbgs() << "Is a block with high latency instruction: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1635 << (Cand.IsHighLatency ? "yes\n" : "no\n");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1636 dbgs() << "Position of last high latency dependency: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1637 << Cand.LastPosHighLatParentScheduled << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1638 dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1639 dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
;
1640
1641 Block = Cand.Block;
1642 ReadyBlocks.erase(Best);
1643 return Block;
1644}
1645
1646// Tracking of currently alive registers to determine VGPR Usage.
1647
1648void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1649 for (Register Reg : Regs) {
1650 // For now only track virtual registers.
1651 if (!Reg.isVirtual())
1652 continue;
1653 // If not already in the live set, then add it.
1654 (void) LiveRegs.insert(Reg);
1655 }
1656}
1657
1658void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1659 std::set<unsigned> &Regs) {
1660 for (unsigned Reg : Regs) {
1661 // For now only track virtual registers.
1662 std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1663 assert (Pos != LiveRegs.end() && // Reg must be live.(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
1664 LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
1665 LiveRegsConsumers[Reg] >= 1)(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
;
1666 --LiveRegsConsumers[Reg];
1667 if (LiveRegsConsumers[Reg] == 0)
1668 LiveRegs.erase(Pos);
1669 }
1670}
1671
1672void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1673 for (const auto &Block : Parent->getSuccs()) {
1674 if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1675 ReadyBlocks.push_back(Block.first);
1676
1677 if (Parent->isHighLatencyBlock() &&
1678 Block.second == SIScheduleBlockLinkKind::Data)
1679 LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1680 }
1681}
1682
1683void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1684 decreaseLiveRegs(Block, Block->getInRegs());
1685 addLiveRegs(Block->getOutRegs());
1686 releaseBlockSuccs(Block);
1687 for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
1688 // We produce this register, thus it must not be previously alive.
1689 assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||(static_cast <bool> (LiveRegsConsumers.find(RegP.first)
== LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] ==
0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1690, __extension__
__PRETTY_FUNCTION__))
1690 LiveRegsConsumers[RegP.first] == 0)(static_cast <bool> (LiveRegsConsumers.find(RegP.first)
== LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] ==
0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1690, __extension__
__PRETTY_FUNCTION__))
;
1691 LiveRegsConsumers[RegP.first] += RegP.second;
1692 }
1693 if (LastPosHighLatencyParentScheduled[Block->getID()] >
1694 (unsigned)LastPosWaitedHighLatency)
1695 LastPosWaitedHighLatency =
1696 LastPosHighLatencyParentScheduled[Block->getID()];
1697 ++NumBlockScheduled;
1698}
1699
1700std::vector<int>
1701SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1702 std::set<unsigned> &OutRegs) {
1703 std::vector<int> DiffSetPressure;
1704 DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1705
1706 for (Register Reg : InRegs) {
1707 // For now only track virtual registers.
1708 if (!Reg.isVirtual())
1709 continue;
1710 if (LiveRegsConsumers[Reg] > 1)
1711 continue;
1712 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1713 for (; PSetI.isValid(); ++PSetI) {
1714 DiffSetPressure[*PSetI] -= PSetI.getWeight();
1715 }
1716 }
1717
1718 for (Register Reg : OutRegs) {
1719 // For now only track virtual registers.
1720 if (!Reg.isVirtual())
1721 continue;
1722 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1723 for (; PSetI.isValid(); ++PSetI) {
1724 DiffSetPressure[*PSetI] += PSetI.getWeight();
1725 }
1726 }
1727
1728 return DiffSetPressure;
1729}
1730
1731// SIScheduler //
1732
1733struct SIScheduleBlockResult
1734SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1735 SISchedulerBlockSchedulerVariant ScheduleVariant) {
1736 SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1737 SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1738 std::vector<SIScheduleBlock*> ScheduledBlocks;
1739 struct SIScheduleBlockResult Res;
1740
1741 ScheduledBlocks = Scheduler.getBlocks();
1742
1743 for (SIScheduleBlock *Block : ScheduledBlocks) {
1744 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1745
1746 for (SUnit* SU : SUs)
1747 Res.SUs.push_back(SU->NodeNum);
1748 }
1749
1750 Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1751 Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1752 return Res;
1753}
1754
1755// SIScheduleDAGMI //
1756
1757SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1758 ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1759 SITII = static_cast<const SIInstrInfo*>(TII);
1760 SITRI = static_cast<const SIRegisterInfo*>(TRI);
1761}
1762
1763SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1764
1765// Code adapted from scheduleDAG.cpp
1766// Does a topological sort over the SUs.
1767// Both TopDown and BottomUp
1768void SIScheduleDAGMI::topologicalSort() {
1769 Topo.InitDAGTopologicalSorting();
1770
1771 TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1772 BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1773}
1774
1775// Move low latencies further from their user without
1776// increasing SGPR usage (in general)
1777// This is to be replaced by a better pass that would
1778// take into account SGPR usage (based on VGPR Usage
1779// and the corresponding wavefront count), that would
1780// try to merge groups of loads if it make sense, etc
1781void SIScheduleDAGMI::moveLowLatencies() {
1782 unsigned DAGSize = SUnits.size();
1783 int LastLowLatencyUser = -1;
1784 int LastLowLatencyPos = -1;
1785
1786 for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1787 SUnit *SU = &SUnits[ScheduledSUnits[i]];
1788 bool IsLowLatencyUser = false;
1789 unsigned MinPos = 0;
1790
1791 for (SDep& PredDep : SU->Preds) {
1792 SUnit *Pred = PredDep.getSUnit();
1793 if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1794 IsLowLatencyUser = true;
1795 }
1796 if (Pred->NodeNum >= DAGSize)
1797 continue;
1798 unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1799 if (PredPos >= MinPos)
1800 MinPos = PredPos + 1;
1801 }
1802
1803 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1804 unsigned BestPos = LastLowLatencyUser + 1;
1805 if ((int)BestPos <= LastLowLatencyPos)
1806 BestPos = LastLowLatencyPos + 1;
1807 if (BestPos < MinPos)
1808 BestPos = MinPos;
1809 if (BestPos < i) {
1810 for (unsigned u = i; u > BestPos; --u) {
1811 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1812 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1813 }
1814 ScheduledSUnits[BestPos] = SU->NodeNum;
1815 ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1816 }
1817 LastLowLatencyPos = BestPos;
1818 if (IsLowLatencyUser)
1819 LastLowLatencyUser = BestPos;
1820 } else if (IsLowLatencyUser) {
1821 LastLowLatencyUser = i;
1822 // Moves COPY instructions on which depends
1823 // the low latency instructions too.
1824 } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1825 bool CopyForLowLat = false;
1826 for (SDep& SuccDep : SU->Succs) {
1827 SUnit *Succ = SuccDep.getSUnit();
1828 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1829 continue;
1830 if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1831 CopyForLowLat = true;
1832 }
1833 }
1834 if (!CopyForLowLat)
1835 continue;
1836 if (MinPos < i) {
1837 for (unsigned u = i; u > MinPos; --u) {
1838 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1839 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1840 }
1841 ScheduledSUnits[MinPos] = SU->NodeNum;
1842 ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1843 }
1844 }
1845 }
1846}
1847
1848void SIScheduleDAGMI::restoreSULinksLeft() {
1849 for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1850 SUnits[i].isScheduled = false;
1851 SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1852 SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1853 SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1854 SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1855 }
1856}
1857
1858// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1859template<typename _Iterator> void
1860SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1861 unsigned &VgprUsage, unsigned &SgprUsage) {
1862 VgprUsage = 0;
1863 SgprUsage = 0;
1864 for (_Iterator RegI = First; RegI != End; ++RegI) {
1865 Register Reg = *RegI;
1866 // For now only track virtual registers
1867 if (!Reg.isVirtual())
1868 continue;
1869 PSetIterator PSetI = MRI.getPressureSets(Reg);
1870 for (; PSetI.isValid(); ++PSetI) {
1871 if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
1872 VgprUsage += PSetI.getWeight();
1873 else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
1874 SgprUsage += PSetI.getWeight();
1875 }
1876 }
1877}
1878
1879void SIScheduleDAGMI::schedule()
1880{
1881 SmallVector<SUnit*, 8> TopRoots, BotRoots;
1882 SIScheduleBlockResult Best, Temp;
1883 LLVM_DEBUG(dbgs() << "Preparing Scheduling\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Preparing Scheduling\n"
; } } while (false)
;
1884
1885 buildDAGWithRegPressure();
1886 postProcessDAG();
1887
1888 LLVM_DEBUG(dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dump(); } } while (false)
;
1889 if (PrintDAGs)
1890 dump();
1891 if (ViewMISchedDAGs)
1892 viewGraph();
1893
1894 topologicalSort();
1895 findRootsAndBiasEdges(TopRoots, BotRoots);
1896 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1897 // functions, but to make them happy we must initialize
1898 // the default Scheduler implementation (even if we do not
1899 // run it)
1900 SchedImpl->initialize(this);
1901 initQueues(TopRoots, BotRoots);
1902
1903 // Fill some stats to help scheduling.
1904
1905 SUnitsLinksBackup = SUnits;
1906 IsLowLatencySU.clear();
1907 LowLatencyOffset.clear();
1908 IsHighLatencySU.clear();
1909
1910 IsLowLatencySU.resize(SUnits.size(), 0);
1911 LowLatencyOffset.resize(SUnits.size(), 0);
1912 IsHighLatencySU.resize(SUnits.size(), 0);
1913
1914 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1915 SUnit *SU = &SUnits[i];
1916 const MachineOperand *BaseLatOp;
1917 int64_t OffLatReg;
1918 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1919 IsLowLatencySU[i] = 1;
1920 bool OffsetIsScalable;
1921 if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
1922 OffsetIsScalable, TRI))
1923 LowLatencyOffset[i] = OffLatReg;
1924 } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
1925 IsHighLatencySU[i] = 1;
1926 }
1927
1928 SIScheduler Scheduler(this);
1929 Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1930 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1931
1932 // if VGPR usage is extremely high, try other good performing variants
1933 // which could lead to lower VGPR usage
1934 if (Best.MaxVGPRUsage > 180) {
1935 static const std::pair<SISchedulerBlockCreatorVariant,
1936 SISchedulerBlockSchedulerVariant>
1937 Variants[] = {
1938 { LatenciesAlone, BlockRegUsageLatency },
1939// { LatenciesAlone, BlockRegUsage },
1940 { LatenciesGrouped, BlockLatencyRegUsage },
1941// { LatenciesGrouped, BlockRegUsageLatency },
1942// { LatenciesGrouped, BlockRegUsage },
1943 { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1944// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1945// { LatenciesAlonePlusConsecutive, BlockRegUsage }
1946 };
1947 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1948 Temp = Scheduler.scheduleVariant(v.first, v.second);
1949 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1950 Best = Temp;
1951 }
1952 }
1953 // if VGPR usage is still extremely high, we may spill. Try other variants
1954 // which are less performing, but that could lead to lower VGPR usage.
1955 if (Best.MaxVGPRUsage > 200) {
1956 static const std::pair<SISchedulerBlockCreatorVariant,
1957 SISchedulerBlockSchedulerVariant>
1958 Variants[] = {
1959// { LatenciesAlone, BlockRegUsageLatency },
1960 { LatenciesAlone, BlockRegUsage },
1961// { LatenciesGrouped, BlockLatencyRegUsage },
1962 { LatenciesGrouped, BlockRegUsageLatency },
1963 { LatenciesGrouped, BlockRegUsage },
1964// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1965 { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1966 { LatenciesAlonePlusConsecutive, BlockRegUsage }
1967 };
1968 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1969 Temp = Scheduler.scheduleVariant(v.first, v.second);
1970 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1971 Best = Temp;
1972 }
1973 }
1974
1975 ScheduledSUnits = Best.SUs;
1976 ScheduledSUnitsInv.resize(SUnits.size());
1977
1978 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1979 ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
1980 }
1981
1982 moveLowLatencies();
1983
1984 // Tell the outside world about the result of the scheduling.
1985
1986 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker")(static_cast <bool> (TopRPTracker.getPos() == RegionBegin
&& "bad initial Top tracker") ? void (0) : __assert_fail
("TopRPTracker.getPos() == RegionBegin && \"bad initial Top tracker\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1986, __extension__
__PRETTY_FUNCTION__))
;
1987 TopRPTracker.setPos(CurrentTop);
1988
1989 for (unsigned I : ScheduledSUnits) {
1990 SUnit *SU = &SUnits[I];
1991
1992 scheduleMI(SU, true);
1993
1994 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Scheduling SU(" <<
SU->NodeNum << ") " << *SU->getInstr(); } }
while (false)
1995 << *SU->getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Scheduling SU(" <<
SU->NodeNum << ") " << *SU->getInstr(); } }
while (false)
;
1996 }
1997
1998 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.")(static_cast <bool> (CurrentTop == CurrentBottom &&
"Nonempty unscheduled zone.") ? void (0) : __assert_fail ("CurrentTop == CurrentBottom && \"Nonempty unscheduled zone.\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1998, __extension__
__PRETTY_FUNCTION__))
;
1999
2000 placeDebugValues();
2001
2002 LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2003 dbgs() << "*** Final schedule for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2004 << printMBBReference(*begin()->getParent()) << " ***\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2005 dumpSchedule();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2006 dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2007 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
;
2008}