Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
Warning: line 770, column 9
Value stored to 'ProposedColor' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name SIMachineScheduler.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem 
/usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
1//===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Machine Scheduler interface
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIMachineScheduler.h"
15#include "SIInstrInfo.h"
16#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17#include "llvm/CodeGen/LiveIntervals.h"
18#include "llvm/CodeGen/MachineRegisterInfo.h"
19
20using namespace llvm;
21
// Debug tag used by LLVM_DEBUG/dbgs() filtering for this pass.
// (The report rendering had expanded the macro into its own definition,
// duplicating the string literal; the real definition is a single literal.)
#define DEBUG_TYPE "machine-scheduler"
23
24// This scheduler implements a different scheduling algorithm than
25// GenericScheduler.
26//
27// There are several specific architecture behaviours that can't be modelled
28// for GenericScheduler:
29// . When accessing the result of an SGPR load instruction, you have to wait
30// for all the SGPR load instructions before your current instruction to
31// have finished.
 32// . When accessing the result of a VGPR load instruction, you have to wait
33// for all the VGPR load instructions previous to the VGPR load instruction
34// you are interested in to finish.
 35// . The lower the register pressure, the better load latencies are hidden
36//
 37// Moreover some specificities (like the fact a lot of instructions in the shader
38// have few dependencies) makes the generic scheduler have some unpredictable
39// behaviours. For example when register pressure becomes high, it can either
40// manage to prevent register pressure from going too high, or it can
41// increase register pressure even more than if it hadn't taken register
42// pressure into account.
43//
44// Also some other bad behaviours are generated, like loading at the beginning
45// of the shader a constant in VGPR you won't need until the end of the shader.
46//
47// The scheduling problem for SI can distinguish three main parts:
48// . Hiding high latencies (texture sampling, etc)
49// . Hiding low latencies (SGPR constant loading, etc)
50// . Keeping register usage low for better latency hiding and general
51// performance
52//
53// Some other things can also affect performance, but are hard to predict
54// (cache usage, the fact the HW can issue several instructions from different
55// wavefronts if different types, etc)
56//
57// This scheduler tries to solve the scheduling problem by dividing it into
58// simpler sub-problems. It divides the instructions into blocks, schedules
59// locally inside the blocks where it takes care of low latencies, and then
60// chooses the order of the blocks by taking care of high latencies.
61// Dividing the instructions into blocks helps control keeping register
62// usage low.
63//
64// First the instructions are put into blocks.
65// We want the blocks help control register usage and hide high latencies
66// later. To help control register usage, we typically want all local
67// computations, when for example you create a result that can be consumed
68// right away, to be contained in a block. Block inputs and outputs would
69// typically be important results that are needed in several locations of
70// the shader. Since we do want blocks to help hide high latencies, we want
71// the instructions inside the block to have a minimal set of dependencies
72// on high latencies. It will make it easy to pick blocks to hide specific
73// high latencies.
74// The block creation algorithm is divided into several steps, and several
75// variants can be tried during the scheduling process.
76//
77// Second the order of the instructions inside the blocks is chosen.
78// At that step we do take into account only register usage and hiding
79// low latency instructions
80//
81// Third the block order is chosen, there we try to hide high latencies
82// and keep register usage low.
83//
84// After the third step, a pass is done to improve the hiding of low
85// latencies.
86//
87// Actually when talking about 'low latency' or 'high latency' it includes
88// both the latency to get the cache (or global mem) data go to the register,
89// and the bandwidth limitations.
90// Increasing the number of active wavefronts helps hide the former, but it
91// doesn't solve the latter, thus why even if wavefront count is high, we have
92// to try have as many instructions hiding high latencies as possible.
93// The OpenCL doc says for example latency of 400 cycles for a global mem
94// access, which is hidden by 10 instructions if the wavefront count is 10.
95
96// Some figures taken from AMD docs:
97// Both texture and constant L1 caches are 4-way associative with 64 bytes
98// lines.
99// Constant cache is shared with 4 CUs.
100// For texture sampling, the address generation unit receives 4 texture
101// addresses per cycle, thus we could expect texture sampling latency to be
102// equivalent to 4 instructions in the very best case (a VGPR is 64 work items,
103// instructions in a wavefront group are executed every 4 cycles),
104// or 16 instructions if the other wavefronts associated to the 3 other VALUs
105// of the CU do texture sampling too. (Don't take these figures too seriously,
106// as I'm not 100% sure of the computation)
107// Data exports should get similar latency.
 108// For constant loading, the cache is shared with 4 CUs.
109// The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit"
110// I guess if the other CU don't read the cache, it can go up to 64B/cycle.
111// It means a simple s_buffer_load should take one instruction to hide, as
112// well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same
113// cache line.
114//
115// As of today the driver doesn't preload the constants in cache, thus the
116// first loads get extra latency. The doc says global memory access can be
117// 300-600 cycles. We do not specially take that into account when scheduling
118// As we expect the driver to be able to preload the constants soon.
119
120// common code //
121
122#ifndef NDEBUG
123
124static const char *getReasonStr(SIScheduleCandReason Reason) {
125 switch (Reason) {
126 case NoCand: return "NOCAND";
127 case RegUsage: return "REGUSAGE";
128 case Latency: return "LATENCY";
129 case Successor: return "SUCCESSOR";
130 case Depth: return "DEPTH";
131 case NodeOrder: return "ORDER";
132 }
133 llvm_unreachable("Unknown reason!")::llvm::llvm_unreachable_internal("Unknown reason!", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 133)
;
134}
135
136#endif
137
138namespace llvm {
139namespace SISched {
140static bool tryLess(int TryVal, int CandVal,
141 SISchedulerCandidate &TryCand,
142 SISchedulerCandidate &Cand,
143 SIScheduleCandReason Reason) {
144 if (TryVal < CandVal) {
145 TryCand.Reason = Reason;
146 return true;
147 }
148 if (TryVal > CandVal) {
149 if (Cand.Reason > Reason)
150 Cand.Reason = Reason;
151 return true;
152 }
153 Cand.setRepeat(Reason);
154 return false;
155}
156
157static bool tryGreater(int TryVal, int CandVal,
158 SISchedulerCandidate &TryCand,
159 SISchedulerCandidate &Cand,
160 SIScheduleCandReason Reason) {
161 if (TryVal > CandVal) {
162 TryCand.Reason = Reason;
163 return true;
164 }
165 if (TryVal < CandVal) {
166 if (Cand.Reason > Reason)
167 Cand.Reason = Reason;
168 return true;
169 }
170 Cand.setRepeat(Reason);
171 return false;
172}
173} // end namespace SISched
174} // end namespace llvm
175
176// SIScheduleBlock //
177
178void SIScheduleBlock::addUnit(SUnit *SU) {
179 NodeNum2Index[SU->NodeNum] = SUnits.size();
180 SUnits.push_back(SU);
181}
182
183#ifndef NDEBUG
184void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) {
185
186 dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
187 dbgs() << '\n';
188}
189#endif
190
191void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand,
192 SISchedCandidate &TryCand) {
193 // Initialize the candidate if needed.
194 if (!Cand.isValid()) {
195 TryCand.Reason = NodeOrder;
196 return;
197 }
198
199 if (Cand.SGPRUsage > 60 &&
200 SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage,
201 TryCand, Cand, RegUsage))
202 return;
203
204 // Schedule low latency instructions as top as possible.
205 // Order of priority is:
206 // . Low latency instructions which do not depend on other low latency
207 // instructions we haven't waited for
208 // . Other instructions which do not depend on low latency instructions
209 // we haven't waited for
210 // . Low latencies
211 // . All other instructions
212 // Goal is to get: low latency instructions - independent instructions
213 // - (eventually some more low latency instructions)
214 // - instructions that depend on the first low latency instructions.
215 // If in the block there is a lot of constant loads, the SGPR usage
216 // could go quite high, thus above the arbitrary limit of 60 will encourage
217 // use the already loaded constants (in order to release some SGPRs) before
218 // loading more.
219 if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent,
220 Cand.HasLowLatencyNonWaitedParent,
221 TryCand, Cand, SIScheduleCandReason::Depth))
222 return;
223
224 if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency,
225 TryCand, Cand, SIScheduleCandReason::Depth))
226 return;
227
228 if (TryCand.IsLowLatency &&
229 SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset,
230 TryCand, Cand, SIScheduleCandReason::Depth))
231 return;
232
233 if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage,
234 TryCand, Cand, RegUsage))
235 return;
236
237 // Fall through to original instruction order.
238 if (TryCand.SU->NodeNum < Cand.SU->NodeNum) {
239 TryCand.Reason = NodeOrder;
240 }
241}
242
243SUnit* SIScheduleBlock::pickNode() {
244 SISchedCandidate TopCand;
245
246 for (SUnit* SU : TopReadySUs) {
247 SISchedCandidate TryCand;
248 std::vector<unsigned> pressure;
249 std::vector<unsigned> MaxPressure;
250 // Predict register usage after this instruction.
251 TryCand.SU = SU;
252 TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
253 TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
254 TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
255 TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
256 TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
257 TryCand.HasLowLatencyNonWaitedParent =
258 HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]];
259 tryCandidateTopDown(TopCand, TryCand);
260 if (TryCand.Reason != NoCand)
261 TopCand.setBest(TryCand);
262 }
263
264 return TopCand.SU;
265}
266
267
268// Schedule something valid.
269void SIScheduleBlock::fastSchedule() {
270 TopReadySUs.clear();
271 if (Scheduled)
272 undoSchedule();
273
274 for (SUnit* SU : SUnits) {
275 if (!SU->NumPredsLeft)
276 TopReadySUs.push_back(SU);
277 }
278
279 while (!TopReadySUs.empty()) {
280 SUnit *SU = TopReadySUs[0];
281 ScheduledSUnits.push_back(SU);
282 nodeScheduled(SU);
283 }
284
285 Scheduled = true;
286}
287
288// Returns if the register was set between first and last.
289static bool isDefBetween(unsigned Reg,
290 SlotIndex First, SlotIndex Last,
291 const MachineRegisterInfo *MRI,
292 const LiveIntervals *LIS) {
293 for (MachineRegisterInfo::def_instr_iterator
294 UI = MRI->def_instr_begin(Reg),
295 UE = MRI->def_instr_end(); UI != UE; ++UI) {
296 const MachineInstr* MI = &*UI;
297 if (MI->isDebugValue())
298 continue;
299 SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
300 if (InstSlot >= First && InstSlot <= Last)
301 return true;
302 }
303 return false;
304}
305
306void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
307 MachineBasicBlock::iterator EndBlock) {
308 IntervalPressure Pressure, BotPressure;
309 RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure);
310 LiveIntervals *LIS = DAG->getLIS();
311 MachineRegisterInfo *MRI = DAG->getMRI();
312 DAG->initRPTracker(TopRPTracker);
313 DAG->initRPTracker(BotRPTracker);
314 DAG->initRPTracker(RPTracker);
315
316 // Goes though all SU. RPTracker captures what had to be alive for the SUs
317 // to execute, and what is still alive at the end.
318 for (SUnit* SU : ScheduledSUnits) {
319 RPTracker.setPos(SU->getInstr());
320 RPTracker.advance();
321 }
322
323 // Close the RPTracker to finalize live ins/outs.
324 RPTracker.closeRegion();
325
326 // Initialize the live ins and live outs.
327 TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
328 BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
329
330 // Do not Track Physical Registers, because it messes up.
331 for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
332 if (RegMaskPair.RegUnit.isVirtual())
333 LiveInRegs.insert(RegMaskPair.RegUnit);
334 }
335 LiveOutRegs.clear();
336 // There is several possibilities to distinguish:
337 // 1) Reg is not input to any instruction in the block, but is output of one
338 // 2) 1) + read in the block and not needed after it
339 // 3) 1) + read in the block but needed in another block
340 // 4) Reg is input of an instruction but another block will read it too
341 // 5) Reg is input of an instruction and then rewritten in the block.
342 // result is not read in the block (implies used in another block)
343 // 6) Reg is input of an instruction and then rewritten in the block.
344 // result is read in the block and not needed in another block
345 // 7) Reg is input of an instruction and then rewritten in the block.
346 // result is read in the block but also needed in another block
347 // LiveInRegs will contains all the regs in situation 4, 5, 6, 7
348 // We want LiveOutRegs to contain only Regs whose content will be read after
349 // in another block, and whose content was written in the current block,
350 // that is we want it to get 1, 3, 5, 7
351 // Since we made the MIs of a block to be packed all together before
352 // scheduling, then the LiveIntervals were correct, and the RPTracker was
353 // able to correctly handle 5 vs 6, 2 vs 3.
354 // (Note: This is not sufficient for RPTracker to not do mistakes for case 4)
355 // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7
356 // Comparing to LiveInRegs is not sufficient to differentiate 4 vs 5, 7
357 // The use of findDefBetween removes the case 4.
358 for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
359 Register Reg = RegMaskPair.RegUnit;
360 if (Reg.isVirtual() &&
361 isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
362 LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
363 LIS)) {
364 LiveOutRegs.insert(Reg);
365 }
366 }
367
368 // Pressure = sum_alive_registers register size
369 // Internally llvm will represent some registers as big 128 bits registers
370 // for example, but they actually correspond to 4 actual 32 bits registers.
371 // Thus Pressure is not equal to num_alive_registers * constant.
372 LiveInPressure = TopPressure.MaxSetPressure;
373 LiveOutPressure = BotPressure.MaxSetPressure;
374
375 // Prepares TopRPTracker for top down scheduling.
376 TopRPTracker.closeTop();
377}
378
379void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock,
380 MachineBasicBlock::iterator EndBlock) {
381 if (!Scheduled)
382 fastSchedule();
383
384 // PreScheduling phase to set LiveIn and LiveOut.
385 initRegPressure(BeginBlock, EndBlock);
386 undoSchedule();
387
388 // Schedule for real now.
389
390 TopReadySUs.clear();
391
392 for (SUnit* SU : SUnits) {
393 if (!SU->NumPredsLeft)
394 TopReadySUs.push_back(SU);
395 }
396
397 while (!TopReadySUs.empty()) {
398 SUnit *SU = pickNode();
399 ScheduledSUnits.push_back(SU);
400 TopRPTracker.setPos(SU->getInstr());
401 TopRPTracker.advance();
402 nodeScheduled(SU);
403 }
404
405 // TODO: compute InternalAdditionalPressure.
406 InternalAdditionalPressure.resize(TopPressure.MaxSetPressure.size());
407
408 // Check everything is right.
409#ifndef NDEBUG
410 assert(SUnits.size() == ScheduledSUnits.size() &&(static_cast <bool> (SUnits.size() == ScheduledSUnits.size
() && TopReadySUs.empty()) ? void (0) : __assert_fail
("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 411, __extension__
__PRETTY_FUNCTION__))
411 TopReadySUs.empty())(static_cast <bool> (SUnits.size() == ScheduledSUnits.size
() && TopReadySUs.empty()) ? void (0) : __assert_fail
("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 411, __extension__
__PRETTY_FUNCTION__))
;
412 for (SUnit* SU : SUnits) {
413 assert(SU->isScheduled &&(static_cast <bool> (SU->isScheduled && SU->
NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 414, __extension__
__PRETTY_FUNCTION__))
414 SU->NumPredsLeft == 0)(static_cast <bool> (SU->isScheduled && SU->
NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 414, __extension__
__PRETTY_FUNCTION__))
;
415 }
416#endif
417
418 Scheduled = true;
419}
420
421void SIScheduleBlock::undoSchedule() {
422 for (SUnit* SU : SUnits) {
423 SU->isScheduled = false;
424 for (SDep& Succ : SU->Succs) {
425 if (BC->isSUInBlock(Succ.getSUnit(), ID))
426 undoReleaseSucc(SU, &Succ);
427 }
428 }
429 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
430 ScheduledSUnits.clear();
431 Scheduled = false;
432}
433
434void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) {
435 SUnit *SuccSU = SuccEdge->getSUnit();
436
437 if (SuccEdge->isWeak()) {
438 ++SuccSU->WeakPredsLeft;
439 return;
440 }
441 ++SuccSU->NumPredsLeft;
442}
443
444void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) {
445 SUnit *SuccSU = SuccEdge->getSUnit();
446
447 if (SuccEdge->isWeak()) {
448 --SuccSU->WeakPredsLeft;
449 return;
450 }
451#ifndef NDEBUG
452 if (SuccSU->NumPredsLeft == 0) {
453 dbgs() << "*** Scheduling failed! ***\n";
454 DAG->dumpNode(*SuccSU);
455 dbgs() << " has been released too many times!\n";
456 llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 456)
;
457 }
458#endif
459
460 --SuccSU->NumPredsLeft;
461}
462
463/// Release Successors of the SU that are in the block or not.
464void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) {
465 for (SDep& Succ : SU->Succs) {
466 SUnit *SuccSU = Succ.getSUnit();
467
468 if (SuccSU->NodeNum >= DAG->SUnits.size())
469 continue;
470
471 if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock)
472 continue;
473
474 releaseSucc(SU, &Succ);
475 if (SuccSU->NumPredsLeft == 0 && InOrOutBlock)
476 TopReadySUs.push_back(SuccSU);
477 }
478}
479
480void SIScheduleBlock::nodeScheduled(SUnit *SU) {
481 // Is in TopReadySUs
482 assert (!SU->NumPredsLeft)(static_cast <bool> (!SU->NumPredsLeft) ? void (0) :
__assert_fail ("!SU->NumPredsLeft", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 482, __extension__ __PRETTY_FUNCTION__))
;
483 std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU);
484 if (I == TopReadySUs.end()) {
485 dbgs() << "Data Structure Bug in SI Scheduler\n";
486 llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 486)
;
487 }
488 TopReadySUs.erase(I);
489
490 releaseSuccessors(SU, true);
491 // Scheduling this node will trigger a wait,
492 // thus propagate to other instructions that they do not need to wait either.
493 if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]])
494 HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0);
495
496 if (DAG->IsLowLatencySU[SU->NodeNum]) {
497 for (SDep& Succ : SU->Succs) {
498 std::map<unsigned, unsigned>::iterator I =
499 NodeNum2Index.find(Succ.getSUnit()->NodeNum);
500 if (I != NodeNum2Index.end())
501 HasLowLatencyNonWaitedParent[I->second] = 1;
502 }
503 }
504 SU->isScheduled = true;
505}
506
507void SIScheduleBlock::finalizeUnits() {
508 // We remove links from outside blocks to enable scheduling inside the block.
509 for (SUnit* SU : SUnits) {
510 releaseSuccessors(SU, false);
511 if (DAG->IsHighLatencySU[SU->NodeNum])
512 HighLatencyBlock = true;
513 }
514 HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0);
515}
516
517// we maintain ascending order of IDs
518void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
519 unsigned PredID = Pred->getID();
520
521 // Check if not already predecessor.
522 for (SIScheduleBlock* P : Preds) {
523 if (PredID == P->getID())
524 return;
525 }
526 Preds.push_back(Pred);
527
528 assert(none_of(Succs,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
529 [=](std::pair<SIScheduleBlock*,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
530 SIScheduleBlockLinkKind> S) {(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
531 return PredID == S.first->getID();(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
532 }) &&(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
533 "Loop in the Block Graph!")(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock
*, SIScheduleBlockLinkKind> S) { return PredID == S.first->
getID(); }) && "Loop in the Block Graph!") ? void (0)
: __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 533, __extension__
__PRETTY_FUNCTION__))
;
534}
535
536void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
537 SIScheduleBlockLinkKind Kind) {
538 unsigned SuccID = Succ->getID();
539
540 // Check if not already predecessor.
541 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
542 if (SuccID == S.first->getID()) {
543 if (S.second == SIScheduleBlockLinkKind::NoData &&
544 Kind == SIScheduleBlockLinkKind::Data)
545 S.second = Kind;
546 return;
547 }
548 }
549 if (Succ->isHighLatencyBlock())
550 ++NumHighLatencySuccessors;
551 Succs.push_back(std::pair(Succ, Kind));
552
553 assert(none_of(Preds,(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
554 [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
555 "Loop in the Block Graph!")(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock
*P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!"
) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 555, __extension__
__PRETTY_FUNCTION__))
;
556}
557
558#ifndef NDEBUG
559void SIScheduleBlock::printDebug(bool full) {
560 dbgs() << "Block (" << ID << ")\n";
561 if (!full)
562 return;
563
564 dbgs() << "\nContains High Latency Instruction: "
565 << HighLatencyBlock << '\n';
566 dbgs() << "\nDepends On:\n";
567 for (SIScheduleBlock* P : Preds) {
568 P->printDebug(false);
569 }
570
571 dbgs() << "\nSuccessors:\n";
572 for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
573 if (S.second == SIScheduleBlockLinkKind::Data)
574 dbgs() << "(Data Dep) ";
575 S.first->printDebug(false);
576 }
577
578 if (Scheduled) {
579 dbgs() << "LiveInPressure "
580 << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
581 << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
582 dbgs() << "LiveOutPressure "
583 << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
584 << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
585 dbgs() << "LiveIns:\n";
586 for (unsigned Reg : LiveInRegs)
587 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
588
589 dbgs() << "\nLiveOuts:\n";
590 for (unsigned Reg : LiveOutRegs)
591 dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
592 }
593
594 dbgs() << "\nInstructions:\n";
595 for (const SUnit* SU : SUnits)
596 DAG->dumpNode(*SU);
597
598 dbgs() << "///////////////////////\n";
599}
600#endif
601
602// SIScheduleBlockCreator //
603
604SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG)
605 : DAG(DAG) {}
606
607SIScheduleBlocks
608SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) {
609 std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B =
610 Blocks.find(BlockVariant);
611 if (B == Blocks.end()) {
612 SIScheduleBlocks Res;
613 createBlocksForVariant(BlockVariant);
614 topologicalSort();
615 scheduleInsideBlocks();
616 fillStats();
617 Res.Blocks = CurrentBlocks;
618 Res.TopDownIndex2Block = TopDownIndex2Block;
619 Res.TopDownBlock2Index = TopDownBlock2Index;
620 Blocks[BlockVariant] = Res;
621 return Res;
622 } else {
623 return B->second;
624 }
625}
626
627bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) {
628 if (SU->NodeNum >= DAG->SUnits.size())
629 return false;
630 return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID;
631}
632
633void SIScheduleBlockCreator::colorHighLatenciesAlone() {
634 unsigned DAGSize = DAG->SUnits.size();
635
636 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
637 SUnit *SU = &DAG->SUnits[i];
638 if (DAG->IsHighLatencySU[SU->NodeNum]) {
639 CurrentColoring[SU->NodeNum] = NextReservedID++;
640 }
641 }
642}
643
644static bool
645hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
646 for (const auto &PredDep : SU.Preds) {
647 if (PredDep.getSUnit() == &FromSU &&
648 PredDep.getKind() == llvm::SDep::Data)
649 return true;
650 }
651 return false;
652}
653
654void SIScheduleBlockCreator::colorHighLatenciesGroups() {
655 unsigned DAGSize = DAG->SUnits.size();
656 unsigned NumHighLatencies = 0;
657 unsigned GroupSize;
658 int Color = NextReservedID;
659 unsigned Count = 0;
660 std::set<unsigned> FormingGroup;
661
662 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
663 SUnit *SU = &DAG->SUnits[i];
664 if (DAG->IsHighLatencySU[SU->NodeNum])
665 ++NumHighLatencies;
666 }
667
668 if (NumHighLatencies == 0)
669 return;
670
671 if (NumHighLatencies <= 6)
672 GroupSize = 2;
673 else if (NumHighLatencies <= 12)
674 GroupSize = 3;
675 else
676 GroupSize = 4;
677
678 for (unsigned SUNum : DAG->TopDownIndex2SU) {
679 const SUnit &SU = DAG->SUnits[SUNum];
680 if (DAG->IsHighLatencySU[SU.NodeNum]) {
681 unsigned CompatibleGroup = true;
682 int ProposedColor = Color;
683 std::vector<int> AdditionalElements;
684
685 // We don't want to put in the same block
686 // two high latency instructions that depend
687 // on each other.
688 // One way would be to check canAddEdge
689 // in both directions, but that currently is not
690 // enough because there the high latency order is
691 // enforced (via links).
692 // Instead, look at the dependencies between the
693 // high latency instructions and deduce if it is
694 // a data dependency or not.
695 for (unsigned j : FormingGroup) {
696 bool HasSubGraph;
697 std::vector<int> SubGraph;
698 // By construction (topological order), if SU and
699 // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessary
700 // in the parent graph of SU.
701#ifndef NDEBUG
702 SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
703 HasSubGraph);
704 assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail
("!HasSubGraph", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 704, __extension__ __PRETTY_FUNCTION__))
;
705#endif
706 SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
707 HasSubGraph);
708 if (!HasSubGraph)
709 continue; // No dependencies between each other
710 else if (SubGraph.size() > 5) {
711 // Too many elements would be required to be added to the block.
712 CompatibleGroup = false;
713 break;
714 }
715 else {
716 // Check the type of dependency
717 for (unsigned k : SubGraph) {
718 // If in the path to join the two instructions,
719 // there is another high latency instruction,
720 // or instructions colored for another block
721 // abort the merge.
722 if (DAG->IsHighLatencySU[k] ||
723 (CurrentColoring[k] != ProposedColor &&
724 CurrentColoring[k] != 0)) {
725 CompatibleGroup = false;
726 break;
727 }
728 // If one of the SU in the subgraph depends on the result of SU j,
729 // there'll be a data dependency.
730 if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
731 CompatibleGroup = false;
732 break;
733 }
734 }
735 if (!CompatibleGroup)
736 break;
737 // Same check for the SU
738 if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
739 CompatibleGroup = false;
740 break;
741 }
742 // Add all the required instructions to the block
743 // These cannot live in another block (because they
744 // depend (order dependency) on one of the
745 // instruction in the block, and are required for the
746 // high latency instruction we add.
747 llvm::append_range(AdditionalElements, SubGraph);
748 }
749 }
750 if (CompatibleGroup) {
751 FormingGroup.insert(SU.NodeNum);
752 for (unsigned j : AdditionalElements)
753 CurrentColoring[j] = ProposedColor;
754 CurrentColoring[SU.NodeNum] = ProposedColor;
755 ++Count;
756 }
757 // Found one incompatible instruction,
758 // or has filled a big enough group.
759 // -> start a new one.
760 if (!CompatibleGroup) {
761 FormingGroup.clear();
762 Color = ++NextReservedID;
763 ProposedColor = Color;
764 FormingGroup.insert(SU.NodeNum);
765 CurrentColoring[SU.NodeNum] = ProposedColor;
766 Count = 0;
767 } else if (Count == GroupSize) {
768 FormingGroup.clear();
769 Color = ++NextReservedID;
770 ProposedColor = Color;
Value stored to 'ProposedColor' is never read
771 Count = 0;
772 }
773 }
774 }
775}
776
// Attribute a "reserved dependency" color to every SU in two passes:
// top-down, SUs get the same color when they (transitively) depend on the
// same combination of reserved (already-colored) instructions; bottom-up,
// symmetrically for the instructions that depend on them. Results are
// stored in CurrentTopDownReservedDependencyColoring and
// CurrentBottomUpReservedDependencyColoring; color 0 means "no reserved
// dependency".
void SIScheduleBlockCreator::colorComputeReservedDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a set of parent (resp. child) colors to the single color
  // attributed to that combination.
  std::map<std::set<unsigned>, unsigned> ColorCombinations;

  CurrentTopDownReservedDependencyColoring.clear();
  CurrentBottomUpReservedDependencyColoring.clear();

  CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0);
  CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0);

  // Traverse TopDown, and give different colors to SUs depending
  // on which combination of High Latencies they depend on.

  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    // Collect the reserved-dependency colors of all strong predecessors
    // that belong to this scheduling region.
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0)
        SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]);
    }
    // Color 0 by default.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      // Combination of several parent colors: reuse the color already
      // given to this exact combination, or allocate a fresh one.
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }

  // Combinations are per-direction; start fresh for the bottom-up pass.
  ColorCombinations.clear();

  // Same as before, but BottomUp.

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;

    // Already given.
    if (CurrentColoring[SU->NodeNum]) {
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        CurrentColoring[SU->NodeNum];
      continue;
    }

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]);
    }
    // Keep color 0.
    if (SUColors.empty())
      continue;
    // Same color than parents.
    if (SUColors.size() == 1 && *SUColors.begin() > DAGSize)
      CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
        *SUColors.begin();
    else {
      std::map<std::set<unsigned>, unsigned>::iterator Pos =
        ColorCombinations.find(SUColors);
      if (Pos != ColorCombinations.end()) {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second;
      } else {
        CurrentBottomUpReservedDependencyColoring[SU->NodeNum] =
          NextNonReservedID;
        ColorCombinations[SUColors] = NextNonReservedID++;
      }
    }
  }
}
870
871void SIScheduleBlockCreator::colorAccordingToReservedDependencies() {
872 std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations;
873
874 // Every combination of colors given by the top down
875 // and bottom up Reserved node dependency
876
877 for (const SUnit &SU : DAG->SUnits) {
878 std::pair<unsigned, unsigned> SUColors;
879
880 // High latency instructions: already given.
881 if (CurrentColoring[SU.NodeNum])
882 continue;
883
884 SUColors.first = CurrentTopDownReservedDependencyColoring[SU.NodeNum];
885 SUColors.second = CurrentBottomUpReservedDependencyColoring[SU.NodeNum];
886
887 std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos =
888 ColorCombinations.find(SUColors);
889 if (Pos != ColorCombinations.end()) {
890 CurrentColoring[SU.NodeNum] = Pos->second;
891 } else {
892 CurrentColoring[SU.NodeNum] = NextNonReservedID;
893 ColorCombinations[SUColors] = NextNonReservedID++;
894 }
895 }
896}
897
// Post-process the coloring bottom-up: instructions that carry no
// reserved dependency at all are either merged into their unique
// child block or pulled out into a fresh color, so the "ends" of the
// dependency graph don't all collapse into one block.
void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
  unsigned DAGSize = DAG->SUnits.size();
  // Work on a copy so decisions in this pass see the original colors of
  // successors, not colors already rewritten during the same pass.
  std::vector<int> PendingColoring = CurrentColoring;

  assert(DAGSize >= 1 &&
         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
  // If there is no reserved block at all, do nothing. We don't want
  // everything in one block.
  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
                        CurrentTopDownReservedDependencyColoring.end()) == 0)
    return;

  for (unsigned SUNum : DAG->BottomUpIndex2SU) {
    SUnit *SU = &DAG->SUnits[SUNum];
    std::set<unsigned> SUColors;        // Current colors of reserved-dep successors.
    std::set<unsigned> SUColorsPending; // Pending colors of all successors.

    // Reserved colors (<= DAGSize) are left untouched.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // Only instructions with no reserved dependency in either direction
    // are candidates for this rewrite.
    if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 ||
        CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0)
      continue;

    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 ||
          CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0)
        SUColors.insert(CurrentColoring[Succ->NodeNum]);
      SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
    }
    // If there is only one child/parent block, and that block
    // is not among the ones we are removing in this path, then
    // merge the instruction to that block
    if (SUColors.size() == 1 && SUColorsPending.size() == 1)
      PendingColoring[SU->NodeNum] = *SUColors.begin();
    else // TODO: Attribute new colors depending on color
         // combination of children.
      PendingColoring[SU->NodeNum] = NextNonReservedID++;
  }
  // Commit the pass's decisions all at once.
  CurrentColoring = PendingColoring;
}
945
946
// Force every non-reserved color to form a single consecutive run in
// NodeNum order: when a color shows up again after its run already
// ended (another color was seen in between), the instruction is either
// attached to the immediately preceding instruction's (possibly
// rewritten) color, or given a brand new one.
void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() {
  unsigned DAGSize = DAG->SUnits.size();
  unsigned PreviousColor;
  std::set<unsigned> SeenColors; // Colors whose consecutive run has ended.

  if (DAGSize <= 1)
    return;

  PreviousColor = CurrentColoring[0];

  for (unsigned i = 1, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned CurrentColor = CurrentColoring[i];
    // Original color of the previous instruction, saved before updating
    // PreviousColor for the next iteration.
    unsigned PreviousColorSave = PreviousColor;
    assert(i == SU->NodeNum);

    // A color change closes the previous color's run.
    if (CurrentColor != PreviousColor)
      SeenColors.insert(PreviousColor);
    PreviousColor = CurrentColor;

    // Reserved colors (<= DAGSize) are never rewritten.
    if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
      continue;

    // The run of this color has not ended yet: nothing to fix.
    if (SeenColors.find(CurrentColor) == SeenColors.end())
      continue;

    // Color reappeared after its run ended.
    if (PreviousColorSave != CurrentColor)
      CurrentColoring[i] = NextNonReservedID++;
    else
      CurrentColoring[i] = CurrentColoring[i-1];
  }
}
979
980void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() {
981 unsigned DAGSize = DAG->SUnits.size();
982
983 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
984 SUnit *SU = &DAG->SUnits[SUNum];
985 std::set<unsigned> SUColors;
986
987 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
988 continue;
989
990 // No predecessor: Vgpr constant loading.
991 // Low latency instructions usually have a predecessor (the address)
992 if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum])
993 continue;
994
995 for (SDep& SuccDep : SU->Succs) {
996 SUnit *Succ = SuccDep.getSUnit();
997 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
998 continue;
999 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1000 }
1001 if (SUColors.size() == 1)
1002 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1003 }
1004}
1005
1006void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() {
1007 unsigned DAGSize = DAG->SUnits.size();
1008
1009 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1010 SUnit *SU = &DAG->SUnits[SUNum];
1011 std::set<unsigned> SUColors;
1012
1013 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1014 continue;
1015
1016 for (SDep& SuccDep : SU->Succs) {
1017 SUnit *Succ = SuccDep.getSUnit();
1018 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1019 continue;
1020 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1021 }
1022 if (SUColors.size() == 1)
1023 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1024 }
1025}
1026
1027void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() {
1028 unsigned DAGSize = DAG->SUnits.size();
1029
1030 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1031 SUnit *SU = &DAG->SUnits[SUNum];
1032 std::set<unsigned> SUColors;
1033
1034 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1035 continue;
1036
1037 for (SDep& SuccDep : SU->Succs) {
1038 SUnit *Succ = SuccDep.getSUnit();
1039 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1040 continue;
1041 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1042 }
1043 if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize)
1044 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1045 }
1046}
1047
1048void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
1049 unsigned DAGSize = DAG->SUnits.size();
1050 std::map<unsigned, unsigned> ColorCount;
1051
1052 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1053 SUnit *SU = &DAG->SUnits[SUNum];
1054 unsigned color = CurrentColoring[SU->NodeNum];
1055 ++ColorCount[color];
1056 }
1057
1058 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1059 SUnit *SU = &DAG->SUnits[SUNum];
1060 unsigned color = CurrentColoring[SU->NodeNum];
1061 std::set<unsigned> SUColors;
1062
1063 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1064 continue;
1065
1066 if (ColorCount[color] > 1)
1067 continue;
1068
1069 for (SDep& SuccDep : SU->Succs) {
1070 SUnit *Succ = SuccDep.getSUnit();
1071 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1072 continue;
1073 SUColors.insert(CurrentColoring[Succ->NodeNum]);
1074 }
1075 if (SUColors.size() == 1 && *SUColors.begin() != color) {
1076 --ColorCount[color];
1077 CurrentColoring[SU->NodeNum] = *SUColors.begin();
1078 ++ColorCount[*SUColors.begin()];
1079 }
1080 }
1081}
1082
// Split oversized blocks into smaller ones. Not implemented yet: this
// is currently a deliberate no-op placeholder in the block-creation
// pipeline.
void SIScheduleBlockCreator::cutHugeBlocks() {
  // TODO
}
1086
1087void SIScheduleBlockCreator::regroupNoUserInstructions() {
1088 unsigned DAGSize = DAG->SUnits.size();
1089 int GroupID = NextNonReservedID++;
1090
1091 for (unsigned SUNum : DAG->BottomUpIndex2SU) {
1092 SUnit *SU = &DAG->SUnits[SUNum];
1093 bool hasSuccessor = false;
1094
1095 if (CurrentColoring[SU->NodeNum] <= (int)DAGSize)
1096 continue;
1097
1098 for (SDep& SuccDep : SU->Succs) {
1099 SUnit *Succ = SuccDep.getSUnit();
1100 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1101 continue;
1102 hasSuccessor = true;
1103 }
1104 if (!hasSuccessor)
1105 CurrentColoring[SU->NodeNum] = GroupID;
1106 }
1107}
1108
// Group all export instructions into one dedicated color so that the
// resulting block is naturally scheduled last. If any export has a
// non-export successor inside the region, abandon the grouping
// entirely (the partial alternative is discussed below).
void SIScheduleBlockCreator::colorExports() {
  unsigned ExportColor = NextNonReservedID++;
  SmallVector<unsigned, 8> ExpGroup;

  // Put all exports together in a block.
  // The block will naturally end up being scheduled last,
  // thus putting exports at the end of the schedule, which
  // is better for performance.
  // However we must ensure, for safety, the exports can be put
  // together in the same block without any other instruction.
  // This could happen, for example, when scheduling after regalloc
  // if reloading a spilled register from memory using the same
  // register than used in a previous export.
  // If that happens, do not regroup the exports.
  for (unsigned SUNum : DAG->TopDownIndex2SU) {
    const SUnit &SU = DAG->SUnits[SUNum];
    if (SIInstrInfo::isEXP(*SU.getInstr())) {
      // SU is an export instruction. Check whether one of its successor
      // dependencies is a non-export, in which case we skip export grouping.
      for (const SDep &SuccDep : SU.Succs) {
        const SUnit *SuccSU = SuccDep.getSUnit();
        if (SuccDep.isWeak() || SuccSU->NodeNum >= DAG->SUnits.size()) {
          // Ignore these dependencies.
          continue;
        }
        assert(SuccSU->isInstr() &&
               "SUnit unexpectedly not representing an instruction!");

        if (!SIInstrInfo::isEXP(*SuccSU->getInstr())) {
          // A non-export depends on us. Skip export grouping.
          // Note that this is a bit pessimistic: We could still group all other
          // exports that are not depended on by non-exports, directly or
          // indirectly. Simply skipping this particular export but grouping all
          // others would not account for indirect dependencies.
          return;
        }
      }
      ExpGroup.push_back(SUNum);
    }
  }

  // The group can be formed. Give the color.
  for (unsigned j : ExpGroup)
    CurrentColoring[j] = ExportColor;
}
1154
// Run the full coloring pipeline for the requested variant, then turn
// the final coloring into SIScheduleBlocks: one block per color, with
// inter-block Pred/Succ edges derived from the SU dependency edges.
void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) {
  unsigned DAGSize = DAG->SUnits.size();
  // Maps a final color to the index of its block in CurrentBlocks.
  std::map<unsigned,unsigned> RealID;

  CurrentBlocks.clear();
  CurrentColoring.clear();
  CurrentColoring.resize(DAGSize, 0);
  Node2CurrentBlock.clear();

  // Restore links previous scheduling variant has overridden.
  DAG->restoreSULinksLeft();

  // Reserved IDs (1..DAGSize) are for high latency groups; everything
  // above DAGSize is a non-reserved color.
  NextReservedID = 1;
  NextNonReservedID = DAGSize + 1;

  LLVM_DEBUG(dbgs() << "Coloring the graph\n");

  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped)
    colorHighLatenciesGroups();
  else
    colorHighLatenciesAlone();
  colorComputeReservedDependencies();
  colorAccordingToReservedDependencies();
  colorEndsAccordingToDependencies();
  if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive)
    colorForceConsecutiveOrderInGroup();
  regroupNoUserInstructions();
  colorMergeConstantLoadsNextGroup();
  colorMergeIfPossibleNextGroupOnlyForReserved();
  colorExports();

  // Put SUs of same color into same block
  Node2CurrentBlock.resize(DAGSize, -1);
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    unsigned Color = CurrentColoring[SU->NodeNum];
    // First time this color is seen: create its block.
    if (RealID.find(Color) == RealID.end()) {
      int ID = CurrentBlocks.size();
      BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
      CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
      RealID[Color] = ID;
    }
    CurrentBlocks[RealID[Color]]->addUnit(SU);
    Node2CurrentBlock[SU->NodeNum] = RealID[Color];
  }

  // Build dependencies between blocks.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SUnit *SU = &DAG->SUnits[i];
    int SUID = Node2CurrentBlock[i];
    for (SDep& SuccDep : SU->Succs) {
      SUnit *Succ = SuccDep.getSUnit();
      if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
        continue;
      // Only cross-block edges become block successors; control
      // dependencies carry no data.
      if (Node2CurrentBlock[Succ->NodeNum] != SUID)
        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
                                     SuccDep.isCtrl() ? NoData : Data);
    }
    for (SDep& PredDep : SU->Preds) {
      SUnit *Pred = PredDep.getSUnit();
      if (PredDep.isWeak() || Pred->NodeNum >= DAGSize)
        continue;
      if (Node2CurrentBlock[Pred->NodeNum] != SUID)
        CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]);
    }
  }

  // Free root and leafs of all blocks to enable scheduling inside them.
  for (SIScheduleBlock *Block : CurrentBlocks)
    Block->finalizeUnits();
  LLVM_DEBUG({
    dbgs() << "Blocks created:\n\n";
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}
1231
1232// Two functions taken from Codegen/MachineScheduler.cpp
1233
1234/// Non-const version.
1235static MachineBasicBlock::iterator
1236nextIfDebug(MachineBasicBlock::iterator I,
1237 MachineBasicBlock::const_iterator End) {
1238 for (; I != End; ++I) {
1239 if (!I->isDebugInstr())
1240 break;
1241 }
1242 return I;
1243}
1244
// Topologically sort the blocks (Kahn's algorithm driven from the
// leaves), filling TopDownIndex2Block / TopDownBlock2Index and deriving
// BottomUpIndex2Block as the reverse ordering.
void SIScheduleBlockCreator::topologicalSort() {
  unsigned DAGSize = CurrentBlocks.size();
  std::vector<int> WorkList;

  LLVM_DEBUG(dbgs() << "Topological Sort\n");

  WorkList.reserve(DAGSize);
  TopDownIndex2Block.resize(DAGSize);
  TopDownBlock2Index.resize(DAGSize);
  BottomUpIndex2Block.resize(DAGSize);

  // TopDownBlock2Index temporarily holds the number of unprocessed
  // successors of each block; blocks with none seed the worklist.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    unsigned Degree = Block->getSuccs().size();
    TopDownBlock2Index[i] = Degree;
    if (Degree == 0) {
      WorkList.push_back(i);
    }
  }

  // Indices are handed out from the back (DAGSize-1 down to 0), since
  // we start from the blocks with no successors.
  int Id = DAGSize;
  while (!WorkList.empty()) {
    int i = WorkList.back();
    SIScheduleBlock *Block = CurrentBlocks[i];
    WorkList.pop_back();
    TopDownBlock2Index[i] = --Id;
    TopDownIndex2Block[Id] = i;
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      // A predecessor becomes ready once all its successors got an index.
      if (!--TopDownBlock2Index[Pred->getID()])
        WorkList.push_back(Pred->getID());
    }
  }

#ifndef NDEBUG
  // Check correctness of the ordering.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    for (SIScheduleBlock* Pred : Block->getPreds()) {
      assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&
             "Wrong Top Down topological sorting");
    }
  }
#endif

  // Bottom-up order is simply the top-down order reversed.
  BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(),
                                         TopDownIndex2Block.rend());
}
1292
// Schedule the instructions inside each block. A fast schedule is done
// first so every block corresponds to a contiguous range of
// instructions; the instruction stream and LiveIntervals are then
// temporarily reordered to match, the real per-block scheduling (with
// register usage tracking) is performed, and finally the original
// instruction positions are restored.
void SIScheduleBlockCreator::scheduleInsideBlocks() {
  unsigned DAGSize = CurrentBlocks.size();

  LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n");

  // We do schedule a valid scheduling such that a Block corresponds
  // to a range of instructions.
  LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n");
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    Block->fastSchedule();
  }

  // Note: the following code, and the part restoring previous position
  // is by far the most expensive operation of the Scheduler.

  // Do not update CurrentTop.
  MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop();
  // Old and new positions of every moved instruction, recorded so the
  // original order can be restored afterwards.
  std::vector<MachineBasicBlock::iterator> PosOld;
  std::vector<MachineBasicBlock::iterator> PosNew;
  PosOld.reserve(DAG->SUnits.size());
  PosNew.reserve(DAG->SUnits.size());

  // Move instructions block by block, in block topological order.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    int BlockIndice = TopDownIndex2Block[i];
    SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();

    for (SUnit* SU : SUs) {
      MachineInstr *MI = SU->getInstr();
      MachineBasicBlock::iterator Pos = MI;
      PosOld.push_back(Pos);
      if (&*CurrentTopFastSched == MI) {
        // Already in place: just advance the insertion point.
        PosNew.push_back(Pos);
        CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched,
                                          DAG->getCurrentBottom());
      } else {
        // Update the instruction stream.
        DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI);

        // Update LiveIntervals.
        // Note: Moving all instructions and calling handleMove every time
        // is the most cpu intensive operation of the scheduler.
        // It would gain a lot if there was a way to recompute the
        // LiveIntervals for the entire scheduling region.
        DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true);
        PosNew.push_back(CurrentTopFastSched);
      }
    }
  }

  // Now we have Block of SUs == Block of MI.
  // We do the final schedule for the instructions inside the block.
  // The property that all the SUs of the Block are grouped together as MI
  // is used for correct reg usage tracking.
  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
    SIScheduleBlock *Block = CurrentBlocks[i];
    std::vector<SUnit*> SUs = Block->getScheduledUnits();
    Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr());
  }

  LLVM_DEBUG(dbgs() << "Restoring MI Pos\n");
  // Restore old ordering (which prevents a LIS->handleMove bug).
  // Undo the moves in reverse recording order.
  for (unsigned i = PosOld.size(), e = 0; i != e; --i) {
    MachineBasicBlock::iterator POld = PosOld[i-1];
    MachineBasicBlock::iterator PNew = PosNew[i-1];
    if (PNew != POld) {
      // Update the instruction stream.
      DAG->getBB()->splice(POld, DAG->getBB(), PNew);

      // Update LiveIntervals.
      DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true);
    }
  }

  LLVM_DEBUG({
    for (SIScheduleBlock *Block : CurrentBlocks)
      Block->printDebug(true);
  });
}
1373
1374void SIScheduleBlockCreator::fillStats() {
1375 unsigned DAGSize = CurrentBlocks.size();
1376
1377 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1378 int BlockIndice = TopDownIndex2Block[i];
1379 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1380 if (Block->getPreds().empty())
1381 Block->Depth = 0;
1382 else {
1383 unsigned Depth = 0;
1384 for (SIScheduleBlock *Pred : Block->getPreds()) {
1385 if (Depth < Pred->Depth + Pred->getCost())
1386 Depth = Pred->Depth + Pred->getCost();
1387 }
1388 Block->Depth = Depth;
1389 }
1390 }
1391
1392 for (unsigned i = 0, e = DAGSize; i != e; ++i) {
1393 int BlockIndice = BottomUpIndex2Block[i];
1394 SIScheduleBlock *Block = CurrentBlocks[BlockIndice];
1395 if (Block->getSuccs().empty())
1396 Block->Height = 0;
1397 else {
1398 unsigned Height = 0;
1399 for (const auto &Succ : Block->getSuccs())
1400 Height = std::max(Height, Succ.first->Height + Succ.first->getCost());
1401 Block->Height = Height;
1402 }
1403 }
1404}
1405
1406// SIScheduleBlockScheduler //
1407
1408SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
1409 SISchedulerBlockSchedulerVariant Variant,
1410 SIScheduleBlocks BlocksStruct) :
1411 DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks),
1412 LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0),
1413 SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) {
1414
1415 // Fill the usage of every output
1416 // Warning: while by construction we always have a link between two blocks
1417 // when one needs a result from the other, the number of users of an output
1418 // is not the sum of child blocks having as input the same virtual register.
1419 // Here is an example. A produces x and y. B eats x and produces x'.
1420 // C eats x' and y. The register coalescer may have attributed the same
1421 // virtual register to x and x'.
1422 // To count accurately, we do a topological sort. In case the register is
1423 // found for several parents, we increment the usage of the one with the
1424 // highest topological index.
1425 LiveOutRegsNumUsages.resize(Blocks.size());
1426 for (SIScheduleBlock *Block : Blocks) {
1427 for (unsigned Reg : Block->getInRegs()) {
1428 bool Found = false;
1429 int topoInd = -1;
1430 for (SIScheduleBlock* Pred: Block->getPreds()) {
1431 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1432 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1433
1434 if (RegPos != PredOutRegs.end()) {
1435 Found = true;
1436 if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) {
1437 topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()];
1438 }
1439 }
1440 }
1441
1442 if (!Found)
1443 continue;
1444
1445 int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
1446 ++LiveOutRegsNumUsages[PredID][Reg];
1447 }
1448 }
1449
1450 LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0);
1451 BlockNumPredsLeft.resize(Blocks.size());
1452 BlockNumSuccsLeft.resize(Blocks.size());
1453
1454 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1455 SIScheduleBlock *Block = Blocks[i];
1456 BlockNumPredsLeft[i] = Block->getPreds().size();
1457 BlockNumSuccsLeft[i] = Block->getSuccs().size();
1458 }
1459
1460#ifndef NDEBUG
1461 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1462 SIScheduleBlock *Block = Blocks[i];
1463 assert(Block->getID() == i)(static_cast <bool> (Block->getID() == i) ? void (0)
: __assert_fail ("Block->getID() == i", "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp"
, 1463, __extension__ __PRETTY_FUNCTION__))
;
1464 }
1465#endif
1466
1467 std::set<unsigned> InRegs = DAG->getInRegs();
1468 addLiveRegs(InRegs);
1469
1470 // Increase LiveOutRegsNumUsages for blocks
1471 // producing registers consumed in another
1472 // scheduling region.
1473 for (unsigned Reg : DAG->getOutRegs()) {
1474 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1475 // Do reverse traversal
1476 int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
1477 SIScheduleBlock *Block = Blocks[ID];
1478 const std::set<unsigned> &OutRegs = Block->getOutRegs();
1479
1480 if (OutRegs.find(Reg) == OutRegs.end())
1481 continue;
1482
1483 ++LiveOutRegsNumUsages[ID][Reg];
1484 break;
1485 }
1486 }
1487
1488 // Fill LiveRegsConsumers for regs that were already
1489 // defined before scheduling.
1490 for (SIScheduleBlock *Block : Blocks) {
1491 for (unsigned Reg : Block->getInRegs()) {
1492 bool Found = false;
1493 for (SIScheduleBlock* Pred: Block->getPreds()) {
1494 std::set<unsigned> PredOutRegs = Pred->getOutRegs();
1495 std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg);
1496
1497 if (RegPos != PredOutRegs.end()) {
1498 Found = true;
1499 break;
1500 }
1501 }
1502
1503 if (!Found)
1504 ++LiveRegsConsumers[Reg];
1505 }
1506 }
1507
1508 for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
1509 SIScheduleBlock *Block = Blocks[i];
1510 if (BlockNumPredsLeft[i] == 0) {
1511 ReadyBlocks.push_back(Block);
1512 }
1513 }
1514
1515 while (SIScheduleBlock *Block = pickBlock()) {
1516 BlocksScheduled.push_back(Block);
1517 blockScheduled(Block);
1518 }
1519
1520 LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1521 : BlocksScheduled) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1522 dbgs() << ' ' << Block->getID();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
1523 } dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Block Order:"; for (
SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' '
<< Block->getID(); } dbgs() << '\n';; } } while
(false)
;
1524}
1525
1526bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand,
1527 SIBlockSchedCandidate &TryCand) {
1528 if (!Cand.isValid()) {
1529 TryCand.Reason = NodeOrder;
1530 return true;
1531 }
1532
1533 // Try to hide high latencies.
1534 if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled,
1535 Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency))
1536 return true;
1537 // Schedule high latencies early so you can hide them better.
1538 if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency,
1539 TryCand, Cand, Latency))
1540 return true;
1541 if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height,
1542 TryCand, Cand, Depth))
1543 return true;
1544 if (SISched::tryGreater(TryCand.NumHighLatencySuccessors,
1545 Cand.NumHighLatencySuccessors,
1546 TryCand, Cand, Successor))
1547 return true;
1548 return false;
1549}
1550
1551bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
1552 SIBlockSchedCandidate &TryCand) {
1553 if (!Cand.isValid()) {
1554 TryCand.Reason = NodeOrder;
1555 return true;
1556 }
1557
1558 if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0,
1559 TryCand, Cand, RegUsage))
1560 return true;
1561 if (SISched::tryGreater(TryCand.NumSuccessors > 0,
1562 Cand.NumSuccessors > 0,
1563 TryCand, Cand, Successor))
1564 return true;
1565 if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth))
1566 return true;
1567 if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff,
1568 TryCand, Cand, RegUsage))
1569 return true;
1570 return false;
1571}
1572
1573SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
1574 SIBlockSchedCandidate Cand;
1575 std::vector<SIScheduleBlock*>::iterator Best;
1576 SIScheduleBlock *Block;
1577 if (ReadyBlocks.empty())
1578 return nullptr;
1579
1580 DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(),
1581 VregCurrentUsage, SregCurrentUsage);
1582 if (VregCurrentUsage > maxVregUsage)
1583 maxVregUsage = VregCurrentUsage;
1584 if (SregCurrentUsage > maxSregUsage)
1585 maxSregUsage = SregCurrentUsage;
1586 LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1587 for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1588 : ReadyBlocks) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1589 << Block->getID() << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1590 dbgs() << "\nCurrent Live:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1591 for (unsigned Regdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1592 : LiveRegs) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1593 << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1594 dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1595 dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
1596 dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking New Blocks\n"
; dbgs() << "Available: "; for (SIScheduleBlock *Block :
ReadyBlocks) dbgs() << Block->getID() << ' ';
dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs
) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) <<
' '; dbgs() << '\n'; dbgs() << "Current VGPRs: "
<< VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: "
<< SregCurrentUsage << '\n';; } } while (false)
;
1597
1598 Cand.Block = nullptr;
1599 for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(),
1600 E = ReadyBlocks.end(); I != E; ++I) {
1601 SIBlockSchedCandidate TryCand;
1602 TryCand.Block = *I;
1603 TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
1604 TryCand.VGPRUsageDiff =
1605 checkRegUsageImpact(TryCand.Block->getInRegs(),
1606 TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
1607 TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
1608 TryCand.NumHighLatencySuccessors =
1609 TryCand.Block->getNumHighLatencySuccessors();
1610 TryCand.LastPosHighLatParentScheduled =
1611 (unsigned int) std::max<int> (0,
1612 LastPosHighLatencyParentScheduled[TryCand.Block->getID()] -
1613 LastPosWaitedHighLatency);
1614 TryCand.Height = TryCand.Block->Height;
1615 // Try not to increase VGPR usage too much, else we may spill.
1616 if (VregCurrentUsage > 120 ||
1617 Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) {
1618 if (!tryCandidateRegUsage(Cand, TryCand) &&
1619 Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage)
1620 tryCandidateLatency(Cand, TryCand);
1621 } else {
1622 if (!tryCandidateLatency(Cand, TryCand))
1623 tryCandidateRegUsage(Cand, TryCand);
1624 }
1625 if (TryCand.Reason != NoCand) {
1626 Cand.setBest(TryCand);
1627 Best = I;
1628 LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' 'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Best Current Choice: "
<< Cand.Block->getID() << ' ' << getReasonStr
(Cand.Reason) << '\n'; } } while (false)
1629 << getReasonStr(Cand.Reason) << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Best Current Choice: "
<< Cand.Block->getID() << ' ' << getReasonStr
(Cand.Reason) << '\n'; } } while (false)
;
1630 }
1631 }
1632
1633 LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1634 dbgs() << "Is a block with high latency instruction: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1635 << (Cand.IsHighLatency ? "yes\n" : "no\n");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1636 dbgs() << "Position of last high latency dependency: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1637 << Cand.LastPosHighLatParentScheduled << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1638 dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
1639 dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Picking: " <<
Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: "
<< (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() <<
"Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled
<< '\n'; dbgs() << "VGPRUsageDiff: " << Cand
.VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while
(false)
;
1640
1641 Block = Cand.Block;
1642 ReadyBlocks.erase(Best);
1643 return Block;
1644}
1645
1646// Tracking of currently alive registers to determine VGPR Usage.
1647
1648void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
1649 for (Register Reg : Regs) {
1650 // For now only track virtual registers.
1651 if (!Reg.isVirtual())
1652 continue;
1653 // If not already in the live set, then add it.
1654 (void) LiveRegs.insert(Reg);
1655 }
1656}
1657
1658void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
1659 std::set<unsigned> &Regs) {
1660 for (unsigned Reg : Regs) {
1661 // For now only track virtual registers.
1662 std::set<unsigned>::iterator Pos = LiveRegs.find(Reg);
1663 assert (Pos != LiveRegs.end() && // Reg must be live.(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
1664 LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
1665 LiveRegsConsumers[Reg] >= 1)(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers
.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers
[Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1665, __extension__
__PRETTY_FUNCTION__))
;
1666 --LiveRegsConsumers[Reg];
1667 if (LiveRegsConsumers[Reg] == 0)
1668 LiveRegs.erase(Pos);
1669 }
1670}
1671
1672void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
1673 for (const auto &Block : Parent->getSuccs()) {
1674 if (--BlockNumPredsLeft[Block.first->getID()] == 0)
1675 ReadyBlocks.push_back(Block.first);
1676
1677 if (Parent->isHighLatencyBlock() &&
1678 Block.second == SIScheduleBlockLinkKind::Data)
1679 LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
1680 }
1681}
1682
1683void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
1684 decreaseLiveRegs(Block, Block->getInRegs());
1685 addLiveRegs(Block->getOutRegs());
1686 releaseBlockSuccs(Block);
1687 for (const auto &RegP : LiveOutRegsNumUsages[Block->getID()]) {
1688 // We produce this register, thus it must not be previously alive.
1689 assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||(static_cast <bool> (LiveRegsConsumers.find(RegP.first)
== LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] ==
0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1690, __extension__
__PRETTY_FUNCTION__))
1690 LiveRegsConsumers[RegP.first] == 0)(static_cast <bool> (LiveRegsConsumers.find(RegP.first)
== LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] ==
0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0"
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1690, __extension__
__PRETTY_FUNCTION__))
;
1691 LiveRegsConsumers[RegP.first] += RegP.second;
1692 }
1693 if (LastPosHighLatencyParentScheduled[Block->getID()] >
1694 (unsigned)LastPosWaitedHighLatency)
1695 LastPosWaitedHighLatency =
1696 LastPosHighLatencyParentScheduled[Block->getID()];
1697 ++NumBlockScheduled;
1698}
1699
1700std::vector<int>
1701SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
1702 std::set<unsigned> &OutRegs) {
1703 std::vector<int> DiffSetPressure;
1704 DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0);
1705
1706 for (Register Reg : InRegs) {
1707 // For now only track virtual registers.
1708 if (!Reg.isVirtual())
1709 continue;
1710 if (LiveRegsConsumers[Reg] > 1)
1711 continue;
1712 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1713 for (; PSetI.isValid(); ++PSetI) {
1714 DiffSetPressure[*PSetI] -= PSetI.getWeight();
1715 }
1716 }
1717
1718 for (Register Reg : OutRegs) {
1719 // For now only track virtual registers.
1720 if (!Reg.isVirtual())
1721 continue;
1722 PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
1723 for (; PSetI.isValid(); ++PSetI) {
1724 DiffSetPressure[*PSetI] += PSetI.getWeight();
1725 }
1726 }
1727
1728 return DiffSetPressure;
1729}
1730
1731// SIScheduler //
1732
1733struct SIScheduleBlockResult
1734SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
1735 SISchedulerBlockSchedulerVariant ScheduleVariant) {
1736 SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant);
1737 SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks);
1738 std::vector<SIScheduleBlock*> ScheduledBlocks;
1739 struct SIScheduleBlockResult Res;
1740
1741 ScheduledBlocks = Scheduler.getBlocks();
1742
1743 for (SIScheduleBlock *Block : ScheduledBlocks) {
1744 std::vector<SUnit*> SUs = Block->getScheduledUnits();
1745
1746 for (SUnit* SU : SUs)
1747 Res.SUs.push_back(SU->NodeNum);
1748 }
1749
1750 Res.MaxSGPRUsage = Scheduler.getSGPRUsage();
1751 Res.MaxVGPRUsage = Scheduler.getVGPRUsage();
1752 return Res;
1753}
1754
1755// SIScheduleDAGMI //
1756
1757SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
1758 ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
1759 SITII = static_cast<const SIInstrInfo*>(TII);
1760 SITRI = static_cast<const SIRegisterInfo*>(TRI);
1761}
1762
1763SIScheduleDAGMI::~SIScheduleDAGMI() = default;
1764
1765// Code adapted from scheduleDAG.cpp
1766// Does a topological sort over the SUs.
1767// Both TopDown and BottomUp
1768void SIScheduleDAGMI::topologicalSort() {
1769 Topo.InitDAGTopologicalSorting();
1770
1771 TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end());
1772 BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend());
1773}
1774
1775// Move low latencies further from their user without
1776// increasing SGPR usage (in general)
1777// This is to be replaced by a better pass that would
1778// take into account SGPR usage (based on VGPR Usage
1779// and the corresponding wavefront count), that would
1780// try to merge groups of loads if it make sense, etc
1781void SIScheduleDAGMI::moveLowLatencies() {
1782 unsigned DAGSize = SUnits.size();
1783 int LastLowLatencyUser = -1;
1784 int LastLowLatencyPos = -1;
1785
1786 for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) {
1787 SUnit *SU = &SUnits[ScheduledSUnits[i]];
1788 bool IsLowLatencyUser = false;
1789 unsigned MinPos = 0;
1790
1791 for (SDep& PredDep : SU->Preds) {
1792 SUnit *Pred = PredDep.getSUnit();
1793 if (SITII->isLowLatencyInstruction(*Pred->getInstr())) {
1794 IsLowLatencyUser = true;
1795 }
1796 if (Pred->NodeNum >= DAGSize)
1797 continue;
1798 unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum];
1799 if (PredPos >= MinPos)
1800 MinPos = PredPos + 1;
1801 }
1802
1803 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1804 unsigned BestPos = LastLowLatencyUser + 1;
1805 if ((int)BestPos <= LastLowLatencyPos)
1806 BestPos = LastLowLatencyPos + 1;
1807 if (BestPos < MinPos)
1808 BestPos = MinPos;
1809 if (BestPos < i) {
1810 for (unsigned u = i; u > BestPos; --u) {
1811 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1812 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1813 }
1814 ScheduledSUnits[BestPos] = SU->NodeNum;
1815 ScheduledSUnitsInv[SU->NodeNum] = BestPos;
1816 }
1817 LastLowLatencyPos = BestPos;
1818 if (IsLowLatencyUser)
1819 LastLowLatencyUser = BestPos;
1820 } else if (IsLowLatencyUser) {
1821 LastLowLatencyUser = i;
1822 // Moves COPY instructions on which depends
1823 // the low latency instructions too.
1824 } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) {
1825 bool CopyForLowLat = false;
1826 for (SDep& SuccDep : SU->Succs) {
1827 SUnit *Succ = SuccDep.getSUnit();
1828 if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
1829 continue;
1830 if (SITII->isLowLatencyInstruction(*Succ->getInstr())) {
1831 CopyForLowLat = true;
1832 }
1833 }
1834 if (!CopyForLowLat)
1835 continue;
1836 if (MinPos < i) {
1837 for (unsigned u = i; u > MinPos; --u) {
1838 ++ScheduledSUnitsInv[ScheduledSUnits[u-1]];
1839 ScheduledSUnits[u] = ScheduledSUnits[u-1];
1840 }
1841 ScheduledSUnits[MinPos] = SU->NodeNum;
1842 ScheduledSUnitsInv[SU->NodeNum] = MinPos;
1843 }
1844 }
1845 }
1846}
1847
1848void SIScheduleDAGMI::restoreSULinksLeft() {
1849 for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1850 SUnits[i].isScheduled = false;
1851 SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft;
1852 SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft;
1853 SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft;
1854 SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft;
1855 }
1856}
1857
1858// Return the Vgpr and Sgpr usage corresponding to some virtual registers.
1859template<typename _Iterator> void
1860SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
1861 unsigned &VgprUsage, unsigned &SgprUsage) {
1862 VgprUsage = 0;
1863 SgprUsage = 0;
1864 for (_Iterator RegI = First; RegI != End; ++RegI) {
1865 Register Reg = *RegI;
1866 // For now only track virtual registers
1867 if (!Reg.isVirtual())
1868 continue;
1869 PSetIterator PSetI = MRI.getPressureSets(Reg);
1870 for (; PSetI.isValid(); ++PSetI) {
1871 if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
1872 VgprUsage += PSetI.getWeight();
1873 else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
1874 SgprUsage += PSetI.getWeight();
1875 }
1876 }
1877}
1878
1879void SIScheduleDAGMI::schedule()
1880{
1881 SmallVector<SUnit*, 8> TopRoots, BotRoots;
1882 SIScheduleBlockResult Best, Temp;
1883 LLVM_DEBUG(dbgs() << "Preparing Scheduling\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Preparing Scheduling\n"
; } } while (false)
;
1884
1885 buildDAGWithRegPressure();
1886 postProcessDAG();
1887
1888 LLVM_DEBUG(dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dump(); } } while (false)
;
1889 if (PrintDAGs)
1890 dump();
1891 if (ViewMISchedDAGs)
1892 viewGraph();
1893
1894 topologicalSort();
1895 findRootsAndBiasEdges(TopRoots, BotRoots);
1896 // We reuse several ScheduleDAGMI and ScheduleDAGMILive
1897 // functions, but to make them happy we must initialize
1898 // the default Scheduler implementation (even if we do not
1899 // run it)
1900 SchedImpl->initialize(this);
1901 initQueues(TopRoots, BotRoots);
1902
1903 // Fill some stats to help scheduling.
1904
1905 SUnitsLinksBackup = SUnits;
1906 IsLowLatencySU.clear();
1907 LowLatencyOffset.clear();
1908 IsHighLatencySU.clear();
1909
1910 IsLowLatencySU.resize(SUnits.size(), 0);
1911 LowLatencyOffset.resize(SUnits.size(), 0);
1912 IsHighLatencySU.resize(SUnits.size(), 0);
1913
1914 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1915 SUnit *SU = &SUnits[i];
1916 const MachineOperand *BaseLatOp;
1917 int64_t OffLatReg;
1918 if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
1919 IsLowLatencySU[i] = 1;
1920 bool OffsetIsScalable;
1921 if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
1922 OffsetIsScalable, TRI))
1923 LowLatencyOffset[i] = OffLatReg;
1924 } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
1925 IsHighLatencySU[i] = 1;
1926 }
1927
1928 SIScheduler Scheduler(this);
1929 Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone,
1930 SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage);
1931
1932 // if VGPR usage is extremely high, try other good performing variants
1933 // which could lead to lower VGPR usage
1934 if (Best.MaxVGPRUsage > 180) {
1935 static const std::pair<SISchedulerBlockCreatorVariant,
1936 SISchedulerBlockSchedulerVariant>
1937 Variants[] = {
1938 { LatenciesAlone, BlockRegUsageLatency },
1939// { LatenciesAlone, BlockRegUsage },
1940 { LatenciesGrouped, BlockLatencyRegUsage },
1941// { LatenciesGrouped, BlockRegUsageLatency },
1942// { LatenciesGrouped, BlockRegUsage },
1943 { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1944// { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1945// { LatenciesAlonePlusConsecutive, BlockRegUsage }
1946 };
1947 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1948 Temp = Scheduler.scheduleVariant(v.first, v.second);
1949 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1950 Best = Temp;
1951 }
1952 }
1953 // if VGPR usage is still extremely high, we may spill. Try other variants
1954 // which are less performing, but that could lead to lower VGPR usage.
1955 if (Best.MaxVGPRUsage > 200) {
1956 static const std::pair<SISchedulerBlockCreatorVariant,
1957 SISchedulerBlockSchedulerVariant>
1958 Variants[] = {
1959// { LatenciesAlone, BlockRegUsageLatency },
1960 { LatenciesAlone, BlockRegUsage },
1961// { LatenciesGrouped, BlockLatencyRegUsage },
1962 { LatenciesGrouped, BlockRegUsageLatency },
1963 { LatenciesGrouped, BlockRegUsage },
1964// { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage },
1965 { LatenciesAlonePlusConsecutive, BlockRegUsageLatency },
1966 { LatenciesAlonePlusConsecutive, BlockRegUsage }
1967 };
1968 for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) {
1969 Temp = Scheduler.scheduleVariant(v.first, v.second);
1970 if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage)
1971 Best = Temp;
1972 }
1973 }
1974
1975 ScheduledSUnits = Best.SUs;
1976 ScheduledSUnitsInv.resize(SUnits.size());
1977
1978 for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) {
1979 ScheduledSUnitsInv[ScheduledSUnits[i]] = i;
1980 }
1981
1982 moveLowLatencies();
1983
1984 // Tell the outside world about the result of the scheduling.
1985
1986 assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker")(static_cast <bool> (TopRPTracker.getPos() == RegionBegin
&& "bad initial Top tracker") ? void (0) : __assert_fail
("TopRPTracker.getPos() == RegionBegin && \"bad initial Top tracker\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1986, __extension__
__PRETTY_FUNCTION__))
;
1987 TopRPTracker.setPos(CurrentTop);
1988
1989 for (unsigned I : ScheduledSUnits) {
1990 SUnit *SU = &SUnits[I];
1991
1992 scheduleMI(SU, true);
1993
1994 LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Scheduling SU(" <<
SU->NodeNum << ") " << *SU->getInstr(); } }
while (false)
1995 << *SU->getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { dbgs() << "Scheduling SU(" <<
SU->NodeNum << ") " << *SU->getInstr(); } }
while (false)
;
1996 }
1997
1998 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.")(static_cast <bool> (CurrentTop == CurrentBottom &&
"Nonempty unscheduled zone.") ? void (0) : __assert_fail ("CurrentTop == CurrentBottom && \"Nonempty unscheduled zone.\""
, "llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp", 1998, __extension__
__PRETTY_FUNCTION__))
;
1999
2000 placeDebugValues();
2001
2002 LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2003 dbgs() << "*** Final schedule for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2004 << printMBBReference(*begin()->getParent()) << " ***\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2005 dumpSchedule();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2006 dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
2007 })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("machine-scheduler")) { { dbgs() << "*** Final schedule for "
<< printMBBReference(*begin()->getParent()) <<
" ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while
(false)
;
2008}