File: | lib/Target/AMDGPU/SIMachineScheduler.cpp |
Warning: | line 1679, column 3 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// | |||||||||
2 | // | |||||||||
3 | // The LLVM Compiler Infrastructure | |||||||||
4 | // | |||||||||
5 | // This file is distributed under the University of Illinois Open Source | |||||||||
6 | // License. See LICENSE.TXT for details. | |||||||||
7 | // | |||||||||
8 | //===----------------------------------------------------------------------===// | |||||||||
9 | // | |||||||||
10 | /// \file | |||||||||
11 | /// \brief SI Machine Scheduler interface | |||||||||
12 | // | |||||||||
13 | //===----------------------------------------------------------------------===// | |||||||||
14 | ||||||||||
15 | #include "SIMachineScheduler.h" | |||||||||
16 | #include "AMDGPU.h" | |||||||||
17 | #include "SIInstrInfo.h" | |||||||||
18 | #include "SIRegisterInfo.h" | |||||||||
19 | #include "llvm/ADT/STLExtras.h" | |||||||||
20 | #include "llvm/ADT/SmallVector.h" | |||||||||
21 | #include "llvm/CodeGen/LiveInterval.h" | |||||||||
22 | #include "llvm/CodeGen/LiveIntervals.h" | |||||||||
23 | #include "llvm/CodeGen/MachineInstr.h" | |||||||||
24 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |||||||||
25 | #include "llvm/CodeGen/MachineScheduler.h" | |||||||||
26 | #include "llvm/CodeGen/RegisterPressure.h" | |||||||||
27 | #include "llvm/CodeGen/SlotIndexes.h" | |||||||||
28 | #include "llvm/CodeGen/TargetRegisterInfo.h" | |||||||||
29 | #include "llvm/Support/Debug.h" | |||||||||
30 | #include "llvm/Support/ErrorHandling.h" | |||||||||
31 | #include "llvm/Support/raw_ostream.h" | |||||||||
32 | #include <algorithm> | |||||||||
33 | #include <cassert> | |||||||||
34 | #include <map> | |||||||||
35 | #include <set> | |||||||||
36 | #include <utility> | |||||||||
37 | #include <vector> | |||||||||
38 | ||||||||||
39 | using namespace llvm; | |||||||||
40 | ||||||||||
41 | #define DEBUG_TYPE"machine-scheduler" "machine-scheduler" | |||||||||
42 | ||||||||||
43 | // This scheduler implements a different scheduling algorithm than | |||||||||
44 | // GenericScheduler. | |||||||||
45 | // | |||||||||
46 | // There are several specific architecture behaviours that can't be modelled | |||||||||
47 | // for GenericScheduler: | |||||||||
48 | // . When accessing the result of an SGPR load instruction, you have to wait | |||||||||
49 | // for all the SGPR load instructions before your current instruction to | |||||||||
50 | // have finished. | |||||||||
51 | // . When accessing the result of an VGPR load instruction, you have to wait | |||||||||
52 | // for all the VGPR load instructions previous to the VGPR load instruction | |||||||||
53 | // you are interested in to finish. | |||||||||
54 | // . The less the register pressure, the best load latencies are hidden | |||||||||
55 | // | |||||||||
56 | // Moreover some specifities (like the fact a lot of instructions in the shader | |||||||||
57 | // have few dependencies) makes the generic scheduler have some unpredictable | |||||||||
58 | // behaviours. For example when register pressure becomes high, it can either | |||||||||
59 | // manage to prevent register pressure from going too high, or it can | |||||||||
60 | // increase register pressure even more than if it hadn't taken register | |||||||||
61 | // pressure into account. | |||||||||
62 | // | |||||||||
63 | // Also some other bad behaviours are generated, like loading at the beginning | |||||||||
64 | // of the shader a constant in VGPR you won't need until the end of the shader. | |||||||||
65 | // | |||||||||
66 | // The scheduling problem for SI can distinguish three main parts: | |||||||||
67 | // . Hiding high latencies (texture sampling, etc) | |||||||||
68 | // . Hiding low latencies (SGPR constant loading, etc) | |||||||||
69 | // . Keeping register usage low for better latency hiding and general | |||||||||
70 | // performance | |||||||||
71 | // | |||||||||
72 | // Some other things can also affect performance, but are hard to predict | |||||||||
73 | // (cache usage, the fact the HW can issue several instructions from different | |||||||||
74 | // wavefronts if different types, etc) | |||||||||
75 | // | |||||||||
76 | // This scheduler tries to solve the scheduling problem by dividing it into | |||||||||
77 | // simpler sub-problems. It divides the instructions into blocks, schedules | |||||||||
78 | // locally inside the blocks where it takes care of low latencies, and then | |||||||||
79 | // chooses the order of the blocks by taking care of high latencies. | |||||||||
80 | // Dividing the instructions into blocks helps control keeping register | |||||||||
81 | // usage low. | |||||||||
82 | // | |||||||||
83 | // First the instructions are put into blocks. | |||||||||
84 | // We want the blocks help control register usage and hide high latencies | |||||||||
85 | // later. To help control register usage, we typically want all local | |||||||||
86 | // computations, when for example you create a result that can be comsummed | |||||||||
87 | // right away, to be contained in a block. Block inputs and outputs would | |||||||||
88 | // typically be important results that are needed in several locations of | |||||||||
89 | // the shader. Since we do want blocks to help hide high latencies, we want | |||||||||
90 | // the instructions inside the block to have a minimal set of dependencies | |||||||||
91 | // on high latencies. It will make it easy to pick blocks to hide specific | |||||||||
92 | // high latencies. | |||||||||
93 | // The block creation algorithm is divided into several steps, and several | |||||||||
94 | // variants can be tried during the scheduling process. | |||||||||
95 | // | |||||||||
96 | // Second the order of the instructions inside the blocks is chosen. | |||||||||
97 | // At that step we do take into account only register usage and hiding | |||||||||
98 | // low latency instructions | |||||||||
99 | // | |||||||||
100 | // Third the block order is chosen, there we try to hide high latencies | |||||||||
101 | // and keep register usage low. | |||||||||
102 | // | |||||||||
103 | // After the third step, a pass is done to improve the hiding of low | |||||||||
104 | // latencies. | |||||||||
105 | // | |||||||||
106 | // Actually when talking about 'low latency' or 'high latency' it includes | |||||||||
107 | // both the latency to get the cache (or global mem) data go to the register, | |||||||||
108 | // and the bandwidth limitations. | |||||||||
109 | // Increasing the number of active wavefronts helps hide the former, but it | |||||||||
110 | // doesn't solve the latter, thus why even if wavefront count is high, we have | |||||||||
111 | // to try have as many instructions hiding high latencies as possible. | |||||||||
112 | // The OpenCL doc says for example latency of 400 cycles for a global mem access, | |||||||||
113 | // which is hidden by 10 instructions if the wavefront count is 10. | |||||||||
114 | ||||||||||
115 | // Some figures taken from AMD docs: | |||||||||
116 | // Both texture and constant L1 caches are 4-way associative with 64 bytes | |||||||||
117 | // lines. | |||||||||
118 | // Constant cache is shared with 4 CUs. | |||||||||
119 | // For texture sampling, the address generation unit receives 4 texture | |||||||||
120 | // addresses per cycle, thus we could expect texture sampling latency to be | |||||||||
121 | // equivalent to 4 instructions in the very best case (a VGPR is 64 work items, | |||||||||
122 | // instructions in a wavefront group are executed every 4 cycles), | |||||||||
123 | // or 16 instructions if the other wavefronts associated to the 3 other VALUs | |||||||||
124 | // of the CU do texture sampling too. (Don't take these figures too seriously, | |||||||||
125 | // as I'm not 100% sure of the computation) | |||||||||
126 | // Data exports should get similar latency. | |||||||||
127 | // For constant loading, the cache is shader with 4 CUs. | |||||||||
128 | // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" | |||||||||
129 | // I guess if the other CU don't read the cache, it can go up to 64B/cycle. | |||||||||
130 | // It means a simple s_buffer_load should take one instruction to hide, as | |||||||||
131 | // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same | |||||||||
132 | // cache line. | |||||||||
133 | // | |||||||||
134 | // As of today the driver doesn't preload the constants in cache, thus the | |||||||||
135 | // first loads get extra latency. The doc says global memory access can be | |||||||||
136 | // 300-600 cycles. We do not specially take that into account when scheduling | |||||||||
137 | // As we expect the driver to be able to preload the constants soon. | |||||||||
138 | ||||||||||
139 | // common code // | |||||||||
140 | ||||||||||
141 | #ifndef NDEBUG | |||||||||
142 | ||||||||||
143 | static const char *getReasonStr(SIScheduleCandReason Reason) { | |||||||||
144 | switch (Reason) { | |||||||||
145 | case NoCand: return "NOCAND"; | |||||||||
146 | case RegUsage: return "REGUSAGE"; | |||||||||
147 | case Latency: return "LATENCY"; | |||||||||
148 | case Successor: return "SUCCESSOR"; | |||||||||
149 | case Depth: return "DEPTH"; | |||||||||
150 | case NodeOrder: return "ORDER"; | |||||||||
151 | } | |||||||||
152 | llvm_unreachable("Unknown reason!")::llvm::llvm_unreachable_internal("Unknown reason!", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 152); | |||||||||
153 | } | |||||||||
154 | ||||||||||
155 | #endif | |||||||||
156 | ||||||||||
157 | static bool tryLess(int TryVal, int CandVal, | |||||||||
158 | SISchedulerCandidate &TryCand, | |||||||||
159 | SISchedulerCandidate &Cand, | |||||||||
160 | SIScheduleCandReason Reason) { | |||||||||
161 | if (TryVal < CandVal) { | |||||||||
162 | TryCand.Reason = Reason; | |||||||||
163 | return true; | |||||||||
164 | } | |||||||||
165 | if (TryVal > CandVal) { | |||||||||
166 | if (Cand.Reason > Reason) | |||||||||
167 | Cand.Reason = Reason; | |||||||||
168 | return true; | |||||||||
169 | } | |||||||||
170 | Cand.setRepeat(Reason); | |||||||||
171 | return false; | |||||||||
172 | } | |||||||||
173 | ||||||||||
174 | static bool tryGreater(int TryVal, int CandVal, | |||||||||
175 | SISchedulerCandidate &TryCand, | |||||||||
176 | SISchedulerCandidate &Cand, | |||||||||
177 | SIScheduleCandReason Reason) { | |||||||||
178 | if (TryVal > CandVal) { | |||||||||
179 | TryCand.Reason = Reason; | |||||||||
180 | return true; | |||||||||
181 | } | |||||||||
182 | if (TryVal < CandVal) { | |||||||||
183 | if (Cand.Reason > Reason) | |||||||||
184 | Cand.Reason = Reason; | |||||||||
185 | return true; | |||||||||
186 | } | |||||||||
187 | Cand.setRepeat(Reason); | |||||||||
188 | return false; | |||||||||
189 | } | |||||||||
190 | ||||||||||
191 | // SIScheduleBlock // | |||||||||
192 | ||||||||||
193 | void SIScheduleBlock::addUnit(SUnit *SU) { | |||||||||
194 | NodeNum2Index[SU->NodeNum] = SUnits.size(); | |||||||||
195 | SUnits.push_back(SU); | |||||||||
196 | } | |||||||||
197 | ||||||||||
198 | #ifndef NDEBUG | |||||||||
199 | void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { | |||||||||
200 | ||||||||||
201 | dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); | |||||||||
202 | dbgs() << '\n'; | |||||||||
203 | } | |||||||||
204 | #endif | |||||||||
205 | ||||||||||
206 | void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, | |||||||||
207 | SISchedCandidate &TryCand) { | |||||||||
208 | // Initialize the candidate if needed. | |||||||||
209 | if (!Cand.isValid()) { | |||||||||
210 | TryCand.Reason = NodeOrder; | |||||||||
211 | return; | |||||||||
212 | } | |||||||||
213 | ||||||||||
214 | if (Cand.SGPRUsage > 60 && | |||||||||
215 | tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, TryCand, Cand, RegUsage)) | |||||||||
216 | return; | |||||||||
217 | ||||||||||
218 | // Schedule low latency instructions as top as possible. | |||||||||
219 | // Order of priority is: | |||||||||
220 | // . Low latency instructions which do not depend on other low latency | |||||||||
221 | // instructions we haven't waited for | |||||||||
222 | // . Other instructions which do not depend on low latency instructions | |||||||||
223 | // we haven't waited for | |||||||||
224 | // . Low latencies | |||||||||
225 | // . All other instructions | |||||||||
226 | // Goal is to get: low latency instructions - independent instructions | |||||||||
227 | // - (eventually some more low latency instructions) | |||||||||
228 | // - instructions that depend on the first low latency instructions. | |||||||||
229 | // If in the block there is a lot of constant loads, the SGPR usage | |||||||||
230 | // could go quite high, thus above the arbitrary limit of 60 will encourage | |||||||||
231 | // use the already loaded constants (in order to release some SGPRs) before | |||||||||
232 | // loading more. | |||||||||
233 | if (tryLess(TryCand.HasLowLatencyNonWaitedParent, | |||||||||
234 | Cand.HasLowLatencyNonWaitedParent, | |||||||||
235 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||||||||
236 | return; | |||||||||
237 | ||||||||||
238 | if (tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, | |||||||||
239 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||||||||
240 | return; | |||||||||
241 | ||||||||||
242 | if (TryCand.IsLowLatency && | |||||||||
243 | tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, | |||||||||
244 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||||||||
245 | return; | |||||||||
246 | ||||||||||
247 | if (tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, TryCand, Cand, RegUsage)) | |||||||||
248 | return; | |||||||||
249 | ||||||||||
250 | // Fall through to original instruction order. | |||||||||
251 | if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { | |||||||||
252 | TryCand.Reason = NodeOrder; | |||||||||
253 | } | |||||||||
254 | } | |||||||||
255 | ||||||||||
256 | SUnit* SIScheduleBlock::pickNode() { | |||||||||
257 | SISchedCandidate TopCand; | |||||||||
258 | ||||||||||
259 | for (SUnit* SU : TopReadySUs) { | |||||||||
260 | SISchedCandidate TryCand; | |||||||||
261 | std::vector<unsigned> pressure; | |||||||||
262 | std::vector<unsigned> MaxPressure; | |||||||||
263 | // Predict register usage after this instruction. | |||||||||
264 | TryCand.SU = SU; | |||||||||
265 | TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); | |||||||||
266 | TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()]; | |||||||||
267 | TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()]; | |||||||||
268 | TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; | |||||||||
269 | TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; | |||||||||
270 | TryCand.HasLowLatencyNonWaitedParent = | |||||||||
271 | HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; | |||||||||
272 | tryCandidateTopDown(TopCand, TryCand); | |||||||||
273 | if (TryCand.Reason != NoCand) | |||||||||
274 | TopCand.setBest(TryCand); | |||||||||
275 | } | |||||||||
276 | ||||||||||
277 | return TopCand.SU; | |||||||||
278 | } | |||||||||
279 | ||||||||||
280 | ||||||||||
281 | // Schedule something valid. | |||||||||
282 | void SIScheduleBlock::fastSchedule() { | |||||||||
283 | TopReadySUs.clear(); | |||||||||
284 | if (Scheduled) | |||||||||
285 | undoSchedule(); | |||||||||
286 | ||||||||||
287 | for (SUnit* SU : SUnits) { | |||||||||
288 | if (!SU->NumPredsLeft) | |||||||||
289 | TopReadySUs.push_back(SU); | |||||||||
290 | } | |||||||||
291 | ||||||||||
292 | while (!TopReadySUs.empty()) { | |||||||||
293 | SUnit *SU = TopReadySUs[0]; | |||||||||
294 | ScheduledSUnits.push_back(SU); | |||||||||
295 | nodeScheduled(SU); | |||||||||
296 | } | |||||||||
297 | ||||||||||
298 | Scheduled = true; | |||||||||
299 | } | |||||||||
300 | ||||||||||
301 | // Returns if the register was set between first and last. | |||||||||
302 | static bool isDefBetween(unsigned Reg, | |||||||||
303 | SlotIndex First, SlotIndex Last, | |||||||||
304 | const MachineRegisterInfo *MRI, | |||||||||
305 | const LiveIntervals *LIS) { | |||||||||
306 | for (MachineRegisterInfo::def_instr_iterator | |||||||||
307 | UI = MRI->def_instr_begin(Reg), | |||||||||
308 | UE = MRI->def_instr_end(); UI != UE; ++UI) { | |||||||||
309 | const MachineInstr* MI = &*UI; | |||||||||
310 | if (MI->isDebugValue()) | |||||||||
311 | continue; | |||||||||
312 | SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); | |||||||||
313 | if (InstSlot >= First && InstSlot <= Last) | |||||||||
314 | return true; | |||||||||
315 | } | |||||||||
316 | return false; | |||||||||
317 | } | |||||||||
318 | ||||||||||
319 | void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, | |||||||||
320 | MachineBasicBlock::iterator EndBlock) { | |||||||||
321 | IntervalPressure Pressure, BotPressure; | |||||||||
322 | RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); | |||||||||
323 | LiveIntervals *LIS = DAG->getLIS(); | |||||||||
324 | MachineRegisterInfo *MRI = DAG->getMRI(); | |||||||||
325 | DAG->initRPTracker(TopRPTracker); | |||||||||
326 | DAG->initRPTracker(BotRPTracker); | |||||||||
327 | DAG->initRPTracker(RPTracker); | |||||||||
328 | ||||||||||
329 | // Goes though all SU. RPTracker captures what had to be alive for the SUs | |||||||||
330 | // to execute, and what is still alive at the end. | |||||||||
331 | for (SUnit* SU : ScheduledSUnits) { | |||||||||
332 | RPTracker.setPos(SU->getInstr()); | |||||||||
333 | RPTracker.advance(); | |||||||||
334 | } | |||||||||
335 | ||||||||||
336 | // Close the RPTracker to finalize live ins/outs. | |||||||||
337 | RPTracker.closeRegion(); | |||||||||
338 | ||||||||||
339 | // Initialize the live ins and live outs. | |||||||||
340 | TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); | |||||||||
341 | BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); | |||||||||
342 | ||||||||||
343 | // Do not Track Physical Registers, because it messes up. | |||||||||
344 | for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { | |||||||||
345 | if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit)) | |||||||||
346 | LiveInRegs.insert(RegMaskPair.RegUnit); | |||||||||
347 | } | |||||||||
348 | LiveOutRegs.clear(); | |||||||||
349 | // There is several possibilities to distinguish: | |||||||||
350 | // 1) Reg is not input to any instruction in the block, but is output of one | |||||||||
351 | // 2) 1) + read in the block and not needed after it | |||||||||
352 | // 3) 1) + read in the block but needed in another block | |||||||||
353 | // 4) Reg is input of an instruction but another block will read it too | |||||||||
354 | // 5) Reg is input of an instruction and then rewritten in the block. | |||||||||
355 | // result is not read in the block (implies used in another block) | |||||||||
356 | // 6) Reg is input of an instruction and then rewritten in the block. | |||||||||
357 | // result is read in the block and not needed in another block | |||||||||
358 | // 7) Reg is input of an instruction and then rewritten in the block. | |||||||||
359 | // result is read in the block but also needed in another block | |||||||||
360 | // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 | |||||||||
361 | // We want LiveOutRegs to contain only Regs whose content will be read after | |||||||||
362 | // in another block, and whose content was written in the current block, | |||||||||
363 | // that is we want it to get 1, 3, 5, 7 | |||||||||
364 | // Since we made the MIs of a block to be packed all together before | |||||||||
365 | // scheduling, then the LiveIntervals were correct, and the RPTracker was | |||||||||
366 | // able to correctly handle 5 vs 6, 2 vs 3. | |||||||||
367 | // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) | |||||||||
368 | // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 | |||||||||
369 | // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 | |||||||||
370 | // The use of findDefBetween removes the case 4. | |||||||||
371 | for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { | |||||||||
372 | unsigned Reg = RegMaskPair.RegUnit; | |||||||||
373 | if (TargetRegisterInfo::isVirtualRegister(Reg) && | |||||||||
374 | isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), | |||||||||
375 | LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, | |||||||||
376 | LIS)) { | |||||||||
377 | LiveOutRegs.insert(Reg); | |||||||||
378 | } | |||||||||
379 | } | |||||||||
380 | ||||||||||
381 | // Pressure = sum_alive_registers register size | |||||||||
382 | // Internally llvm will represent some registers as big 128 bits registers | |||||||||
383 | // for example, but they actually correspond to 4 actual 32 bits registers. | |||||||||
384 | // Thus Pressure is not equal to num_alive_registers * constant. | |||||||||
385 | LiveInPressure = TopPressure.MaxSetPressure; | |||||||||
386 | LiveOutPressure = BotPressure.MaxSetPressure; | |||||||||
387 | ||||||||||
388 | // Prepares TopRPTracker for top down scheduling. | |||||||||
389 | TopRPTracker.closeTop(); | |||||||||
390 | } | |||||||||
391 | ||||||||||
392 | void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, | |||||||||
393 | MachineBasicBlock::iterator EndBlock) { | |||||||||
394 | if (!Scheduled) | |||||||||
395 | fastSchedule(); | |||||||||
396 | ||||||||||
397 | // PreScheduling phase to set LiveIn and LiveOut. | |||||||||
398 | initRegPressure(BeginBlock, EndBlock); | |||||||||
399 | undoSchedule(); | |||||||||
400 | ||||||||||
401 | // Schedule for real now. | |||||||||
402 | ||||||||||
403 | TopReadySUs.clear(); | |||||||||
404 | ||||||||||
405 | for (SUnit* SU : SUnits) { | |||||||||
406 | if (!SU->NumPredsLeft) | |||||||||
407 | TopReadySUs.push_back(SU); | |||||||||
408 | } | |||||||||
409 | ||||||||||
410 | while (!TopReadySUs.empty()) { | |||||||||
411 | SUnit *SU = pickNode(); | |||||||||
412 | ScheduledSUnits.push_back(SU); | |||||||||
413 | TopRPTracker.setPos(SU->getInstr()); | |||||||||
414 | TopRPTracker.advance(); | |||||||||
415 | nodeScheduled(SU); | |||||||||
416 | } | |||||||||
417 | ||||||||||
418 | // TODO: compute InternalAdditionnalPressure. | |||||||||
419 | InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); | |||||||||
420 | ||||||||||
421 | // Check everything is right. | |||||||||
422 | #ifndef NDEBUG | |||||||||
423 | assert(SUnits.size() == ScheduledSUnits.size() &&(static_cast <bool> (SUnits.size() == ScheduledSUnits.size () && TopReadySUs.empty()) ? void (0) : __assert_fail ("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 424, __extension__ __PRETTY_FUNCTION__)) | |||||||||
424 | TopReadySUs.empty())(static_cast <bool> (SUnits.size() == ScheduledSUnits.size () && TopReadySUs.empty()) ? void (0) : __assert_fail ("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 424, __extension__ __PRETTY_FUNCTION__)); | |||||||||
425 | for (SUnit* SU : SUnits) { | |||||||||
426 | assert(SU->isScheduled &&(static_cast <bool> (SU->isScheduled && SU-> NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 427, __extension__ __PRETTY_FUNCTION__)) | |||||||||
427 | SU->NumPredsLeft == 0)(static_cast <bool> (SU->isScheduled && SU-> NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 427, __extension__ __PRETTY_FUNCTION__)); | |||||||||
428 | } | |||||||||
429 | #endif | |||||||||
430 | ||||||||||
431 | Scheduled = true; | |||||||||
432 | } | |||||||||
433 | ||||||||||
434 | void SIScheduleBlock::undoSchedule() { | |||||||||
435 | for (SUnit* SU : SUnits) { | |||||||||
436 | SU->isScheduled = false; | |||||||||
437 | for (SDep& Succ : SU->Succs) { | |||||||||
438 | if (BC->isSUInBlock(Succ.getSUnit(), ID)) | |||||||||
439 | undoReleaseSucc(SU, &Succ); | |||||||||
440 | } | |||||||||
441 | } | |||||||||
442 | HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); | |||||||||
443 | ScheduledSUnits.clear(); | |||||||||
444 | Scheduled = false; | |||||||||
445 | } | |||||||||
446 | ||||||||||
447 | void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { | |||||||||
448 | SUnit *SuccSU = SuccEdge->getSUnit(); | |||||||||
449 | ||||||||||
450 | if (SuccEdge->isWeak()) { | |||||||||
451 | ++SuccSU->WeakPredsLeft; | |||||||||
452 | return; | |||||||||
453 | } | |||||||||
454 | ++SuccSU->NumPredsLeft; | |||||||||
455 | } | |||||||||
456 | ||||||||||
457 | void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { | |||||||||
458 | SUnit *SuccSU = SuccEdge->getSUnit(); | |||||||||
459 | ||||||||||
460 | if (SuccEdge->isWeak()) { | |||||||||
461 | --SuccSU->WeakPredsLeft; | |||||||||
462 | return; | |||||||||
463 | } | |||||||||
464 | #ifndef NDEBUG | |||||||||
465 | if (SuccSU->NumPredsLeft == 0) { | |||||||||
466 | dbgs() << "*** Scheduling failed! ***\n"; | |||||||||
467 | SuccSU->dump(DAG); | |||||||||
468 | dbgs() << " has been released too many times!\n"; | |||||||||
469 | llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 469); | |||||||||
470 | } | |||||||||
471 | #endif | |||||||||
472 | ||||||||||
473 | --SuccSU->NumPredsLeft; | |||||||||
474 | } | |||||||||
475 | ||||||||||
476 | /// Release Successors of the SU that are in the block or not. | |||||||||
477 | void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { | |||||||||
478 | for (SDep& Succ : SU->Succs) { | |||||||||
479 | SUnit *SuccSU = Succ.getSUnit(); | |||||||||
480 | ||||||||||
481 | if (SuccSU->NodeNum >= DAG->SUnits.size()) | |||||||||
482 | continue; | |||||||||
483 | ||||||||||
484 | if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) | |||||||||
485 | continue; | |||||||||
486 | ||||||||||
487 | releaseSucc(SU, &Succ); | |||||||||
488 | if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) | |||||||||
489 | TopReadySUs.push_back(SuccSU); | |||||||||
490 | } | |||||||||
491 | } | |||||||||
492 | ||||||||||
493 | void SIScheduleBlock::nodeScheduled(SUnit *SU) { | |||||||||
494 | // Is in TopReadySUs | |||||||||
495 | assert (!SU->NumPredsLeft)(static_cast <bool> (!SU->NumPredsLeft) ? void (0) : __assert_fail ("!SU->NumPredsLeft", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 495, __extension__ __PRETTY_FUNCTION__)); | |||||||||
496 | std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); | |||||||||
497 | if (I == TopReadySUs.end()) { | |||||||||
498 | dbgs() << "Data Structure Bug in SI Scheduler\n"; | |||||||||
499 | llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 499); | |||||||||
500 | } | |||||||||
501 | TopReadySUs.erase(I); | |||||||||
502 | ||||||||||
503 | releaseSuccessors(SU, true); | |||||||||
504 | // Scheduling this node will trigger a wait, | |||||||||
505 | // thus propagate to other instructions that they do not need to wait either. | |||||||||
506 | if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) | |||||||||
507 | HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); | |||||||||
508 | ||||||||||
509 | if (DAG->IsLowLatencySU[SU->NodeNum]) { | |||||||||
510 | for (SDep& Succ : SU->Succs) { | |||||||||
511 | std::map<unsigned, unsigned>::iterator I = | |||||||||
512 | NodeNum2Index.find(Succ.getSUnit()->NodeNum); | |||||||||
513 | if (I != NodeNum2Index.end()) | |||||||||
514 | HasLowLatencyNonWaitedParent[I->second] = 1; | |||||||||
515 | } | |||||||||
516 | } | |||||||||
517 | SU->isScheduled = true; | |||||||||
518 | } | |||||||||
519 | ||||||||||
520 | void SIScheduleBlock::finalizeUnits() { | |||||||||
521 | // We remove links from outside blocks to enable scheduling inside the block. | |||||||||
522 | for (SUnit* SU : SUnits) { | |||||||||
523 | releaseSuccessors(SU, false); | |||||||||
524 | if (DAG->IsHighLatencySU[SU->NodeNum]) | |||||||||
525 | HighLatencyBlock = true; | |||||||||
526 | } | |||||||||
527 | HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); | |||||||||
528 | } | |||||||||
529 | ||||||||||
530 | // we maintain ascending order of IDs | |||||||||
531 | void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { | |||||||||
532 | unsigned PredID = Pred->getID(); | |||||||||
533 | ||||||||||
534 | // Check if not already predecessor. | |||||||||
535 | for (SIScheduleBlock* P : Preds) { | |||||||||
536 | if (PredID == P->getID()) | |||||||||
537 | return; | |||||||||
538 | } | |||||||||
539 | Preds.push_back(Pred); | |||||||||
540 | ||||||||||
541 | assert(none_of(Succs,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)) | |||||||||
542 | [=](std::pair<SIScheduleBlock*,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)) | |||||||||
543 | SIScheduleBlockLinkKind> S) {(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)) | |||||||||
544 | return PredID == S.first->getID();(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)) | |||||||||
545 | }) &&(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)) | |||||||||
546 | "Loop in the Block Graph!")(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 546, __extension__ __PRETTY_FUNCTION__)); | |||||||||
547 | } | |||||||||
548 | ||||||||||
549 | void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, | |||||||||
550 | SIScheduleBlockLinkKind Kind) { | |||||||||
551 | unsigned SuccID = Succ->getID(); | |||||||||
552 | ||||||||||
553 | // Check if not already predecessor. | |||||||||
554 | for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { | |||||||||
555 | if (SuccID == S.first->getID()) { | |||||||||
556 | if (S.second == SIScheduleBlockLinkKind::NoData && | |||||||||
557 | Kind == SIScheduleBlockLinkKind::Data) | |||||||||
558 | S.second = Kind; | |||||||||
559 | return; | |||||||||
560 | } | |||||||||
561 | } | |||||||||
562 | if (Succ->isHighLatencyBlock()) | |||||||||
563 | ++NumHighLatencySuccessors; | |||||||||
564 | Succs.push_back(std::make_pair(Succ, Kind)); | |||||||||
565 | ||||||||||
566 | assert(none_of(Preds,(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 568, __extension__ __PRETTY_FUNCTION__)) | |||||||||
567 | [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 568, __extension__ __PRETTY_FUNCTION__)) | |||||||||
568 | "Loop in the Block Graph!")(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 568, __extension__ __PRETTY_FUNCTION__)); | |||||||||
569 | } | |||||||||
570 | ||||||||||
571 | #ifndef NDEBUG | |||||||||
572 | void SIScheduleBlock::printDebug(bool full) { | |||||||||
573 | dbgs() << "Block (" << ID << ")\n"; | |||||||||
574 | if (!full) | |||||||||
575 | return; | |||||||||
576 | ||||||||||
577 | dbgs() << "\nContains High Latency Instruction: " | |||||||||
578 | << HighLatencyBlock << '\n'; | |||||||||
579 | dbgs() << "\nDepends On:\n"; | |||||||||
580 | for (SIScheduleBlock* P : Preds) { | |||||||||
581 | P->printDebug(false); | |||||||||
582 | } | |||||||||
583 | ||||||||||
584 | dbgs() << "\nSuccessors:\n"; | |||||||||
585 | for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { | |||||||||
586 | if (S.second == SIScheduleBlockLinkKind::Data) | |||||||||
587 | dbgs() << "(Data Dep) "; | |||||||||
588 | S.first->printDebug(false); | |||||||||
589 | } | |||||||||
590 | ||||||||||
591 | if (Scheduled) { | |||||||||
592 | dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' ' | |||||||||
593 | << LiveInPressure[DAG->getVGPRSetID()] << '\n'; | |||||||||
594 | dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' ' | |||||||||
595 | << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n"; | |||||||||
596 | dbgs() << "LiveIns:\n"; | |||||||||
597 | for (unsigned Reg : LiveInRegs) | |||||||||
598 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; | |||||||||
599 | ||||||||||
600 | dbgs() << "\nLiveOuts:\n"; | |||||||||
601 | for (unsigned Reg : LiveOutRegs) | |||||||||
602 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; | |||||||||
603 | } | |||||||||
604 | ||||||||||
605 | dbgs() << "\nInstructions:\n"; | |||||||||
606 | if (!Scheduled) { | |||||||||
607 | for (SUnit* SU : SUnits) { | |||||||||
608 | SU->dump(DAG); | |||||||||
609 | } | |||||||||
610 | } else { | |||||||||
611 | for (SUnit* SU : SUnits) { | |||||||||
612 | SU->dump(DAG); | |||||||||
613 | } | |||||||||
614 | } | |||||||||
615 | ||||||||||
616 | dbgs() << "///////////////////////\n"; | |||||||||
617 | } | |||||||||
618 | #endif | |||||||||
619 | ||||||||||
620 | // SIScheduleBlockCreator // | |||||||||
621 | ||||||||||
622 | SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) : | |||||||||
623 | DAG(DAG) { | |||||||||
624 | } | |||||||||
625 | ||||||||||
626 | SIScheduleBlockCreator::~SIScheduleBlockCreator() = default; | |||||||||
627 | ||||||||||
628 | SIScheduleBlocks | |||||||||
629 | SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { | |||||||||
630 | std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = | |||||||||
631 | Blocks.find(BlockVariant); | |||||||||
632 | if (B == Blocks.end()) { | |||||||||
633 | SIScheduleBlocks Res; | |||||||||
634 | createBlocksForVariant(BlockVariant); | |||||||||
635 | topologicalSort(); | |||||||||
636 | scheduleInsideBlocks(); | |||||||||
637 | fillStats(); | |||||||||
638 | Res.Blocks = CurrentBlocks; | |||||||||
639 | Res.TopDownIndex2Block = TopDownIndex2Block; | |||||||||
640 | Res.TopDownBlock2Index = TopDownBlock2Index; | |||||||||
641 | Blocks[BlockVariant] = Res; | |||||||||
642 | return Res; | |||||||||
643 | } else { | |||||||||
644 | return B->second; | |||||||||
645 | } | |||||||||
646 | } | |||||||||
647 | ||||||||||
648 | bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { | |||||||||
649 | if (SU->NodeNum >= DAG->SUnits.size()) | |||||||||
650 | return false; | |||||||||
651 | return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; | |||||||||
652 | } | |||||||||
653 | ||||||||||
654 | void SIScheduleBlockCreator::colorHighLatenciesAlone() { | |||||||||
655 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
656 | ||||||||||
657 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
658 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
659 | if (DAG->IsHighLatencySU[SU->NodeNum]) { | |||||||||
660 | CurrentColoring[SU->NodeNum] = NextReservedID++; | |||||||||
661 | } | |||||||||
662 | } | |||||||||
663 | } | |||||||||
664 | ||||||||||
665 | static bool | |||||||||
666 | hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { | |||||||||
667 | for (const auto &PredDep : SU.Preds) { | |||||||||
668 | if (PredDep.getSUnit() == &FromSU && | |||||||||
669 | PredDep.getKind() == llvm::SDep::Data) | |||||||||
670 | return true; | |||||||||
671 | } | |||||||||
672 | return false; | |||||||||
673 | } | |||||||||
674 | ||||||||||
675 | void SIScheduleBlockCreator::colorHighLatenciesGroups() { | |||||||||
676 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
677 | unsigned NumHighLatencies = 0; | |||||||||
678 | unsigned GroupSize; | |||||||||
679 | int Color = NextReservedID; | |||||||||
680 | unsigned Count = 0; | |||||||||
681 | std::set<unsigned> FormingGroup; | |||||||||
682 | ||||||||||
683 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
684 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
685 | if (DAG->IsHighLatencySU[SU->NodeNum]) | |||||||||
686 | ++NumHighLatencies; | |||||||||
687 | } | |||||||||
688 | ||||||||||
689 | if (NumHighLatencies == 0) | |||||||||
690 | return; | |||||||||
691 | ||||||||||
692 | if (NumHighLatencies <= 6) | |||||||||
693 | GroupSize = 2; | |||||||||
694 | else if (NumHighLatencies <= 12) | |||||||||
695 | GroupSize = 3; | |||||||||
696 | else | |||||||||
697 | GroupSize = 4; | |||||||||
698 | ||||||||||
699 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||||||||
700 | const SUnit &SU = DAG->SUnits[SUNum]; | |||||||||
701 | if (DAG->IsHighLatencySU[SU.NodeNum]) { | |||||||||
702 | unsigned CompatibleGroup = true; | |||||||||
703 | int ProposedColor = Color; | |||||||||
704 | std::vector<int> AdditionalElements; | |||||||||
705 | ||||||||||
706 | // We don't want to put in the same block | |||||||||
707 | // two high latency instructions that depend | |||||||||
708 | // on each other. | |||||||||
709 | // One way would be to check canAddEdge | |||||||||
710 | // in both directions, but that currently is not | |||||||||
711 | // enough because there the high latency order is | |||||||||
712 | // enforced (via links). | |||||||||
713 | // Instead, look at the dependencies between the | |||||||||
714 | // high latency instructions and deduce if it is | |||||||||
715 | // a data dependency or not. | |||||||||
716 | for (unsigned j : FormingGroup) { | |||||||||
717 | bool HasSubGraph; | |||||||||
718 | std::vector<int> SubGraph; | |||||||||
719 | // By construction (topological order), if SU and | |||||||||
720 | // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary | |||||||||
721 | // in the parent graph of SU. | |||||||||
722 | #ifndef NDEBUG | |||||||||
723 | SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], | |||||||||
724 | HasSubGraph); | |||||||||
725 | assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail ("!HasSubGraph", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 725, __extension__ __PRETTY_FUNCTION__)); | |||||||||
726 | #endif | |||||||||
727 | SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, | |||||||||
728 | HasSubGraph); | |||||||||
729 | if (!HasSubGraph) | |||||||||
730 | continue; // No dependencies between each other | |||||||||
731 | else if (SubGraph.size() > 5) { | |||||||||
732 | // Too many elements would be required to be added to the block. | |||||||||
733 | CompatibleGroup = false; | |||||||||
734 | break; | |||||||||
735 | } | |||||||||
736 | else { | |||||||||
737 | // Check the type of dependency | |||||||||
738 | for (unsigned k : SubGraph) { | |||||||||
739 | // If in the path to join the two instructions, | |||||||||
740 | // there is another high latency instruction, | |||||||||
741 | // or instructions colored for another block | |||||||||
742 | // abort the merge. | |||||||||
743 | if (DAG->IsHighLatencySU[k] || | |||||||||
744 | (CurrentColoring[k] != ProposedColor && | |||||||||
745 | CurrentColoring[k] != 0)) { | |||||||||
746 | CompatibleGroup = false; | |||||||||
747 | break; | |||||||||
748 | } | |||||||||
749 | // If one of the SU in the subgraph depends on the result of SU j, | |||||||||
750 | // there'll be a data dependency. | |||||||||
751 | if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { | |||||||||
752 | CompatibleGroup = false; | |||||||||
753 | break; | |||||||||
754 | } | |||||||||
755 | } | |||||||||
756 | if (!CompatibleGroup) | |||||||||
757 | break; | |||||||||
758 | // Same check for the SU | |||||||||
759 | if (hasDataDependencyPred(SU, DAG->SUnits[j])) { | |||||||||
760 | CompatibleGroup = false; | |||||||||
761 | break; | |||||||||
762 | } | |||||||||
763 | // Add all the required instructions to the block | |||||||||
764 | // These cannot live in another block (because they | |||||||||
765 | // depend (order dependency) on one of the | |||||||||
766 | // instruction in the block, and are required for the | |||||||||
767 | // high latency instruction we add. | |||||||||
768 | AdditionalElements.insert(AdditionalElements.end(), | |||||||||
769 | SubGraph.begin(), SubGraph.end()); | |||||||||
770 | } | |||||||||
771 | } | |||||||||
772 | if (CompatibleGroup) { | |||||||||
773 | FormingGroup.insert(SU.NodeNum); | |||||||||
774 | for (unsigned j : AdditionalElements) | |||||||||
775 | CurrentColoring[j] = ProposedColor; | |||||||||
776 | CurrentColoring[SU.NodeNum] = ProposedColor; | |||||||||
777 | ++Count; | |||||||||
778 | } | |||||||||
779 | // Found one incompatible instruction, | |||||||||
780 | // or has filled a big enough group. | |||||||||
781 | // -> start a new one. | |||||||||
782 | if (!CompatibleGroup) { | |||||||||
783 | FormingGroup.clear(); | |||||||||
784 | Color = ++NextReservedID; | |||||||||
785 | ProposedColor = Color; | |||||||||
786 | FormingGroup.insert(SU.NodeNum); | |||||||||
787 | CurrentColoring[SU.NodeNum] = ProposedColor; | |||||||||
788 | Count = 0; | |||||||||
789 | } else if (Count == GroupSize) { | |||||||||
790 | FormingGroup.clear(); | |||||||||
791 | Color = ++NextReservedID; | |||||||||
792 | ProposedColor = Color; | |||||||||
793 | Count = 0; | |||||||||
794 | } | |||||||||
795 | } | |||||||||
796 | } | |||||||||
797 | } | |||||||||
798 | ||||||||||
799 | void SIScheduleBlockCreator::colorComputeReservedDependencies() { | |||||||||
800 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
801 | std::map<std::set<unsigned>, unsigned> ColorCombinations; | |||||||||
802 | ||||||||||
803 | CurrentTopDownReservedDependencyColoring.clear(); | |||||||||
804 | CurrentBottomUpReservedDependencyColoring.clear(); | |||||||||
805 | ||||||||||
806 | CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); | |||||||||
807 | CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); | |||||||||
808 | ||||||||||
809 | // Traverse TopDown, and give different colors to SUs depending | |||||||||
810 | // on which combination of High Latencies they depend on. | |||||||||
811 | ||||||||||
812 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||||||||
813 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
814 | std::set<unsigned> SUColors; | |||||||||
815 | ||||||||||
816 | // Already given. | |||||||||
817 | if (CurrentColoring[SU->NodeNum]) { | |||||||||
818 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||||||||
819 | CurrentColoring[SU->NodeNum]; | |||||||||
820 | continue; | |||||||||
821 | } | |||||||||
822 | ||||||||||
823 | for (SDep& PredDep : SU->Preds) { | |||||||||
824 | SUnit *Pred = PredDep.getSUnit(); | |||||||||
825 | if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) | |||||||||
826 | continue; | |||||||||
827 | if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) | |||||||||
828 | SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); | |||||||||
829 | } | |||||||||
830 | // Color 0 by default. | |||||||||
831 | if (SUColors.empty()) | |||||||||
832 | continue; | |||||||||
833 | // Same color than parents. | |||||||||
834 | if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) | |||||||||
835 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||||||||
836 | *SUColors.begin(); | |||||||||
837 | else { | |||||||||
838 | std::map<std::set<unsigned>, unsigned>::iterator Pos = | |||||||||
839 | ColorCombinations.find(SUColors); | |||||||||
840 | if (Pos != ColorCombinations.end()) { | |||||||||
841 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; | |||||||||
842 | } else { | |||||||||
843 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||||||||
844 | NextNonReservedID; | |||||||||
845 | ColorCombinations[SUColors] = NextNonReservedID++; | |||||||||
846 | } | |||||||||
847 | } | |||||||||
848 | } | |||||||||
849 | ||||||||||
850 | ColorCombinations.clear(); | |||||||||
851 | ||||||||||
852 | // Same as before, but BottomUp. | |||||||||
853 | ||||||||||
854 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
855 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
856 | std::set<unsigned> SUColors; | |||||||||
857 | ||||||||||
858 | // Already given. | |||||||||
859 | if (CurrentColoring[SU->NodeNum]) { | |||||||||
860 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||||||||
861 | CurrentColoring[SU->NodeNum]; | |||||||||
862 | continue; | |||||||||
863 | } | |||||||||
864 | ||||||||||
865 | for (SDep& SuccDep : SU->Succs) { | |||||||||
866 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
867 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
868 | continue; | |||||||||
869 | if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) | |||||||||
870 | SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); | |||||||||
871 | } | |||||||||
872 | // Keep color 0. | |||||||||
873 | if (SUColors.empty()) | |||||||||
874 | continue; | |||||||||
875 | // Same color than parents. | |||||||||
876 | if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) | |||||||||
877 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||||||||
878 | *SUColors.begin(); | |||||||||
879 | else { | |||||||||
880 | std::map<std::set<unsigned>, unsigned>::iterator Pos = | |||||||||
881 | ColorCombinations.find(SUColors); | |||||||||
882 | if (Pos != ColorCombinations.end()) { | |||||||||
883 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; | |||||||||
884 | } else { | |||||||||
885 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||||||||
886 | NextNonReservedID; | |||||||||
887 | ColorCombinations[SUColors] = NextNonReservedID++; | |||||||||
888 | } | |||||||||
889 | } | |||||||||
890 | } | |||||||||
891 | } | |||||||||
892 | ||||||||||
893 | void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { | |||||||||
894 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
895 | std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; | |||||||||
896 | ||||||||||
897 | // Every combination of colors given by the top down | |||||||||
898 | // and bottom up Reserved node dependency | |||||||||
899 | ||||||||||
900 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
901 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
902 | std::pair<unsigned, unsigned> SUColors; | |||||||||
903 | ||||||||||
904 | // High latency instructions: already given. | |||||||||
905 | if (CurrentColoring[SU->NodeNum]) | |||||||||
906 | continue; | |||||||||
907 | ||||||||||
908 | SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; | |||||||||
909 | SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; | |||||||||
910 | ||||||||||
911 | std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = | |||||||||
912 | ColorCombinations.find(SUColors); | |||||||||
913 | if (Pos != ColorCombinations.end()) { | |||||||||
914 | CurrentColoring[SU->NodeNum] = Pos->second; | |||||||||
915 | } else { | |||||||||
916 | CurrentColoring[SU->NodeNum] = NextNonReservedID; | |||||||||
917 | ColorCombinations[SUColors] = NextNonReservedID++; | |||||||||
918 | } | |||||||||
919 | } | |||||||||
920 | } | |||||||||
921 | ||||||||||
922 | void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { | |||||||||
923 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
924 | std::vector<int> PendingColoring = CurrentColoring; | |||||||||
925 | ||||||||||
926 | assert(DAGSize >= 1 &&(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 928, __extension__ __PRETTY_FUNCTION__)) | |||||||||
927 | CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 928, __extension__ __PRETTY_FUNCTION__)) | |||||||||
928 | CurrentTopDownReservedDependencyColoring.size() == DAGSize)(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 928, __extension__ __PRETTY_FUNCTION__)); | |||||||||
929 | // If there is no reserved block at all, do nothing. We don't want | |||||||||
930 | // everything in one block. | |||||||||
931 | if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), | |||||||||
932 | CurrentBottomUpReservedDependencyColoring.end()) == 0 && | |||||||||
933 | *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), | |||||||||
934 | CurrentTopDownReservedDependencyColoring.end()) == 0) | |||||||||
935 | return; | |||||||||
936 | ||||||||||
937 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
938 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
939 | std::set<unsigned> SUColors; | |||||||||
940 | std::set<unsigned> SUColorsPending; | |||||||||
941 | ||||||||||
942 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
943 | continue; | |||||||||
944 | ||||||||||
945 | if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || | |||||||||
946 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) | |||||||||
947 | continue; | |||||||||
948 | ||||||||||
949 | for (SDep& SuccDep : SU->Succs) { | |||||||||
950 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
951 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
952 | continue; | |||||||||
953 | if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || | |||||||||
954 | CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) | |||||||||
955 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||||||||
956 | SUColorsPending.insert(PendingColoring[Succ->NodeNum]); | |||||||||
957 | } | |||||||||
958 | // If there is only one child/parent block, and that block | |||||||||
959 | // is not among the ones we are removing in this path, then | |||||||||
960 | // merge the instruction to that block | |||||||||
961 | if (SUColors.size() == 1 && SUColorsPending.size() == 1) | |||||||||
962 | PendingColoring[SU->NodeNum] = *SUColors.begin(); | |||||||||
963 | else // TODO: Attribute new colors depending on color | |||||||||
964 | // combination of children. | |||||||||
965 | PendingColoring[SU->NodeNum] = NextNonReservedID++; | |||||||||
966 | } | |||||||||
967 | CurrentColoring = PendingColoring; | |||||||||
968 | } | |||||||||
969 | ||||||||||
970 | ||||||||||
971 | void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { | |||||||||
972 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
973 | unsigned PreviousColor; | |||||||||
974 | std::set<unsigned> SeenColors; | |||||||||
975 | ||||||||||
976 | if (DAGSize <= 1) | |||||||||
977 | return; | |||||||||
978 | ||||||||||
979 | PreviousColor = CurrentColoring[0]; | |||||||||
980 | ||||||||||
981 | for (unsigned i = 1, e = DAGSize; i != e; ++i) { | |||||||||
982 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
983 | unsigned CurrentColor = CurrentColoring[i]; | |||||||||
984 | unsigned PreviousColorSave = PreviousColor; | |||||||||
985 | assert(i == SU->NodeNum)(static_cast <bool> (i == SU->NodeNum) ? void (0) : __assert_fail ("i == SU->NodeNum", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 985, __extension__ __PRETTY_FUNCTION__)); | |||||||||
986 | ||||||||||
987 | if (CurrentColor != PreviousColor) | |||||||||
988 | SeenColors.insert(PreviousColor); | |||||||||
989 | PreviousColor = CurrentColor; | |||||||||
990 | ||||||||||
991 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
992 | continue; | |||||||||
993 | ||||||||||
994 | if (SeenColors.find(CurrentColor) == SeenColors.end()) | |||||||||
995 | continue; | |||||||||
996 | ||||||||||
997 | if (PreviousColorSave != CurrentColor) | |||||||||
998 | CurrentColoring[i] = NextNonReservedID++; | |||||||||
999 | else | |||||||||
1000 | CurrentColoring[i] = CurrentColoring[i-1]; | |||||||||
1001 | } | |||||||||
1002 | } | |||||||||
1003 | ||||||||||
1004 | void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { | |||||||||
1005 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1006 | ||||||||||
1007 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1008 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1009 | std::set<unsigned> SUColors; | |||||||||
1010 | ||||||||||
1011 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
1012 | continue; | |||||||||
1013 | ||||||||||
1014 | // No predecessor: Vgpr constant loading. | |||||||||
1015 | // Low latency instructions usually have a predecessor (the address) | |||||||||
1016 | if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) | |||||||||
1017 | continue; | |||||||||
1018 | ||||||||||
1019 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1020 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1021 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1022 | continue; | |||||||||
1023 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||||||||
1024 | } | |||||||||
1025 | if (SUColors.size() == 1) | |||||||||
1026 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||||||||
1027 | } | |||||||||
1028 | } | |||||||||
1029 | ||||||||||
1030 | void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { | |||||||||
1031 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1032 | ||||||||||
1033 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1034 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1035 | std::set<unsigned> SUColors; | |||||||||
1036 | ||||||||||
1037 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
1038 | continue; | |||||||||
1039 | ||||||||||
1040 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1041 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1042 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1043 | continue; | |||||||||
1044 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||||||||
1045 | } | |||||||||
1046 | if (SUColors.size() == 1) | |||||||||
1047 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||||||||
1048 | } | |||||||||
1049 | } | |||||||||
1050 | ||||||||||
1051 | void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { | |||||||||
1052 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1053 | ||||||||||
1054 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1055 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1056 | std::set<unsigned> SUColors; | |||||||||
1057 | ||||||||||
1058 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
1059 | continue; | |||||||||
1060 | ||||||||||
1061 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1062 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1063 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1064 | continue; | |||||||||
1065 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||||||||
1066 | } | |||||||||
1067 | if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) | |||||||||
1068 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||||||||
1069 | } | |||||||||
1070 | } | |||||||||
1071 | ||||||||||
1072 | void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { | |||||||||
1073 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1074 | std::map<unsigned, unsigned> ColorCount; | |||||||||
1075 | ||||||||||
1076 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1077 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1078 | unsigned color = CurrentColoring[SU->NodeNum]; | |||||||||
1079 | ++ColorCount[color]; | |||||||||
1080 | } | |||||||||
1081 | ||||||||||
1082 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1083 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1084 | unsigned color = CurrentColoring[SU->NodeNum]; | |||||||||
1085 | std::set<unsigned> SUColors; | |||||||||
1086 | ||||||||||
1087 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
1088 | continue; | |||||||||
1089 | ||||||||||
1090 | if (ColorCount[color] > 1) | |||||||||
1091 | continue; | |||||||||
1092 | ||||||||||
1093 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1094 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1095 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1096 | continue; | |||||||||
1097 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||||||||
1098 | } | |||||||||
1099 | if (SUColors.size() == 1 && *SUColors.begin() != color) { | |||||||||
1100 | --ColorCount[color]; | |||||||||
1101 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||||||||
1102 | ++ColorCount[*SUColors.begin()]; | |||||||||
1103 | } | |||||||||
1104 | } | |||||||||
1105 | } | |||||||||
1106 | ||||||||||
1107 | void SIScheduleBlockCreator::cutHugeBlocks() { | |||||||||
1108 | // TODO | |||||||||
1109 | } | |||||||||
1110 | ||||||||||
1111 | void SIScheduleBlockCreator::regroupNoUserInstructions() { | |||||||||
1112 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1113 | int GroupID = NextNonReservedID++; | |||||||||
1114 | ||||||||||
1115 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||||||||
1116 | SUnit *SU = &DAG->SUnits[SUNum]; | |||||||||
1117 | bool hasSuccessor = false; | |||||||||
1118 | ||||||||||
1119 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||||||||
1120 | continue; | |||||||||
1121 | ||||||||||
1122 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1123 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1124 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1125 | continue; | |||||||||
1126 | hasSuccessor = true; | |||||||||
1127 | } | |||||||||
1128 | if (!hasSuccessor) | |||||||||
1129 | CurrentColoring[SU->NodeNum] = GroupID; | |||||||||
1130 | } | |||||||||
1131 | } | |||||||||
1132 | ||||||||||
1133 | void SIScheduleBlockCreator::colorExports() { | |||||||||
1134 | unsigned ExportColor = NextNonReservedID++; | |||||||||
1135 | SmallVector<unsigned, 8> ExpGroup; | |||||||||
1136 | ||||||||||
1137 | // Put all exports together in a block. | |||||||||
1138 | // The block will naturally end up being scheduled last, | |||||||||
1139 | // thus putting exports at the end of the schedule, which | |||||||||
1140 | // is better for performance. | |||||||||
1141 | // However we must ensure, for safety, the exports can be put | |||||||||
1142 | // together in the same block without any other instruction. | |||||||||
1143 | // This could happen, for example, when scheduling after regalloc | |||||||||
1144 | // if reloading a spilled register from memory using the same | |||||||||
1145 | // register than used in a previous export. | |||||||||
1146 | // If that happens, do not regroup the exports. | |||||||||
1147 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||||||||
1148 | const SUnit &SU = DAG->SUnits[SUNum]; | |||||||||
1149 | if (SIInstrInfo::isEXP(*SU.getInstr())) { | |||||||||
1150 | // Check the EXP can be added to the group safely, | |||||||||
1151 | // ie without needing any other instruction. | |||||||||
1152 | // The EXP is allowed to depend on other EXP | |||||||||
1153 | // (they will be in the same group). | |||||||||
1154 | for (unsigned j : ExpGroup) { | |||||||||
1155 | bool HasSubGraph; | |||||||||
1156 | std::vector<int> SubGraph; | |||||||||
1157 | // By construction (topological order), if SU and | |||||||||
1158 | // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary | |||||||||
1159 | // in the parent graph of SU. | |||||||||
1160 | #ifndef NDEBUG | |||||||||
1161 | SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], | |||||||||
1162 | HasSubGraph); | |||||||||
1163 | assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail ("!HasSubGraph", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1163, __extension__ __PRETTY_FUNCTION__)); | |||||||||
1164 | #endif | |||||||||
1165 | SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, | |||||||||
1166 | HasSubGraph); | |||||||||
1167 | if (!HasSubGraph) | |||||||||
1168 | continue; // No dependencies between each other | |||||||||
1169 | ||||||||||
1170 | // SubGraph contains all the instructions required | |||||||||
1171 | // between EXP SUnits[j] and EXP SU. | |||||||||
1172 | for (unsigned k : SubGraph) { | |||||||||
1173 | if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) | |||||||||
1174 | // Other instructions than EXP would be required in the group. | |||||||||
1175 | // Abort the groupping. | |||||||||
1176 | return; | |||||||||
1177 | } | |||||||||
1178 | } | |||||||||
1179 | ||||||||||
1180 | ExpGroup.push_back(SUNum); | |||||||||
1181 | } | |||||||||
1182 | } | |||||||||
1183 | ||||||||||
1184 | // The group can be formed. Give the color. | |||||||||
1185 | for (unsigned j : ExpGroup) | |||||||||
1186 | CurrentColoring[j] = ExportColor; | |||||||||
1187 | } | |||||||||
1188 | ||||||||||
1189 | void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { | |||||||||
1190 | unsigned DAGSize = DAG->SUnits.size(); | |||||||||
1191 | std::map<unsigned,unsigned> RealID; | |||||||||
1192 | ||||||||||
1193 | CurrentBlocks.clear(); | |||||||||
1194 | CurrentColoring.clear(); | |||||||||
1195 | CurrentColoring.resize(DAGSize, 0); | |||||||||
1196 | Node2CurrentBlock.clear(); | |||||||||
1197 | ||||||||||
1198 | // Restore links previous scheduling variant has overridden. | |||||||||
1199 | DAG->restoreSULinksLeft(); | |||||||||
1200 | ||||||||||
1201 | NextReservedID = 1; | |||||||||
1202 | NextNonReservedID = DAGSize + 1; | |||||||||
1203 | ||||||||||
1204 | DEBUG(dbgs() << "Coloring the graph\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Coloring the graph\n" ; } } while (false); | |||||||||
1205 | ||||||||||
1206 | if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) | |||||||||
1207 | colorHighLatenciesGroups(); | |||||||||
1208 | else | |||||||||
1209 | colorHighLatenciesAlone(); | |||||||||
1210 | colorComputeReservedDependencies(); | |||||||||
1211 | colorAccordingToReservedDependencies(); | |||||||||
1212 | colorEndsAccordingToDependencies(); | |||||||||
1213 | if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) | |||||||||
1214 | colorForceConsecutiveOrderInGroup(); | |||||||||
1215 | regroupNoUserInstructions(); | |||||||||
1216 | colorMergeConstantLoadsNextGroup(); | |||||||||
1217 | colorMergeIfPossibleNextGroupOnlyForReserved(); | |||||||||
1218 | colorExports(); | |||||||||
1219 | ||||||||||
1220 | // Put SUs of same color into same block | |||||||||
1221 | Node2CurrentBlock.resize(DAGSize, -1); | |||||||||
1222 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1223 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
1224 | unsigned Color = CurrentColoring[SU->NodeNum]; | |||||||||
1225 | if (RealID.find(Color) == RealID.end()) { | |||||||||
1226 | int ID = CurrentBlocks.size(); | |||||||||
1227 | BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID)); | |||||||||
1228 | CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); | |||||||||
1229 | RealID[Color] = ID; | |||||||||
1230 | } | |||||||||
1231 | CurrentBlocks[RealID[Color]]->addUnit(SU); | |||||||||
1232 | Node2CurrentBlock[SU->NodeNum] = RealID[Color]; | |||||||||
1233 | } | |||||||||
1234 | ||||||||||
1235 | // Build dependencies between blocks. | |||||||||
1236 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1237 | SUnit *SU = &DAG->SUnits[i]; | |||||||||
1238 | int SUID = Node2CurrentBlock[i]; | |||||||||
1239 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1240 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1241 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||||||||
1242 | continue; | |||||||||
1243 | if (Node2CurrentBlock[Succ->NodeNum] != SUID) | |||||||||
1244 | CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], | |||||||||
1245 | SuccDep.isCtrl() ? NoData : Data); | |||||||||
1246 | } | |||||||||
1247 | for (SDep& PredDep : SU->Preds) { | |||||||||
1248 | SUnit *Pred = PredDep.getSUnit(); | |||||||||
1249 | if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) | |||||||||
1250 | continue; | |||||||||
1251 | if (Node2CurrentBlock[Pred->NodeNum] != SUID) | |||||||||
1252 | CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); | |||||||||
1253 | } | |||||||||
1254 | } | |||||||||
1255 | ||||||||||
1256 | // Free root and leafs of all blocks to enable scheduling inside them. | |||||||||
1257 | for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { | |||||||||
1258 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1259 | Block->finalizeUnits(); | |||||||||
1260 | } | |||||||||
1261 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1262 | dbgs() << "Blocks created:\n\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1263 | for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1264 | SIScheduleBlock *Block = CurrentBlocks[i];do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1265 | Block->printDebug(true);do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1266 | }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||||||||
1267 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false); | |||||||||
1268 | } | |||||||||
1269 | ||||||||||
1270 | // Two functions taken from Codegen/MachineScheduler.cpp | |||||||||
1271 | ||||||||||
1272 | /// Non-const version. | |||||||||
1273 | static MachineBasicBlock::iterator | |||||||||
1274 | nextIfDebug(MachineBasicBlock::iterator I, | |||||||||
1275 | MachineBasicBlock::const_iterator End) { | |||||||||
1276 | for (; I != End; ++I) { | |||||||||
1277 | if (!I->isDebugValue()) | |||||||||
1278 | break; | |||||||||
1279 | } | |||||||||
1280 | return I; | |||||||||
1281 | } | |||||||||
1282 | ||||||||||
1283 | void SIScheduleBlockCreator::topologicalSort() { | |||||||||
1284 | unsigned DAGSize = CurrentBlocks.size(); | |||||||||
1285 | std::vector<int> WorkList; | |||||||||
1286 | ||||||||||
1287 | DEBUG(dbgs() << "Topological Sort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Topological Sort\n" ; } } while (false); | |||||||||
1288 | ||||||||||
1289 | WorkList.reserve(DAGSize); | |||||||||
1290 | TopDownIndex2Block.resize(DAGSize); | |||||||||
1291 | TopDownBlock2Index.resize(DAGSize); | |||||||||
1292 | BottomUpIndex2Block.resize(DAGSize); | |||||||||
1293 | ||||||||||
1294 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1295 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1296 | unsigned Degree = Block->getSuccs().size(); | |||||||||
1297 | TopDownBlock2Index[i] = Degree; | |||||||||
1298 | if (Degree == 0) { | |||||||||
1299 | WorkList.push_back(i); | |||||||||
1300 | } | |||||||||
1301 | } | |||||||||
1302 | ||||||||||
1303 | int Id = DAGSize; | |||||||||
1304 | while (!WorkList.empty()) { | |||||||||
1305 | int i = WorkList.back(); | |||||||||
1306 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1307 | WorkList.pop_back(); | |||||||||
1308 | TopDownBlock2Index[i] = --Id; | |||||||||
1309 | TopDownIndex2Block[Id] = i; | |||||||||
1310 | for (SIScheduleBlock* Pred : Block->getPreds()) { | |||||||||
1311 | if (!--TopDownBlock2Index[Pred->getID()]) | |||||||||
1312 | WorkList.push_back(Pred->getID()); | |||||||||
1313 | } | |||||||||
1314 | } | |||||||||
1315 | ||||||||||
1316 | #ifndef NDEBUG | |||||||||
1317 | // Check correctness of the ordering. | |||||||||
1318 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1319 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1320 | for (SIScheduleBlock* Pred : Block->getPreds()) { | |||||||||
1321 | assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&(static_cast <bool> (TopDownBlock2Index[i] > TopDownBlock2Index [Pred->getID()] && "Wrong Top Down topological sorting" ) ? void (0) : __assert_fail ("TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && \"Wrong Top Down topological sorting\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1322, __extension__ __PRETTY_FUNCTION__)) | |||||||||
1322 | "Wrong Top Down topological sorting")(static_cast <bool> (TopDownBlock2Index[i] > TopDownBlock2Index [Pred->getID()] && "Wrong Top Down topological sorting" ) ? void (0) : __assert_fail ("TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && \"Wrong Top Down topological sorting\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1322, __extension__ __PRETTY_FUNCTION__)); | |||||||||
1323 | } | |||||||||
1324 | } | |||||||||
1325 | #endif | |||||||||
1326 | ||||||||||
1327 | BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), | |||||||||
1328 | TopDownIndex2Block.rend()); | |||||||||
1329 | } | |||||||||
1330 | ||||||||||
1331 | void SIScheduleBlockCreator::scheduleInsideBlocks() { | |||||||||
1332 | unsigned DAGSize = CurrentBlocks.size(); | |||||||||
1333 | ||||||||||
1334 | DEBUG(dbgs() << "\nScheduling Blocks\n\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "\nScheduling Blocks\n\n" ; } } while (false); | |||||||||
1335 | ||||||||||
1336 | // We do schedule a valid scheduling such that a Block corresponds | |||||||||
1337 | // to a range of instructions. | |||||||||
1338 | DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "First phase: Fast scheduling for Reg Liveness\n" ; } } while (false); | |||||||||
1339 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1340 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1341 | Block->fastSchedule(); | |||||||||
1342 | } | |||||||||
1343 | ||||||||||
1344 | // Note: the following code, and the part restoring previous position | |||||||||
1345 | // is by far the most expensive operation of the Scheduler. | |||||||||
1346 | ||||||||||
1347 | // Do not update CurrentTop. | |||||||||
1348 | MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); | |||||||||
1349 | std::vector<MachineBasicBlock::iterator> PosOld; | |||||||||
1350 | std::vector<MachineBasicBlock::iterator> PosNew; | |||||||||
1351 | PosOld.reserve(DAG->SUnits.size()); | |||||||||
1352 | PosNew.reserve(DAG->SUnits.size()); | |||||||||
1353 | ||||||||||
1354 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1355 | int BlockIndice = TopDownIndex2Block[i]; | |||||||||
1356 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||||||||
1357 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||||||||
1358 | ||||||||||
1359 | for (SUnit* SU : SUs) { | |||||||||
1360 | MachineInstr *MI = SU->getInstr(); | |||||||||
1361 | MachineBasicBlock::iterator Pos = MI; | |||||||||
1362 | PosOld.push_back(Pos); | |||||||||
1363 | if (&*CurrentTopFastSched == MI) { | |||||||||
1364 | PosNew.push_back(Pos); | |||||||||
1365 | CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, | |||||||||
1366 | DAG->getCurrentBottom()); | |||||||||
1367 | } else { | |||||||||
1368 | // Update the instruction stream. | |||||||||
1369 | DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); | |||||||||
1370 | ||||||||||
1371 | // Update LiveIntervals. | |||||||||
1372 | // Note: Moving all instructions and calling handleMove every time | |||||||||
1373 | // is the most cpu intensive operation of the scheduler. | |||||||||
1374 | // It would gain a lot if there was a way to recompute the | |||||||||
1375 | // LiveIntervals for the entire scheduling region. | |||||||||
1376 | DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true); | |||||||||
1377 | PosNew.push_back(CurrentTopFastSched); | |||||||||
1378 | } | |||||||||
1379 | } | |||||||||
1380 | } | |||||||||
1381 | ||||||||||
1382 | // Now we have Block of SUs == Block of MI. | |||||||||
1383 | // We do the final schedule for the instructions inside the block. | |||||||||
1384 | // The property that all the SUs of the Block are grouped together as MI | |||||||||
1385 | // is used for correct reg usage tracking. | |||||||||
1386 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1387 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||||||||
1388 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||||||||
1389 | Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); | |||||||||
1390 | } | |||||||||
1391 | ||||||||||
1392 | DEBUG(dbgs() << "Restoring MI Pos\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Restoring MI Pos\n" ; } } while (false); | |||||||||
1393 | // Restore old ordering (which prevents a LIS->handleMove bug). | |||||||||
1394 | for (unsigned i = PosOld.size(), e = 0; i != e; --i) { | |||||||||
1395 | MachineBasicBlock::iterator POld = PosOld[i-1]; | |||||||||
1396 | MachineBasicBlock::iterator PNew = PosNew[i-1]; | |||||||||
1397 | if (PNew != POld) { | |||||||||
1398 | // Update the instruction stream. | |||||||||
1399 | DAG->getBB()->splice(POld, DAG->getBB(), PNew); | |||||||||
1400 | ||||||||||
1401 | // Update LiveIntervals. | |||||||||
1402 | DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true); | |||||||||
1403 | } | |||||||||
1404 | } | |||||||||
1405 | ||||||||||
1406 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||||||||
1407 | for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||||||||
1408 | SIScheduleBlock *Block = CurrentBlocks[i];do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||||||||
1409 | Block->printDebug(true);do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||||||||
1410 | }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||||||||
1411 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false); | |||||||||
1412 | } | |||||||||
1413 | ||||||||||
1414 | void SIScheduleBlockCreator::fillStats() { | |||||||||
1415 | unsigned DAGSize = CurrentBlocks.size(); | |||||||||
1416 | ||||||||||
1417 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1418 | int BlockIndice = TopDownIndex2Block[i]; | |||||||||
1419 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||||||||
1420 | if (Block->getPreds().empty()) | |||||||||
1421 | Block->Depth = 0; | |||||||||
1422 | else { | |||||||||
1423 | unsigned Depth = 0; | |||||||||
1424 | for (SIScheduleBlock *Pred : Block->getPreds()) { | |||||||||
1425 | if (Depth < Pred->Depth + Pred->getCost()) | |||||||||
1426 | Depth = Pred->Depth + Pred->getCost(); | |||||||||
1427 | } | |||||||||
1428 | Block->Depth = Depth; | |||||||||
1429 | } | |||||||||
1430 | } | |||||||||
1431 | ||||||||||
1432 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||||||||
1433 | int BlockIndice = BottomUpIndex2Block[i]; | |||||||||
1434 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||||||||
1435 | if (Block->getSuccs().empty()) | |||||||||
1436 | Block->Height = 0; | |||||||||
1437 | else { | |||||||||
1438 | unsigned Height = 0; | |||||||||
1439 | for (const auto &Succ : Block->getSuccs()) | |||||||||
1440 | Height = std::max(Height, Succ.first->Height + Succ.first->getCost()); | |||||||||
1441 | Block->Height = Height; | |||||||||
1442 | } | |||||||||
1443 | } | |||||||||
1444 | } | |||||||||
1445 | ||||||||||
1446 | // SIScheduleBlockScheduler // | |||||||||
1447 | ||||||||||
1448 | SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, | |||||||||
1449 | SISchedulerBlockSchedulerVariant Variant, | |||||||||
1450 | SIScheduleBlocks BlocksStruct) : | |||||||||
1451 | DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), | |||||||||
1452 | LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), | |||||||||
1453 | SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { | |||||||||
1454 | ||||||||||
1455 | // Fill the usage of every output | |||||||||
1456 | // Warning: while by construction we always have a link between two blocks | |||||||||
1457 | // when one needs a result from the other, the number of users of an output | |||||||||
1458 | // is not the sum of child blocks having as input the same virtual register. | |||||||||
1459 | // Here is an example. A produces x and y. B eats x and produces x'. | |||||||||
1460 | // C eats x' and y. The register coalescer may have attributed the same | |||||||||
1461 | // virtual register to x and x'. | |||||||||
1462 | // To count accurately, we do a topological sort. In case the register is | |||||||||
1463 | // found for several parents, we increment the usage of the one with the | |||||||||
1464 | // highest topological index. | |||||||||
1465 | LiveOutRegsNumUsages.resize(Blocks.size()); | |||||||||
1466 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
| ||||||||||
1467 | SIScheduleBlock *Block = Blocks[i]; | |||||||||
1468 | for (unsigned Reg : Block->getInRegs()) { | |||||||||
1469 | bool Found = false; | |||||||||
1470 | int topoInd = -1; | |||||||||
1471 | for (SIScheduleBlock* Pred: Block->getPreds()) { | |||||||||
1472 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); | |||||||||
1473 | std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); | |||||||||
1474 | ||||||||||
1475 | if (RegPos != PredOutRegs.end()) { | |||||||||
1476 | Found = true; | |||||||||
1477 | if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { | |||||||||
1478 | topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; | |||||||||
1479 | } | |||||||||
1480 | } | |||||||||
1481 | } | |||||||||
1482 | ||||||||||
1483 | if (!Found) | |||||||||
1484 | continue; | |||||||||
1485 | ||||||||||
1486 | int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; | |||||||||
1487 | ++LiveOutRegsNumUsages[PredID][Reg]; | |||||||||
1488 | } | |||||||||
1489 | } | |||||||||
1490 | ||||||||||
1491 | LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); | |||||||||
1492 | BlockNumPredsLeft.resize(Blocks.size()); | |||||||||
1493 | BlockNumSuccsLeft.resize(Blocks.size()); | |||||||||
1494 | ||||||||||
1495 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
1496 | SIScheduleBlock *Block = Blocks[i]; | |||||||||
1497 | BlockNumPredsLeft[i] = Block->getPreds().size(); | |||||||||
1498 | BlockNumSuccsLeft[i] = Block->getSuccs().size(); | |||||||||
1499 | } | |||||||||
1500 | ||||||||||
1501 | #ifndef NDEBUG | |||||||||
1502 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
1503 | SIScheduleBlock *Block = Blocks[i]; | |||||||||
1504 | assert(Block->getID() == i)(static_cast <bool> (Block->getID() == i) ? void (0) : __assert_fail ("Block->getID() == i", "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1504, __extension__ __PRETTY_FUNCTION__)); | |||||||||
1505 | } | |||||||||
1506 | #endif | |||||||||
1507 | ||||||||||
1508 | std::set<unsigned> InRegs = DAG->getInRegs(); | |||||||||
1509 | addLiveRegs(InRegs); | |||||||||
1510 | ||||||||||
1511 | // Increase LiveOutRegsNumUsages for blocks | |||||||||
1512 | // producing registers consumed in another | |||||||||
1513 | // scheduling region. | |||||||||
1514 | for (unsigned Reg : DAG->getOutRegs()) { | |||||||||
1515 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
1516 | // Do reverse traversal | |||||||||
1517 | int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; | |||||||||
1518 | SIScheduleBlock *Block = Blocks[ID]; | |||||||||
1519 | const std::set<unsigned> &OutRegs = Block->getOutRegs(); | |||||||||
1520 | ||||||||||
1521 | if (OutRegs.find(Reg) == OutRegs.end()) | |||||||||
1522 | continue; | |||||||||
1523 | ||||||||||
1524 | ++LiveOutRegsNumUsages[ID][Reg]; | |||||||||
1525 | break; | |||||||||
1526 | } | |||||||||
1527 | } | |||||||||
1528 | ||||||||||
1529 | // Fill LiveRegsConsumers for regs that were already | |||||||||
1530 | // defined before scheduling. | |||||||||
1531 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
1532 | SIScheduleBlock *Block = Blocks[i]; | |||||||||
1533 | for (unsigned Reg : Block->getInRegs()) { | |||||||||
1534 | bool Found = false; | |||||||||
1535 | for (SIScheduleBlock* Pred: Block->getPreds()) { | |||||||||
1536 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); | |||||||||
1537 | std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); | |||||||||
1538 | ||||||||||
1539 | if (RegPos != PredOutRegs.end()) { | |||||||||
1540 | Found = true; | |||||||||
1541 | break; | |||||||||
1542 | } | |||||||||
1543 | } | |||||||||
1544 | ||||||||||
1545 | if (!Found) | |||||||||
1546 | ++LiveRegsConsumers[Reg]; | |||||||||
1547 | } | |||||||||
1548 | } | |||||||||
1549 | ||||||||||
1550 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||||||||
1551 | SIScheduleBlock *Block = Blocks[i]; | |||||||||
1552 | if (BlockNumPredsLeft[i] == 0) { | |||||||||
1553 | ReadyBlocks.push_back(Block); | |||||||||
1554 | } | |||||||||
1555 | } | |||||||||
1556 | ||||||||||
1557 | while (SIScheduleBlock *Block = pickBlock()) { | |||||||||
1558 | BlocksScheduled.push_back(Block); | |||||||||
1559 | blockScheduled(Block); | |||||||||
1560 | } | |||||||||
1561 | ||||||||||
1562 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1563 | dbgs() << "Block Order:";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1564 | for (SIScheduleBlock* Block : BlocksScheduled) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1565 | dbgs() << ' ' << Block->getID();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1566 | }do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1567 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||||||||
1568 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock* Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false); | |||||||||
1569 | } | |||||||||
1570 | ||||||||||
1571 | bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, | |||||||||
1572 | SIBlockSchedCandidate &TryCand) { | |||||||||
1573 | if (!Cand.isValid()) { | |||||||||
1574 | TryCand.Reason = NodeOrder; | |||||||||
1575 | return true; | |||||||||
1576 | } | |||||||||
1577 | ||||||||||
1578 | // Try to hide high latencies. | |||||||||
1579 | if (tryLess(TryCand.LastPosHighLatParentScheduled, | |||||||||
1580 | Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) | |||||||||
1581 | return true; | |||||||||
1582 | // Schedule high latencies early so you can hide them better. | |||||||||
1583 | if (tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, | |||||||||
1584 | TryCand, Cand, Latency)) | |||||||||
1585 | return true; | |||||||||
1586 | if (TryCand.IsHighLatency && tryGreater(TryCand.Height, Cand.Height, | |||||||||
1587 | TryCand, Cand, Depth)) | |||||||||
1588 | return true; | |||||||||
1589 | if (tryGreater(TryCand.NumHighLatencySuccessors, | |||||||||
1590 | Cand.NumHighLatencySuccessors, | |||||||||
1591 | TryCand, Cand, Successor)) | |||||||||
1592 | return true; | |||||||||
1593 | return false; | |||||||||
1594 | } | |||||||||
1595 | ||||||||||
1596 | bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, | |||||||||
1597 | SIBlockSchedCandidate &TryCand) { | |||||||||
1598 | if (!Cand.isValid()) { | |||||||||
1599 | TryCand.Reason = NodeOrder; | |||||||||
1600 | return true; | |||||||||
1601 | } | |||||||||
1602 | ||||||||||
1603 | if (tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, | |||||||||
1604 | TryCand, Cand, RegUsage)) | |||||||||
1605 | return true; | |||||||||
1606 | if (tryGreater(TryCand.NumSuccessors > 0, | |||||||||
1607 | Cand.NumSuccessors > 0, | |||||||||
1608 | TryCand, Cand, Successor)) | |||||||||
1609 | return true; | |||||||||
1610 | if (tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) | |||||||||
1611 | return true; | |||||||||
1612 | if (tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, | |||||||||
1613 | TryCand, Cand, RegUsage)) | |||||||||
1614 | return true; | |||||||||
1615 | return false; | |||||||||
1616 | } | |||||||||
1617 | ||||||||||
1618 | SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { | |||||||||
1619 | SIBlockSchedCandidate Cand; | |||||||||
1620 | std::vector<SIScheduleBlock*>::iterator Best; | |||||||||
1621 | SIScheduleBlock *Block; | |||||||||
1622 | if (ReadyBlocks.empty()) | |||||||||
1623 | return nullptr; | |||||||||
1624 | ||||||||||
1625 | DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), | |||||||||
1626 | VregCurrentUsage, SregCurrentUsage); | |||||||||
1627 | if (VregCurrentUsage > maxVregUsage) | |||||||||
1628 | maxVregUsage = VregCurrentUsage; | |||||||||
1629 | if (SregCurrentUsage > maxSregUsage) | |||||||||
1630 | maxSregUsage = SregCurrentUsage; | |||||||||
1631 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1632 | dbgs() << "Picking New Blocks\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1633 | dbgs() << "Available: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1634 | for (SIScheduleBlock* Block : ReadyBlocks)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1635 | dbgs() << Block->getID() << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1636 | dbgs() << "\nCurrent Live:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1637 | for (unsigned Reg : LiveRegs)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1638 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1639 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1640 | dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1641 | dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||||||||
1642 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock* Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false); | |||||||||
1643 | ||||||||||
1644 | Cand.Block = nullptr; | |||||||||
1645 | for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), | |||||||||
1646 | E = ReadyBlocks.end(); I != E; ++I) { | |||||||||
1647 | SIBlockSchedCandidate TryCand; | |||||||||
1648 | TryCand.Block = *I; | |||||||||
1649 | TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); | |||||||||
1650 | TryCand.VGPRUsageDiff = | |||||||||
1651 | checkRegUsageImpact(TryCand.Block->getInRegs(), | |||||||||
1652 | TryCand.Block->getOutRegs())[DAG->getVGPRSetID()]; | |||||||||
1653 | TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); | |||||||||
1654 | TryCand.NumHighLatencySuccessors = | |||||||||
1655 | TryCand.Block->getNumHighLatencySuccessors(); | |||||||||
1656 | TryCand.LastPosHighLatParentScheduled = | |||||||||
1657 | (unsigned int) std::max<int> (0, | |||||||||
1658 | LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - | |||||||||
1659 | LastPosWaitedHighLatency); | |||||||||
1660 | TryCand.Height = TryCand.Block->Height; | |||||||||
1661 | // Try not to increase VGPR usage too much, else we may spill. | |||||||||
1662 | if (VregCurrentUsage > 120 || | |||||||||
1663 | Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { | |||||||||
1664 | if (!tryCandidateRegUsage(Cand, TryCand) && | |||||||||
1665 | Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) | |||||||||
1666 | tryCandidateLatency(Cand, TryCand); | |||||||||
1667 | } else { | |||||||||
1668 | if (!tryCandidateLatency(Cand, TryCand)) | |||||||||
1669 | tryCandidateRegUsage(Cand, TryCand); | |||||||||
1670 | } | |||||||||
1671 | if (TryCand.Reason != NoCand) { | |||||||||
1672 | Cand.setBest(TryCand); | |||||||||
1673 | Best = I; | |||||||||
1674 | DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' 'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' << getReasonStr (Cand.Reason) << '\n'; } } while (false) | |||||||||
1675 | << getReasonStr(Cand.Reason) << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' << getReasonStr (Cand.Reason) << '\n'; } } while (false); | |||||||||
1676 | } | |||||||||
1677 | } | |||||||||
1678 | ||||||||||
1679 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
| ||||||||||
1680 | dbgs() << "Picking: " << Cand.Block->getID() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1681 | dbgs() << "Is a block with high latency instruction: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1682 | << (Cand.IsHighLatency ? "yes\n" : "no\n");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1683 | dbgs() << "Position of last high latency dependency: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1684 | << Cand.LastPosHighLatParentScheduled << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1685 | dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1686 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||||||||
1687 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false); | |||||||||
1688 | ||||||||||
1689 | Block = Cand.Block; | |||||||||
1690 | ReadyBlocks.erase(Best); | |||||||||
1691 | return Block; | |||||||||
1692 | } | |||||||||
1693 | ||||||||||
1694 | // Tracking of currently alive registers to determine VGPR Usage. | |||||||||
1695 | ||||||||||
1696 | void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { | |||||||||
1697 | for (unsigned Reg : Regs) { | |||||||||
1698 | // For now only track virtual registers. | |||||||||
1699 | if (!TargetRegisterInfo::isVirtualRegister(Reg)) | |||||||||
1700 | continue; | |||||||||
1701 | // If not already in the live set, then add it. | |||||||||
1702 | (void) LiveRegs.insert(Reg); | |||||||||
1703 | } | |||||||||
1704 | } | |||||||||
1705 | ||||||||||
1706 | void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, | |||||||||
1707 | std::set<unsigned> &Regs) { | |||||||||
1708 | for (unsigned Reg : Regs) { | |||||||||
1709 | // For now only track virtual registers. | |||||||||
1710 | std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); | |||||||||
1711 | assert (Pos != LiveRegs.end() && // Reg must be live.(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1713, __extension__ __PRETTY_FUNCTION__)) | |||||||||
1712 | LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1713, __extension__ __PRETTY_FUNCTION__)) | |||||||||
1713 | LiveRegsConsumers[Reg] >= 1)(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1713, __extension__ __PRETTY_FUNCTION__)); | |||||||||
1714 | --LiveRegsConsumers[Reg]; | |||||||||
1715 | if (LiveRegsConsumers[Reg] == 0) | |||||||||
1716 | LiveRegs.erase(Pos); | |||||||||
1717 | } | |||||||||
1718 | } | |||||||||
1719 | ||||||||||
1720 | void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { | |||||||||
1721 | for (const auto &Block : Parent->getSuccs()) { | |||||||||
1722 | if (--BlockNumPredsLeft[Block.first->getID()] == 0) | |||||||||
1723 | ReadyBlocks.push_back(Block.first); | |||||||||
1724 | ||||||||||
1725 | if (Parent->isHighLatencyBlock() && | |||||||||
1726 | Block.second == SIScheduleBlockLinkKind::Data) | |||||||||
1727 | LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; | |||||||||
1728 | } | |||||||||
1729 | } | |||||||||
1730 | ||||||||||
1731 | void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { | |||||||||
1732 | decreaseLiveRegs(Block, Block->getInRegs()); | |||||||||
1733 | addLiveRegs(Block->getOutRegs()); | |||||||||
1734 | releaseBlockSuccs(Block); | |||||||||
1735 | for (std::map<unsigned, unsigned>::iterator RegI = | |||||||||
1736 | LiveOutRegsNumUsages[Block->getID()].begin(), | |||||||||
1737 | E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { | |||||||||
1738 | std::pair<unsigned, unsigned> RegP = *RegI; | |||||||||
1739 | // We produce this register, thus it must not be previously alive. | |||||||||
1740 | assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||(static_cast <bool> (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1741, __extension__ __PRETTY_FUNCTION__)) | |||||||||
1741 | LiveRegsConsumers[RegP.first] == 0)(static_cast <bool> (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1741, __extension__ __PRETTY_FUNCTION__)); | |||||||||
1742 | LiveRegsConsumers[RegP.first] += RegP.second; | |||||||||
1743 | } | |||||||||
1744 | if (LastPosHighLatencyParentScheduled[Block->getID()] > | |||||||||
1745 | (unsigned)LastPosWaitedHighLatency) | |||||||||
1746 | LastPosWaitedHighLatency = | |||||||||
1747 | LastPosHighLatencyParentScheduled[Block->getID()]; | |||||||||
1748 | ++NumBlockScheduled; | |||||||||
1749 | } | |||||||||
1750 | ||||||||||
1751 | std::vector<int> | |||||||||
1752 | SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, | |||||||||
1753 | std::set<unsigned> &OutRegs) { | |||||||||
1754 | std::vector<int> DiffSetPressure; | |||||||||
1755 | DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); | |||||||||
1756 | ||||||||||
1757 | for (unsigned Reg : InRegs) { | |||||||||
1758 | // For now only track virtual registers. | |||||||||
1759 | if (!TargetRegisterInfo::isVirtualRegister(Reg)) | |||||||||
1760 | continue; | |||||||||
1761 | if (LiveRegsConsumers[Reg] > 1) | |||||||||
1762 | continue; | |||||||||
1763 | PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); | |||||||||
1764 | for (; PSetI.isValid(); ++PSetI) { | |||||||||
1765 | DiffSetPressure[*PSetI] -= PSetI.getWeight(); | |||||||||
1766 | } | |||||||||
1767 | } | |||||||||
1768 | ||||||||||
1769 | for (unsigned Reg : OutRegs) { | |||||||||
1770 | // For now only track virtual registers. | |||||||||
1771 | if (!TargetRegisterInfo::isVirtualRegister(Reg)) | |||||||||
1772 | continue; | |||||||||
1773 | PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); | |||||||||
1774 | for (; PSetI.isValid(); ++PSetI) { | |||||||||
1775 | DiffSetPressure[*PSetI] += PSetI.getWeight(); | |||||||||
1776 | } | |||||||||
1777 | } | |||||||||
1778 | ||||||||||
1779 | return DiffSetPressure; | |||||||||
1780 | } | |||||||||
1781 | ||||||||||
1782 | // SIScheduler // | |||||||||
1783 | ||||||||||
1784 | struct SIScheduleBlockResult | |||||||||
1785 | SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, | |||||||||
1786 | SISchedulerBlockSchedulerVariant ScheduleVariant) { | |||||||||
1787 | SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); | |||||||||
1788 | SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); | |||||||||
1789 | std::vector<SIScheduleBlock*> ScheduledBlocks; | |||||||||
1790 | struct SIScheduleBlockResult Res; | |||||||||
1791 | ||||||||||
1792 | ScheduledBlocks = Scheduler.getBlocks(); | |||||||||
1793 | ||||||||||
1794 | for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { | |||||||||
1795 | SIScheduleBlock *Block = ScheduledBlocks[b]; | |||||||||
1796 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||||||||
1797 | ||||||||||
1798 | for (SUnit* SU : SUs) | |||||||||
1799 | Res.SUs.push_back(SU->NodeNum); | |||||||||
1800 | } | |||||||||
1801 | ||||||||||
1802 | Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); | |||||||||
1803 | Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); | |||||||||
1804 | return Res; | |||||||||
1805 | } | |||||||||
1806 | ||||||||||
1807 | // SIScheduleDAGMI // | |||||||||
1808 | ||||||||||
1809 | SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : | |||||||||
1810 | ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) { | |||||||||
1811 | SITII = static_cast<const SIInstrInfo*>(TII); | |||||||||
1812 | SITRI = static_cast<const SIRegisterInfo*>(TRI); | |||||||||
1813 | ||||||||||
1814 | VGPRSetID = SITRI->getVGPRPressureSet(); | |||||||||
1815 | SGPRSetID = SITRI->getSGPRPressureSet(); | |||||||||
1816 | } | |||||||||
1817 | ||||||||||
1818 | SIScheduleDAGMI::~SIScheduleDAGMI() = default; | |||||||||
1819 | ||||||||||
1820 | // Code adapted from scheduleDAG.cpp | |||||||||
1821 | // Does a topological sort over the SUs. | |||||||||
1822 | // Both TopDown and BottomUp | |||||||||
1823 | void SIScheduleDAGMI::topologicalSort() { | |||||||||
1824 | Topo.InitDAGTopologicalSorting(); | |||||||||
1825 | ||||||||||
1826 | TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); | |||||||||
1827 | BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); | |||||||||
1828 | } | |||||||||
1829 | ||||||||||
1830 | // Move low latencies further from their user without | |||||||||
1831 | // increasing SGPR usage (in general) | |||||||||
1832 | // This is to be replaced by a better pass that would | |||||||||
1833 | // take into account SGPR usage (based on VGPR Usage | |||||||||
1834 | // and the corresponding wavefront count), that would | |||||||||
1835 | // try to merge groups of loads if it make sense, etc | |||||||||
1836 | void SIScheduleDAGMI::moveLowLatencies() { | |||||||||
1837 | unsigned DAGSize = SUnits.size(); | |||||||||
1838 | int LastLowLatencyUser = -1; | |||||||||
1839 | int LastLowLatencyPos = -1; | |||||||||
1840 | ||||||||||
1841 | for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { | |||||||||
1842 | SUnit *SU = &SUnits[ScheduledSUnits[i]]; | |||||||||
1843 | bool IsLowLatencyUser = false; | |||||||||
1844 | unsigned MinPos = 0; | |||||||||
1845 | ||||||||||
1846 | for (SDep& PredDep : SU->Preds) { | |||||||||
1847 | SUnit *Pred = PredDep.getSUnit(); | |||||||||
1848 | if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { | |||||||||
1849 | IsLowLatencyUser = true; | |||||||||
1850 | } | |||||||||
1851 | if (Pred->NodeNum >= DAGSize) | |||||||||
1852 | continue; | |||||||||
1853 | unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; | |||||||||
1854 | if (PredPos >= MinPos) | |||||||||
1855 | MinPos = PredPos + 1; | |||||||||
1856 | } | |||||||||
1857 | ||||||||||
1858 | if (SITII->isLowLatencyInstruction(*SU->getInstr())) { | |||||||||
1859 | unsigned BestPos = LastLowLatencyUser + 1; | |||||||||
1860 | if ((int)BestPos <= LastLowLatencyPos) | |||||||||
1861 | BestPos = LastLowLatencyPos + 1; | |||||||||
1862 | if (BestPos < MinPos) | |||||||||
1863 | BestPos = MinPos; | |||||||||
1864 | if (BestPos < i) { | |||||||||
1865 | for (unsigned u = i; u > BestPos; --u) { | |||||||||
1866 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; | |||||||||
1867 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; | |||||||||
1868 | } | |||||||||
1869 | ScheduledSUnits[BestPos] = SU->NodeNum; | |||||||||
1870 | ScheduledSUnitsInv[SU->NodeNum] = BestPos; | |||||||||
1871 | } | |||||||||
1872 | LastLowLatencyPos = BestPos; | |||||||||
1873 | if (IsLowLatencyUser) | |||||||||
1874 | LastLowLatencyUser = BestPos; | |||||||||
1875 | } else if (IsLowLatencyUser) { | |||||||||
1876 | LastLowLatencyUser = i; | |||||||||
1877 | // Moves COPY instructions on which depends | |||||||||
1878 | // the low latency instructions too. | |||||||||
1879 | } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { | |||||||||
1880 | bool CopyForLowLat = false; | |||||||||
1881 | for (SDep& SuccDep : SU->Succs) { | |||||||||
1882 | SUnit *Succ = SuccDep.getSUnit(); | |||||||||
1883 | if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { | |||||||||
1884 | CopyForLowLat = true; | |||||||||
1885 | } | |||||||||
1886 | } | |||||||||
1887 | if (!CopyForLowLat) | |||||||||
1888 | continue; | |||||||||
1889 | if (MinPos < i) { | |||||||||
1890 | for (unsigned u = i; u > MinPos; --u) { | |||||||||
1891 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; | |||||||||
1892 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; | |||||||||
1893 | } | |||||||||
1894 | ScheduledSUnits[MinPos] = SU->NodeNum; | |||||||||
1895 | ScheduledSUnitsInv[SU->NodeNum] = MinPos; | |||||||||
1896 | } | |||||||||
1897 | } | |||||||||
1898 | } | |||||||||
1899 | } | |||||||||
1900 | ||||||||||
1901 | void SIScheduleDAGMI::restoreSULinksLeft() { | |||||||||
1902 | for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { | |||||||||
1903 | SUnits[i].isScheduled = false; | |||||||||
1904 | SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; | |||||||||
1905 | SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; | |||||||||
1906 | SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; | |||||||||
1907 | SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; | |||||||||
1908 | } | |||||||||
1909 | } | |||||||||
1910 | ||||||||||
1911 | // Return the Vgpr and Sgpr usage corresponding to some virtual registers. | |||||||||
1912 | template<typename _Iterator> void | |||||||||
1913 | SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, | |||||||||
1914 | unsigned &VgprUsage, unsigned &SgprUsage) { | |||||||||
1915 | VgprUsage = 0; | |||||||||
1916 | SgprUsage = 0; | |||||||||
1917 | for (_Iterator RegI = First; RegI != End; ++RegI) { | |||||||||
1918 | unsigned Reg = *RegI; | |||||||||
1919 | // For now only track virtual registers | |||||||||
1920 | if (!TargetRegisterInfo::isVirtualRegister(Reg)) | |||||||||
1921 | continue; | |||||||||
1922 | PSetIterator PSetI = MRI.getPressureSets(Reg); | |||||||||
1923 | for (; PSetI.isValid(); ++PSetI) { | |||||||||
1924 | if (*PSetI == VGPRSetID) | |||||||||
1925 | VgprUsage += PSetI.getWeight(); | |||||||||
1926 | else if (*PSetI == SGPRSetID) | |||||||||
1927 | SgprUsage += PSetI.getWeight(); | |||||||||
1928 | } | |||||||||
1929 | } | |||||||||
1930 | } | |||||||||
1931 | ||||||||||
1932 | void SIScheduleDAGMI::schedule() | |||||||||
1933 | { | |||||||||
1934 | SmallVector<SUnit*, 8> TopRoots, BotRoots; | |||||||||
1935 | SIScheduleBlockResult Best, Temp; | |||||||||
1936 | DEBUG(dbgs() << "Preparing Scheduling\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Preparing Scheduling\n" ; } } while (false); | |||||||||
1937 | ||||||||||
1938 | buildDAGWithRegPressure(); | |||||||||
1939 | DEBUG(do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for(SUnit& SU : SUnits) SU.dumpAll (this); } } while (false) | |||||||||
1940 | for(SUnit& SU : SUnits)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for(SUnit& SU : SUnits) SU.dumpAll (this); } } while (false) | |||||||||
1941 | SU.dumpAll(this)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for(SUnit& SU : SUnits) SU.dumpAll (this); } } while (false) | |||||||||
1942 | )do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for(SUnit& SU : SUnits) SU.dumpAll (this); } } while (false); | |||||||||
1943 | ||||||||||
1944 | topologicalSort(); | |||||||||
1945 | findRootsAndBiasEdges(TopRoots, BotRoots); | |||||||||
1946 | // We reuse several ScheduleDAGMI and ScheduleDAGMILive | |||||||||
1947 | // functions, but to make them happy we must initialize | |||||||||
1948 | // the default Scheduler implementation (even if we do not | |||||||||
1949 | // run it) | |||||||||
1950 | SchedImpl->initialize(this); | |||||||||
1951 | initQueues(TopRoots, BotRoots); | |||||||||
1952 | ||||||||||
1953 | // Fill some stats to help scheduling. | |||||||||
1954 | ||||||||||
1955 | SUnitsLinksBackup = SUnits; | |||||||||
1956 | IsLowLatencySU.clear(); | |||||||||
1957 | LowLatencyOffset.clear(); | |||||||||
1958 | IsHighLatencySU.clear(); | |||||||||
1959 | ||||||||||
1960 | IsLowLatencySU.resize(SUnits.size(), 0); | |||||||||
1961 | LowLatencyOffset.resize(SUnits.size(), 0); | |||||||||
1962 | IsHighLatencySU.resize(SUnits.size(), 0); | |||||||||
1963 | ||||||||||
1964 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { | |||||||||
1965 | SUnit *SU = &SUnits[i]; | |||||||||
1966 | unsigned BaseLatReg; | |||||||||
1967 | int64_t OffLatReg; | |||||||||
1968 | if (SITII->isLowLatencyInstruction(*SU->getInstr())) { | |||||||||
1969 | IsLowLatencySU[i] = 1; | |||||||||
1970 | if (SITII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseLatReg, OffLatReg, | |||||||||
1971 | TRI)) | |||||||||
1972 | LowLatencyOffset[i] = OffLatReg; | |||||||||
1973 | } else if (SITII->isHighLatencyInstruction(*SU->getInstr())) | |||||||||
1974 | IsHighLatencySU[i] = 1; | |||||||||
1975 | } | |||||||||
1976 | ||||||||||
1977 | SIScheduler Scheduler(this); | |||||||||
1978 | Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, | |||||||||
1979 | SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); | |||||||||
1980 | ||||||||||
1981 | // if VGPR usage is extremely high, try other good performing variants | |||||||||
1982 | // which could lead to lower VGPR usage | |||||||||
1983 | if (Best.MaxVGPRUsage > 180) { | |||||||||
1984 | static const std::pair<SISchedulerBlockCreatorVariant, | |||||||||
1985 | SISchedulerBlockSchedulerVariant> | |||||||||
1986 | Variants[] = { | |||||||||
1987 | { LatenciesAlone, BlockRegUsageLatency }, | |||||||||
1988 | // { LatenciesAlone, BlockRegUsage }, | |||||||||
1989 | { LatenciesGrouped, BlockLatencyRegUsage }, | |||||||||
1990 | // { LatenciesGrouped, BlockRegUsageLatency }, | |||||||||
1991 | // { LatenciesGrouped, BlockRegUsage }, | |||||||||
1992 | { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, | |||||||||
1993 | // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, | |||||||||
1994 | // { LatenciesAlonePlusConsecutive, BlockRegUsage } | |||||||||
1995 | }; | |||||||||
1996 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { | |||||||||
1997 | Temp = Scheduler.scheduleVariant(v.first, v.second); | |||||||||
1998 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) | |||||||||
1999 | Best = Temp; | |||||||||
2000 | } | |||||||||
2001 | } | |||||||||
2002 | // if VGPR usage is still extremely high, we may spill. Try other variants | |||||||||
2003 | // which are less performing, but that could lead to lower VGPR usage. | |||||||||
2004 | if (Best.MaxVGPRUsage > 200) { | |||||||||
2005 | static const std::pair<SISchedulerBlockCreatorVariant, | |||||||||
2006 | SISchedulerBlockSchedulerVariant> | |||||||||
2007 | Variants[] = { | |||||||||
2008 | // { LatenciesAlone, BlockRegUsageLatency }, | |||||||||
2009 | { LatenciesAlone, BlockRegUsage }, | |||||||||
2010 | // { LatenciesGrouped, BlockLatencyRegUsage }, | |||||||||
2011 | { LatenciesGrouped, BlockRegUsageLatency }, | |||||||||
2012 | { LatenciesGrouped, BlockRegUsage }, | |||||||||
2013 | // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, | |||||||||
2014 | { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, | |||||||||
2015 | { LatenciesAlonePlusConsecutive, BlockRegUsage } | |||||||||
2016 | }; | |||||||||
2017 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { | |||||||||
2018 | Temp = Scheduler.scheduleVariant(v.first, v.second); | |||||||||
2019 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) | |||||||||
2020 | Best = Temp; | |||||||||
2021 | } | |||||||||
2022 | } | |||||||||
2023 | ||||||||||
2024 | ScheduledSUnits = Best.SUs; | |||||||||
2025 | ScheduledSUnitsInv.resize(SUnits.size()); | |||||||||
2026 | ||||||||||
2027 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { | |||||||||
2028 | ScheduledSUnitsInv[ScheduledSUnits[i]] = i; | |||||||||
2029 | } | |||||||||
2030 | ||||||||||
2031 | moveLowLatencies(); | |||||||||
2032 | ||||||||||
2033 | // Tell the outside world about the result of the scheduling. | |||||||||
2034 | ||||||||||
2035 | assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker")(static_cast <bool> (TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker") ? void (0) : __assert_fail ("TopRPTracker.getPos() == RegionBegin && \"bad initial Top tracker\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 2035, __extension__ __PRETTY_FUNCTION__)); | |||||||||
2036 | TopRPTracker.setPos(CurrentTop); | |||||||||
2037 | ||||||||||
2038 | for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), | |||||||||
2039 | E = ScheduledSUnits.end(); I != E; ++I) { | |||||||||
2040 | SUnit *SU = &SUnits[*I]; | |||||||||
2041 | ||||||||||
2042 | scheduleMI(SU, true); | |||||||||
2043 | ||||||||||
2044 | DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr(); } } while (false) | |||||||||
2045 | << *SU->getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr(); } } while (false); | |||||||||
2046 | } | |||||||||
2047 | ||||||||||
2048 | assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.")(static_cast <bool> (CurrentTop == CurrentBottom && "Nonempty unscheduled zone.") ? void (0) : __assert_fail ("CurrentTop == CurrentBottom && \"Nonempty unscheduled zone.\"" , "/build/llvm-toolchain-snapshot-7~svn329677/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 2048, __extension__ __PRETTY_FUNCTION__)); | |||||||||
2049 | ||||||||||
2050 | placeDebugValues(); | |||||||||
2051 | ||||||||||
2052 | DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||||||||
2053 | dbgs() << "*** Final schedule for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||||||||
2054 | << printMBBReference(*begin()->getParent()) << " ***\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||||||||
2055 | dumpSchedule();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||||||||
2056 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||||||||
2057 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false); | |||||||||
2058 | } |