Line data Source code
//===- SIInsertWaits.cpp - Insert waits for memory operations -------------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : /// \file
11 : /// Insert wait instructions for memory reads and writes.
12 : ///
13 : /// Memory reads and writes are issued asynchronously, so we need to insert
14 : /// S_WAITCNT instructions when we want to access any of their results or
15 : /// overwrite any register that's used asynchronously.
16 : //
17 : //===----------------------------------------------------------------------===//
18 :
19 : #include "AMDGPU.h"
20 : #include "AMDGPUSubtarget.h"
21 : #include "SIDefines.h"
22 : #include "SIInstrInfo.h"
23 : #include "SIMachineFunctionInfo.h"
24 : #include "SIRegisterInfo.h"
25 : #include "Utils/AMDGPUBaseInfo.h"
26 : #include "llvm/ADT/SmallVector.h"
27 : #include "llvm/ADT/StringRef.h"
28 : #include "llvm/CodeGen/MachineBasicBlock.h"
29 : #include "llvm/CodeGen/MachineFunction.h"
30 : #include "llvm/CodeGen/MachineFunctionPass.h"
31 : #include "llvm/CodeGen/MachineInstr.h"
32 : #include "llvm/CodeGen/MachineInstrBuilder.h"
33 : #include "llvm/CodeGen/MachineOperand.h"
34 : #include "llvm/CodeGen/MachineRegisterInfo.h"
35 : #include "llvm/IR/DebugLoc.h"
36 : #include "llvm/MC/MCInstrDesc.h"
37 : #include "llvm/Pass.h"
38 : #include "llvm/Support/Debug.h"
39 : #include "llvm/Support/raw_ostream.h"
40 : #include <algorithm>
41 : #include <cassert>
42 : #include <cstdint>
43 : #include <cstring>
44 : #include <utility>
45 :
46 : #define DEBUG_TYPE "si-insert-waits"
47 :
48 : using namespace llvm;
49 :
50 : namespace {
51 :
/// One variable for each of the hardware counters (VM_CNT, EXP_CNT, LGKM_CNT).
/// The union lets code address the counters either by name or uniformly by
/// index when looping over all three.
using Counters = union {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;
  unsigned Array[3]; // Same storage as Named, viewed as an array for loops.
};

/// Rough class of the most recently issued instruction; used to detect
/// back-to-back VMEM/SMEM instructions (clauses) that must be broken.
using InstType = enum {
  OTHER,
  SMEM,
  VMEM
};

/// Per-register counter snapshots, indexed by hardware register encoding.
using RegCounters = Counters[512];
/// Half-open range [first, second) of 32-bit register encoding slots.
using RegInterval = std::pair<unsigned, unsigned>;
70 :
/// Machine-function pass that inserts S_WAITCNT instructions so results of
/// asynchronous memory operations are available before they are read, and so
/// asynchronously-read registers are not overwritten too early.
class SIInsertWaits : public MachineFunctionPass {
private:
  const SISubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI; // Set in runOnMachineFunction.
  AMDGPU::IsaInfo::IsaVersion ISA; // Selects the waitcnt field encoding.

  /// Constant zero value
  static const Counters ZeroCounts;

  /// Hardware limits (maximum encodable value for each counter field).
  Counters HardwareLimits;

  /// Counter values we have already waited on.
  Counters WaitedOn;

  /// Counter values that we must wait on before the next counter
  /// increase.
  Counters DelayedWaitOn;

  /// Counter values for last instruction issued.
  Counters LastIssued;

  /// Registers used by async instructions.
  RegCounters UsedRegs;

  /// Registers defined by async instructions.
  RegCounters DefinedRegs;

  /// Different export instruction types seen since last wait.
  unsigned ExpInstrTypesSeen = 0;

  /// Type of the last opcode.
  InstType LastOpcodeType;

  /// Whether the previous instruction wrote M0 (S_SENDMSG needs an
  /// intervening S_NOP in that case).
  bool LastInstWritesM0;

  /// Whether or not we have flat operations outstanding.
  bool IsFlatOutstanding;

  /// Whether the machine function returns void
  bool ReturnsVoid;

  /// Whether the VCCZ bit is possibly corrupt (SI/CI hardware bug).
  bool VCCZCorrupt = false;

  /// Get increment/decrement amount for this instruction.
  Counters getHwCounts(MachineInstr &MI);

  /// Is operand relevant for async execution?
  bool isOpRelevant(MachineOperand &Op);

  /// Get register interval an operand affects.
  RegInterval getRegInterval(const TargetRegisterClass *RC,
                             const MachineOperand &Reg) const;

  /// Handle instructions async components
  void pushInstruction(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I,
                       const Counters& Increment);

  /// Insert the actual wait instruction
  bool insertWait(MachineBasicBlock &MBB,
                  MachineBasicBlock::iterator I,
                  const Counters &Counts);

  /// Handle existing wait instructions (from intrinsics)
  void handleExistingWait(MachineBasicBlock::iterator I);

  /// Do we need def2def checks?
  bool unorderedDefines(MachineInstr &MI);

  /// Resolve all operand dependencies to counter requirements
  Counters handleOperands(MachineInstr &MI);

  /// Insert S_NOP between an instruction writing M0 and S_SENDMSG.
  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  /// Return true if there are LGKM instructions that haven't been waited on
  /// yet.
  bool hasOutstandingLGKM() const;

public:
  static char ID;

  SIInsertWaits() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
170 :
171 : } // end anonymous namespace
172 :
// Register the pass with the LLVM pass registry under DEBUG_TYPE
// ("si-insert-waits"); no analysis dependencies are declared.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
                      "SI Insert Waits", false, false)
INITIALIZE_PASS_END(SIInsertWaits, DEBUG_TYPE,
                    "SI Insert Waits", false, false)

char SIInsertWaits::ID = 0;

// Exported pass-ID handle used by the target pass pipeline setup.
char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
181 :
/// Factory used by the AMDGPU target machine to create this pass.
FunctionPass *llvm::createSIInsertWaitsPass() {
  return new SIInsertWaits();
}
185 :
186 : const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
187 :
188 : static bool readsVCCZ(const MachineInstr &MI) {
189 32 : unsigned Opc = MI.getOpcode();
190 34 : return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
191 2 : !MI.getOperand(1).isUndef();
192 : }
193 :
/// Return true if LGKM operations (SMEM/LDS/GDS) have been issued that the
/// program has not yet waited on, i.e. the waited-on LGKM count lags the
/// last-issued count.
bool SIInsertWaits::hasOutstandingLGKM() const {
  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
}
197 :
/// Compute how much each hardware counter is incremented by issuing \p MI,
/// derived from the instruction's TSFlags.
Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
  uint64_t TSFlags = MI.getDesc().TSFlags;
  Counters Result = { { 0, 0, 0 } };

  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);

  // Only consider stores or EXP for EXP_CNT
  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();

  // LGKM may use larger increments: SMRD loads bump the counter once per
  // 32-bit result register pair (see below).
  if (TSFlags & SIInstrFlags::LGKM_CNT) {

    if (TII->isSMRD(MI)) {

      if (MI.getNumOperands() != 0) {
        assert(MI.getOperand(0).isReg() &&
               "First LGKM operand must be a register!");

        // XXX - What if this is a write into a super register?
        const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
        unsigned Size = TRI->getRegSizeInBits(*RC);
        // Wide (>32-bit) SMRD results count as 2; single dword counts as 1.
        Result.Named.LGKM = Size > 32 ? 2 : 1;
      } else {
        // s_dcache_inv etc. do not have a destination register. Assume we
        // want a wait on these.
        // XXX - What is the right value?
        Result.Named.LGKM = 1;
      }
    } else {
      // DS
      Result.Named.LGKM = 1;
    }

  } else {
    Result.Named.LGKM = 0;
  }

  return Result;
}
237 :
/// Decide whether operand \p Op participates in asynchronous execution and
/// therefore needs per-register counter tracking. Defs are always relevant;
/// uses are relevant only for exports and for the data operand of stores.
bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
  // Constants are always irrelevant
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
    return false;

  // Defines are always relevant
  if (Op.isDef())
    return true;

  // For exports all registers are relevant.
  // TODO: Skip undef/disabled registers.
  MachineInstr &MI = *Op.getParent();
  if (TII->isEXP(MI))
    return true;

  // For stores the stored value is also relevant
  if (!MI.getDesc().mayStore())
    return false;

  // Check if this operand is the value being stored.
  // Special case for DS/FLAT instructions, since the address
  // operand comes before the value operand and it may have
  // multiple data operands.

  if (TII->isDS(MI)) {
    // DS may carry up to two data operands (e.g. write2); match either.
    MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
    if (Data0 && Op.isIdenticalTo(*Data0))
      return true;

    MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
    return Data1 && Op.isIdenticalTo(*Data1);
  }

  if (TII->isFLAT(MI)) {
    MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
    if (Data && Op.isIdenticalTo(*Data))
      return true;
  }

  // NOTE: This assumes that the value operand is before the
  // address operand, and that there is only one value operand.
  for (MachineInstr::mop_iterator I = MI.operands_begin(),
       E = MI.operands_end(); I != E; ++I) {

    if (I->isReg() && I->isUse())
      return Op.isIdenticalTo(*I);
  }

  return false;
}
288 :
289 : RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
290 : const MachineOperand &Reg) const {
291 : unsigned Size = TRI->getRegSizeInBits(*RC);
292 : assert(Size >= 32);
293 :
294 : RegInterval Result;
295 270 : Result.first = TRI->getEncodingValue(Reg.getReg());
296 135 : Result.second = Result.first + Size / 32;
297 :
298 : return Result;
299 : }
300 :
/// Account for instruction \p I being issued: bump the last-issued counters
/// by \p Increment, record which registers it defines/uses, and break
/// VMEM/SMEM clauses on VI+ with an S_NOP where required.
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    const Counters &Increment) {
  // Get the hardware counter increments and sum them up
  Counters Limit = ZeroCounts;
  unsigned Sum = 0;

  // Flat accesses can complete out of order, which makes VM_CNT unordered.
  if (TII->mayAccessFlatAddressSpace(*I))
    IsFlatOutstanding = true;

  for (unsigned i = 0; i < 3; ++i) {
    LastIssued.Array[i] += Increment.Array[i];
    if (Increment.Array[i])
      Limit.Array[i] = LastIssued.Array[i];
    Sum += Increment.Array[i];
  }

  // If we don't increase anything then that's it
  if (Sum == 0) {
    LastOpcodeType = OTHER;
    return;
  }

  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
    // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
    // or SMEM clause, respectively.
    //
    // The temporary workaround is to break the clauses with S_NOP.
    //
    // The proper solution would be to allocate registers such that all source
    // and destination registers don't overlap, e.g. this is illegal:
    //   r0 = load r2
    //   r2 = load r0
    if (LastOpcodeType == VMEM && Increment.Named.VM) {
      // Insert a NOP to break the clause.
      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
          .addImm(0);
      LastInstWritesM0 = false;
    }

    if (TII->isSMRD(*I))
      LastOpcodeType = SMEM;
    else if (Increment.Named.VM)
      LastOpcodeType = VMEM;
  }

  // Remember which export instruction types we have seen (bit 1 = EXP,
  // bit 2 = VM-write); both together make EXP_CNT unordered.
  if (Increment.Named.EXP) {
    ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
  }

  // Snapshot the post-issue counter values for every relevant register slot.
  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
    MachineOperand &Op = I->getOperand(i);
    if (!isOpRelevant(Op))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {

      // Remember which registers we define
      if (Op.isDef())
        DefinedRegs[j] = Limit;

      // and which one we are using
      if (Op.isUse())
        UsedRegs[j] = Limit;
    }
  }
}
371 :
/// Insert an S_WAITCNT before \p I if the counter values in \p Required have
/// not already been waited on. Returns true if an instruction was inserted.
bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const Counters &Required) {
  // End of program? No need to wait on anything
  // A function not returning void needs to wait, because other bytecode will
  // be appended after it and we don't know what it will be.
  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
    return false;

  // Figure out if the async instructions execute in order
  bool Ordered[3];

  // VM_CNT is always ordered except when there are flat instructions, which
  // can return out of order.
  Ordered[0] = !IsFlatOutstanding;

  // EXP_CNT is unordered if we have both EXP & VM-writes
  Ordered[1] = ExpInstrTypesSeen == 3;

  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
  Ordered[2] = false;

  // The values we are going to put into the S_WAITCNT instruction
  Counters Counts = HardwareLimits;

  // Do we really need to wait?
  bool NeedWait = false;

  for (unsigned i = 0; i < 3; ++i) {
    // Already covered by an earlier wait.
    if (Required.Array[i] <= WaitedOn.Array[i])
      continue;

    NeedWait = true;

    if (Ordered[i]) {
      // Ordered counter: wait until only (LastIssued - Required) operations
      // remain outstanding.
      unsigned Value = LastIssued.Array[i] - Required.Array[i];

      // Adjust the value to the real hardware possibilities.
      Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
    } else
      // Unordered counter: must drain completely.
      Counts.Array[i] = 0;

    // Remember on what we have waited on.
    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
  }

  if (!NeedWait)
    return false;

  // Reset EXP_CNT instruction types
  if (Counts.Named.EXP == 0)
    ExpInstrTypesSeen = 0;

  // Build the wait instruction
  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(AMDGPU::encodeWaitcnt(ISA,
                                    Counts.Named.VM,
                                    Counts.Named.EXP,
                                    Counts.Named.LGKM));

  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  return true;
}
437 :
438 : /// helper function for handleOperands
439 : static void increaseCounters(Counters &Dst, const Counters &Src) {
440 1786 : for (unsigned i = 0; i < 3; ++i)
441 1692 : Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
442 : }
443 :
444 : /// check whether any of the counters is non-zero
445 : static bool countersNonZero(const Counters &Counter) {
446 479 : for (unsigned i = 0; i < 3; ++i)
447 225 : if (Counter.Array[i])
448 : return true;
449 : return false;
450 : }
451 :
/// Record a pre-existing S_WAITCNT (e.g. from an intrinsic): decode its
/// immediate and fold the implied waited-on values into DelayedWaitOn so the
/// explicit wait is honored before the next counter increase.
void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
  assert(I->getOpcode() == AMDGPU::S_WAITCNT);

  unsigned Imm = I->getOperand(0).getImm();
  Counters Counts, WaitOn;

  Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
  Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
  Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);

  for (unsigned i = 0; i < 3; ++i) {
    // The immediate is the number of operations allowed to stay outstanding,
    // so the wait target is LastIssued minus that allowance (clamped at 0).
    if (Counts.Array[i] <= LastIssued.Array[i])
      WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
    else
      WaitOn.Array[i] = 0;
  }

  increaseCounters(DelayedWaitOn, WaitOn);
}
471 :
/// Compute the counter values that must be waited on before \p MI executes,
/// by taking the maximum over the recorded counters of every register slot
/// the instruction defines or uses.
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
  Counters Result = ZeroCounts;

  // For each register affected by this instruction increase the result
  // sequence.
  //
  // TODO: We could probably just look at explicit operands if we removed VCC /
  // EXEC from SMRD dest reg classes.
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
      continue;

    const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
    RegInterval Interval = getRegInterval(RC, Op);
    for (unsigned j = Interval.first; j < Interval.second; ++j) {
      // A def must wait for earlier async uses (anti-dependence) and earlier
      // async defs (output dependence) of the same slot.
      if (Op.isDef()) {
        increaseCounters(Result, UsedRegs[j]);
        increaseCounters(Result, DefinedRegs[j]);
      }

      // A use must wait for earlier async defs (true dependence).
      if (Op.isUse())
        increaseCounters(Result, DefinedRegs[j]);
    }
  }

  return Result;
}
500 :
/// On VI+, insert an S_NOP between an instruction that writes M0 and a
/// following S_SENDMSG/S_SENDMSGHALT, and track whether \p I writes M0.
void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I) {
  // The hazard only exists on VOLCANIC_ISLANDS and newer.
  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
    return;

  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
  if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
    LastInstWritesM0 = false;
    return;
  }

  // Set whether this instruction sets M0
  LastInstWritesM0 = false;

  unsigned NumOperands = I->getNumOperands();
  for (unsigned i = 0; i < NumOperands; i++) {
    const MachineOperand &Op = I->getOperand(i);

    if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
      LastInstWritesM0 = true;
  }
}
524 :
525 : /// Return true if \p MBB has one successor immediately following, and is its
526 : /// only predecessor
527 16 : static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
528 16 : if (MBB.succ_size() != 1)
529 : return false;
530 :
531 8 : const MachineBasicBlock *Succ = *MBB.succ_begin();
532 8 : return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
533 : }
534 :
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
// around other non-memory instructions.
/// Main driver: walks every instruction of \p MF, tracks hardware counters,
/// and inserts S_WAITCNT (plus related workaround instructions) as needed.
/// Returns true if the function was modified.
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
  bool Changes = false;

  ST = &MF.getSubtarget<SISubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Maximum encodable value per counter field for this ISA version.
  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);

  // Reset all per-function tracking state.
  WaitedOn = ZeroCounts;
  DelayedWaitOn = ZeroCounts;
  LastIssued = ZeroCounts;
  LastOpcodeType = OTHER;
  LastInstWritesM0 = false;
  IsFlatOutstanding = false;
  ReturnsVoid = MFI->returnsVoid();

  memset(&UsedRegs, 0, sizeof(UsedRegs));
  memset(&DefinedRegs, 0, sizeof(DefinedRegs));

  SmallVector<MachineInstr *, 4> RemoveMI;
  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;

  bool HaveScalarStores = false;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {
    MachineBasicBlock &MBB = *BI;

    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      if (!HaveScalarStores && TII->isScalarStore(*I))
        HaveScalarStores = true;

      if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
        // There is a hardware bug on CI/SI where SMRD instruction may corrupt
        // vccz bit, so when we detect that an instruction may read from a
        // corrupt vccz bit, we need to:
        // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
        //    operations to complete.
        // 2. Restore the correct value of vccz by writing the current value
        //    of vcc back to vcc.

        if (TII->isSMRD(I->getOpcode())) {
          VCCZCorrupt = true;
        } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
          // FIXME: We only care about SMRD instructions here, not LDS or GDS.
          // Whenever we store a value in vcc, the correct value of vccz is
          // restored.
          VCCZCorrupt = false;
        }

        // Check if we need to apply the bug work-around
        if (VCCZCorrupt && readsVCCZ(*I)) {
          DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');

          // Wait on everything, not just LGKM. vccz reads usually come from
          // terminators, and we always wait on everything at the end of the
          // block, so if we only wait on LGKM here, we might end up with
          // another s_waitcnt inserted right after this if there are non-LGKM
          // instructions still outstanding.
          insertWait(MBB, I, LastIssued);

          // Restore the vccz bit. Any time a value is written to vcc, the vcc
          // bit is updated, so we can restore the bit by reading the value of
          // vcc and then writing it back to the register.
          BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
                  AMDGPU::VCC)
            .addReg(AMDGPU::VCC);
        }
      }

      // Record pre-existing, explicitly requested waits
      if (I->getOpcode() == AMDGPU::S_WAITCNT) {
        handleExistingWait(*I);
        RemoveMI.push_back(&*I);
        continue;
      }

      Counters Required;

      // Wait for everything before a barrier.
      //
      // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
      // but we also want to wait for any other outstanding transfers before
      // signalling other hardware blocks
      if ((I->getOpcode() == AMDGPU::S_BARRIER &&
           !ST->hasAutoWaitcntBeforeBarrier()) ||
           I->getOpcode() == AMDGPU::S_SENDMSG ||
           I->getOpcode() == AMDGPU::S_SENDMSGHALT)
        Required = LastIssued;
      else
        Required = handleOperands(*I);

      Counters Increment = getHwCounts(*I);

      // Fold in any explicitly requested waits before the counters change.
      if (countersNonZero(Required) || countersNonZero(Increment))
        increaseCounters(Required, DelayedWaitOn);

      Changes |= insertWait(MBB, I, Required);

      pushInstruction(MBB, I, Increment);
      handleSendMsg(MBB, I);

      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
        EndPgmBlocks.push_back(&MBB);
    }

    // Wait for everything at the end of the MBB. If there is only one
    // successor, we can defer this until the uses there.
    if (!hasTrivialSuccessor(MBB))
      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
  }

  if (HaveScalarStores) {
    // If scalar writes are used, the cache must be flushed or else the next
    // wave to reuse the same scratch memory can be clobbered.
    //
    // Insert s_dcache_wb at wave termination points if there were any scalar
    // stores, and only if the cache hasn't already been flushed. This could be
    // improved by looking across blocks for flushes in postdominating blocks
    // from the stores but an explicitly requested flush is probably very rare.
    for (MachineBasicBlock *MBB : EndPgmBlocks) {
      bool SeenDCacheWB = false;

      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
           I != E; ++I) {
        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
          SeenDCacheWB = true;
        else if (TII->isScalarStore(*I))
          SeenDCacheWB = false;

        // FIXME: It would be better to insert this before a waitcnt if any.
        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
          Changes = true;
          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
        }
      }
    }
  }

  // Explicit waits recorded above are now fully accounted for; drop them.
  for (MachineInstr *I : RemoveMI)
    I->eraseFromParent();

  if (!MFI->isEntryFunction()) {
    // Wait for any outstanding memory operations that the input registers may
    // depend on. We can't track them and it's better to the wait after the
    // costly call sequence.

    // TODO: Could insert earlier and schedule more liberally with operations
    // that only use caller preserved registers.
    MachineBasicBlock &EntryBB = MF.front();
    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
      .addImm(0);

    Changes = true;
  }

  return Changes;
}
|