// Source listing extracted from LLVM 6.0.0svn documentation: SIInsertWaits.cpp
1 //===- SILowerControlFlow.cpp - Use predicates for control flow -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/StringRef.h"
35 #include "llvm/IR/DebugLoc.h"
36 #include "llvm/MC/MCInstrDesc.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Debug.h"
40 #include <algorithm>
41 #include <cassert>
42 #include <cstdint>
43 #include <cstring>
44 #include <utility>
45 
46 #define DEBUG_TYPE "si-insert-waits"
47 
48 using namespace llvm;
49 
50 namespace {
51 
/// \brief One variable for each of the hardware counters
///
/// The union allows the three counters to be addressed either by name
/// (VM / EXP / LGKM) or uniformly by index through Array[3]; the pass
/// relies on Named and Array aliasing the same storage.
using Counters = union {
  struct {
    unsigned VM;
    unsigned EXP;
    unsigned LGKM;
  } Named;
  unsigned Array[3];
};
61 
/// Classification of the most recently issued instruction, used to detect
/// back-to-back VMEM/SMEM sequences (clauses) that must be broken with S_NOP
/// on VOLCANIC_ISLANDS and later (see pushInstruction).
using InstType = enum {
  OTHER,
  SMEM,
  VMEM
};
67 
68 using RegCounters = Counters[512];
69 using RegInterval = std::pair<unsigned, unsigned>;
70 
71 class SIInsertWaits : public MachineFunctionPass {
72 private:
73  const SISubtarget *ST = nullptr;
74  const SIInstrInfo *TII = nullptr;
75  const SIRegisterInfo *TRI = nullptr;
76  const MachineRegisterInfo *MRI;
78 
79  /// \brief Constant zero value
80  static const Counters ZeroCounts;
81 
82  /// \brief Hardware limits
83  Counters HardwareLimits;
84 
85  /// \brief Counter values we have already waited on.
86  Counters WaitedOn;
87 
88  /// \brief Counter values that we must wait on before the next counter
89  /// increase.
90  Counters DelayedWaitOn;
91 
92  /// \brief Counter values for last instruction issued.
93  Counters LastIssued;
94 
95  /// \brief Registers used by async instructions.
96  RegCounters UsedRegs;
97 
98  /// \brief Registers defined by async instructions.
99  RegCounters DefinedRegs;
100 
101  /// \brief Different export instruction types seen since last wait.
102  unsigned ExpInstrTypesSeen = 0;
103 
104  /// \brief Type of the last opcode.
105  InstType LastOpcodeType;
106 
107  bool LastInstWritesM0;
108 
109  /// Whether or not we have flat operations outstanding.
110  bool IsFlatOutstanding;
111 
112  /// \brief Whether the machine function returns void
113  bool ReturnsVoid;
114 
115  /// Whether the VCCZ bit is possibly corrupt
116  bool VCCZCorrupt = false;
117 
118  /// \brief Get increment/decrement amount for this instruction.
119  Counters getHwCounts(MachineInstr &MI);
120 
121  /// \brief Is operand relevant for async execution?
122  bool isOpRelevant(MachineOperand &Op);
123 
124  /// \brief Get register interval an operand affects.
125  RegInterval getRegInterval(const TargetRegisterClass *RC,
126  const MachineOperand &Reg) const;
127 
128  /// \brief Handle instructions async components
129  void pushInstruction(MachineBasicBlock &MBB,
131  const Counters& Increment);
132 
133  /// \brief Insert the actual wait instruction
134  bool insertWait(MachineBasicBlock &MBB,
136  const Counters &Counts);
137 
138  /// \brief Handle existing wait instructions (from intrinsics)
139  void handleExistingWait(MachineBasicBlock::iterator I);
140 
141  /// \brief Do we need def2def checks?
142  bool unorderedDefines(MachineInstr &MI);
143 
144  /// \brief Resolve all operand dependencies to counter requirements
145  Counters handleOperands(MachineInstr &MI);
146 
147  /// \brief Insert S_NOP between an instruction writing M0 and S_SENDMSG.
148  void handleSendMsg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
149 
150  /// Return true if there are LGKM instrucitons that haven't been waited on
151  /// yet.
152  bool hasOutstandingLGKM() const;
153 
154 public:
155  static char ID;
156 
157  SIInsertWaits() : MachineFunctionPass(ID) {}
158 
159  bool runOnMachineFunction(MachineFunction &MF) override;
160 
161  StringRef getPassName() const override {
162  return "SI insert wait instructions";
163  }
164 
165  void getAnalysisUsage(AnalysisUsage &AU) const override {
166  AU.setPreservesCFG();
168  }
169 };
170 
171 } // end anonymous namespace
172 
173 INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE,
174  "SI Insert Waits", false, false)
176  "SI Insert Waits", false, false)
177 
178 char SIInsertWaits::ID = 0;
179 
180 char &llvm::SIInsertWaitsID = SIInsertWaits::ID;
181 
183  return new SIInsertWaits();
184 }
185 
186 const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
187 
188 static bool readsVCCZ(const MachineInstr &MI) {
189  unsigned Opc = MI.getOpcode();
190  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
191  !MI.getOperand(1).isUndef();
192 }
193 
194 bool SIInsertWaits::hasOutstandingLGKM() const {
195  return WaitedOn.Named.LGKM != LastIssued.Named.LGKM;
196 }
197 
198 Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
199  uint64_t TSFlags = MI.getDesc().TSFlags;
200  Counters Result = { { 0, 0, 0 } };
201 
202  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
203 
204  // Only consider stores or EXP for EXP_CNT
205  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT) && MI.mayStore();
206 
207  // LGKM may uses larger values
208  if (TSFlags & SIInstrFlags::LGKM_CNT) {
209 
210  if (TII->isSMRD(MI)) {
211 
212  if (MI.getNumOperands() != 0) {
213  assert(MI.getOperand(0).isReg() &&
214  "First LGKM operand must be a register!");
215 
216  // XXX - What if this is a write into a super register?
217  const TargetRegisterClass *RC = TII->getOpRegClass(MI, 0);
218  unsigned Size = TRI->getRegSizeInBits(*RC);
219  Result.Named.LGKM = Size > 32 ? 2 : 1;
220  } else {
221  // s_dcache_inv etc. do not have a a destination register. Assume we
222  // want a wait on these.
223  // XXX - What is the right value?
224  Result.Named.LGKM = 1;
225  }
226  } else {
227  // DS
228  Result.Named.LGKM = 1;
229  }
230 
231  } else {
232  Result.Named.LGKM = 0;
233  }
234 
235  return Result;
236 }
237 
238 bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
239  // Constants are always irrelevant
240  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
241  return false;
242 
243  // Defines are always relevant
244  if (Op.isDef())
245  return true;
246 
247  // For exports all registers are relevant.
248  // TODO: Skip undef/disabled registers.
249  MachineInstr &MI = *Op.getParent();
250  if (TII->isEXP(MI))
251  return true;
252 
253  // For stores the stored value is also relevant
254  if (!MI.getDesc().mayStore())
255  return false;
256 
257  // Check if this operand is the value being stored.
258  // Special case for DS/FLAT instructions, since the address
259  // operand comes before the value operand and it may have
260  // multiple data operands.
261 
262  if (TII->isDS(MI)) {
263  MachineOperand *Data0 = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
264  if (Data0 && Op.isIdenticalTo(*Data0))
265  return true;
266 
267  MachineOperand *Data1 = TII->getNamedOperand(MI, AMDGPU::OpName::data1);
268  return Data1 && Op.isIdenticalTo(*Data1);
269  }
270 
271  if (TII->isFLAT(MI)) {
272  MachineOperand *Data = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
273  if (Data && Op.isIdenticalTo(*Data))
274  return true;
275  }
276 
277  // NOTE: This assumes that the value operand is before the
278  // address operand, and that there is only one value operand.
280  E = MI.operands_end(); I != E; ++I) {
281 
282  if (I->isReg() && I->isUse())
283  return Op.isIdenticalTo(*I);
284  }
285 
286  return false;
287 }
288 
289 RegInterval SIInsertWaits::getRegInterval(const TargetRegisterClass *RC,
290  const MachineOperand &Reg) const {
291  unsigned Size = TRI->getRegSizeInBits(*RC);
292  assert(Size >= 32);
293 
294  RegInterval Result;
295  Result.first = TRI->getEncodingValue(Reg.getReg());
296  Result.second = Result.first + Size / 32;
297 
298  return Result;
299 }
300 
301 void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
303  const Counters &Increment) {
304  // Get the hardware counter increments and sum them up
305  Counters Limit = ZeroCounts;
306  unsigned Sum = 0;
307 
308  if (TII->mayAccessFlatAddressSpace(*I))
309  IsFlatOutstanding = true;
310 
311  for (unsigned i = 0; i < 3; ++i) {
312  LastIssued.Array[i] += Increment.Array[i];
313  if (Increment.Array[i])
314  Limit.Array[i] = LastIssued.Array[i];
315  Sum += Increment.Array[i];
316  }
317 
318  // If we don't increase anything then that's it
319  if (Sum == 0) {
320  LastOpcodeType = OTHER;
321  return;
322  }
323 
324  if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
325  // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
326  // or SMEM clause, respectively.
327  //
328  // The temporary workaround is to break the clauses with S_NOP.
329  //
330  // The proper solution would be to allocate registers such that all source
331  // and destination registers don't overlap, e.g. this is illegal:
332  // r0 = load r2
333  // r2 = load r0
334  if (LastOpcodeType == VMEM && Increment.Named.VM) {
335  // Insert a NOP to break the clause.
336  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP))
337  .addImm(0);
338  LastInstWritesM0 = false;
339  }
340 
341  if (TII->isSMRD(*I))
342  LastOpcodeType = SMEM;
343  else if (Increment.Named.VM)
344  LastOpcodeType = VMEM;
345  }
346 
347  // Remember which export instructions we have seen
348  if (Increment.Named.EXP) {
349  ExpInstrTypesSeen |= TII->isEXP(*I) ? 1 : 2;
350  }
351 
352  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
353  MachineOperand &Op = I->getOperand(i);
354  if (!isOpRelevant(Op))
355  continue;
356 
357  const TargetRegisterClass *RC = TII->getOpRegClass(*I, i);
358  RegInterval Interval = getRegInterval(RC, Op);
359  for (unsigned j = Interval.first; j < Interval.second; ++j) {
360 
361  // Remember which registers we define
362  if (Op.isDef())
363  DefinedRegs[j] = Limit;
364 
365  // and which one we are using
366  if (Op.isUse())
367  UsedRegs[j] = Limit;
368  }
369  }
370 }
371 
372 bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
374  const Counters &Required) {
375  // End of program? No need to wait on anything
376  // A function not returning void needs to wait, because other bytecode will
377  // be appended after it and we don't know what it will be.
378  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
379  return false;
380 
381  // Figure out if the async instructions execute in order
382  bool Ordered[3];
383 
384  // VM_CNT is always ordered except when there are flat instructions, which
385  // can return out of order.
386  Ordered[0] = !IsFlatOutstanding;
387 
388  // EXP_CNT is unordered if we have both EXP & VM-writes
389  Ordered[1] = ExpInstrTypesSeen == 3;
390 
391  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
392  Ordered[2] = false;
393 
394  // The values we are going to put into the S_WAITCNT instruction
395  Counters Counts = HardwareLimits;
396 
397  // Do we really need to wait?
398  bool NeedWait = false;
399 
400  for (unsigned i = 0; i < 3; ++i) {
401  if (Required.Array[i] <= WaitedOn.Array[i])
402  continue;
403 
404  NeedWait = true;
405 
406  if (Ordered[i]) {
407  unsigned Value = LastIssued.Array[i] - Required.Array[i];
408 
409  // Adjust the value to the real hardware possibilities.
410  Counts.Array[i] = std::min(Value, HardwareLimits.Array[i]);
411  } else
412  Counts.Array[i] = 0;
413 
414  // Remember on what we have waited on.
415  WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
416  }
417 
418  if (!NeedWait)
419  return false;
420 
421  // Reset EXP_CNT instruction types
422  if (Counts.Named.EXP == 0)
423  ExpInstrTypesSeen = 0;
424 
425  // Build the wait instruction
426  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
427  .addImm(AMDGPU::encodeWaitcnt(ISA,
428  Counts.Named.VM,
429  Counts.Named.EXP,
430  Counts.Named.LGKM));
431 
432  LastOpcodeType = OTHER;
433  LastInstWritesM0 = false;
434  IsFlatOutstanding = false;
435  return true;
436 }
437 
438 /// \brief helper function for handleOperands
439 static void increaseCounters(Counters &Dst, const Counters &Src) {
440  for (unsigned i = 0; i < 3; ++i)
441  Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
442 }
443 
444 /// \brief check whether any of the counters is non-zero
445 static bool countersNonZero(const Counters &Counter) {
446  for (unsigned i = 0; i < 3; ++i)
447  if (Counter.Array[i])
448  return true;
449  return false;
450 }
451 
452 void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
453  assert(I->getOpcode() == AMDGPU::S_WAITCNT);
454 
455  unsigned Imm = I->getOperand(0).getImm();
456  Counters Counts, WaitOn;
457 
458  Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
459  Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
460  Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
461 
462  for (unsigned i = 0; i < 3; ++i) {
463  if (Counts.Array[i] <= LastIssued.Array[i])
464  WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
465  else
466  WaitOn.Array[i] = 0;
467  }
468 
469  increaseCounters(DelayedWaitOn, WaitOn);
470 }
471 
472 Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
473  Counters Result = ZeroCounts;
474 
475  // For each register affected by this instruction increase the result
476  // sequence.
477  //
478  // TODO: We could probably just look at explicit operands if we removed VCC /
479  // EXEC from SMRD dest reg classes.
480  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
481  MachineOperand &Op = MI.getOperand(i);
482  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
483  continue;
484 
485  const TargetRegisterClass *RC = TII->getOpRegClass(MI, i);
486  RegInterval Interval = getRegInterval(RC, Op);
487  for (unsigned j = Interval.first; j < Interval.second; ++j) {
488  if (Op.isDef()) {
489  increaseCounters(Result, UsedRegs[j]);
490  increaseCounters(Result, DefinedRegs[j]);
491  }
492 
493  if (Op.isUse())
494  increaseCounters(Result, DefinedRegs[j]);
495  }
496  }
497 
498  return Result;
499 }
500 
501 void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
503  if (ST->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
504  return;
505 
506  // There must be "S_NOP 0" between an instruction writing M0 and S_SENDMSG.
507  if (LastInstWritesM0 && (I->getOpcode() == AMDGPU::S_SENDMSG || I->getOpcode() == AMDGPU::S_SENDMSGHALT)) {
508  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
509  LastInstWritesM0 = false;
510  return;
511  }
512 
513  // Set whether this instruction sets M0
514  LastInstWritesM0 = false;
515 
516  unsigned NumOperands = I->getNumOperands();
517  for (unsigned i = 0; i < NumOperands; i++) {
518  const MachineOperand &Op = I->getOperand(i);
519 
520  if (Op.isReg() && Op.isDef() && Op.getReg() == AMDGPU::M0)
521  LastInstWritesM0 = true;
522  }
523 }
524 
525 /// Return true if \p MBB has one successor immediately following, and is its
526 /// only predecessor
527 static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
528  if (MBB.succ_size() != 1)
529  return false;
530 
531  const MachineBasicBlock *Succ = *MBB.succ_begin();
532  return (Succ->pred_size() == 1) && MBB.isLayoutSuccessor(Succ);
533 }
534 
535 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
536 // around other non-memory instructions.
537 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
538  bool Changes = false;
539 
540  ST = &MF.getSubtarget<SISubtarget>();
541  TII = ST->getInstrInfo();
542  TRI = &TII->getRegisterInfo();
543  MRI = &MF.getRegInfo();
544  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
546 
547  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
548  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
549  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
550 
551  WaitedOn = ZeroCounts;
552  DelayedWaitOn = ZeroCounts;
553  LastIssued = ZeroCounts;
554  LastOpcodeType = OTHER;
555  LastInstWritesM0 = false;
556  IsFlatOutstanding = false;
557  ReturnsVoid = MFI->returnsVoid();
558 
559  memset(&UsedRegs, 0, sizeof(UsedRegs));
560  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
561 
564 
565  bool HaveScalarStores = false;
566 
567  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
568  BI != BE; ++BI) {
569  MachineBasicBlock &MBB = *BI;
570 
571  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
572  I != E; ++I) {
573  if (!HaveScalarStores && TII->isScalarStore(*I))
574  HaveScalarStores = true;
575 
576  if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
577  // There is a hardware bug on CI/SI where SMRD instruction may corrupt
578  // vccz bit, so when we detect that an instruction may read from a
579  // corrupt vccz bit, we need to:
580  // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD operations to
581  // complete.
582  // 2. Restore the correct value of vccz by writing the current value
583  // of vcc back to vcc.
584 
585  if (TII->isSMRD(I->getOpcode())) {
586  VCCZCorrupt = true;
587  } else if (!hasOutstandingLGKM() && I->modifiesRegister(AMDGPU::VCC, TRI)) {
588  // FIXME: We only care about SMRD instructions here, not LDS or GDS.
589  // Whenever we store a value in vcc, the correct value of vccz is
590  // restored.
591  VCCZCorrupt = false;
592  }
593 
594  // Check if we need to apply the bug work-around
595  if (VCCZCorrupt && readsVCCZ(*I)) {
596  DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
597 
598  // Wait on everything, not just LGKM. vccz reads usually come from
599  // terminators, and we always wait on everything at the end of the
600  // block, so if we only wait on LGKM here, we might end up with
601  // another s_waitcnt inserted right after this if there are non-LGKM
602  // instructions still outstanding.
603  insertWait(MBB, I, LastIssued);
604 
605  // Restore the vccz bit. Any time a value is written to vcc, the vcc
606  // bit is updated, so we can restore the bit by reading the value of
607  // vcc and then writing it back to the register.
608  BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
609  AMDGPU::VCC)
610  .addReg(AMDGPU::VCC);
611  }
612  }
613 
614  // Record pre-existing, explicitly requested waits
615  if (I->getOpcode() == AMDGPU::S_WAITCNT) {
616  handleExistingWait(*I);
617  RemoveMI.push_back(&*I);
618  continue;
619  }
620 
621  Counters Required;
622 
623  // Wait for everything before a barrier.
624  //
625  // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
626  // but we also want to wait for any other outstanding transfers before
627  // signalling other hardware blocks
628  if ((I->getOpcode() == AMDGPU::S_BARRIER &&
629  !ST->hasAutoWaitcntBeforeBarrier()) ||
630  I->getOpcode() == AMDGPU::S_SENDMSG ||
631  I->getOpcode() == AMDGPU::S_SENDMSGHALT)
632  Required = LastIssued;
633  else
634  Required = handleOperands(*I);
635 
636  Counters Increment = getHwCounts(*I);
637 
638  if (countersNonZero(Required) || countersNonZero(Increment))
639  increaseCounters(Required, DelayedWaitOn);
640 
641  Changes |= insertWait(MBB, I, Required);
642 
643  pushInstruction(MBB, I, Increment);
644  handleSendMsg(MBB, I);
645 
646  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
647  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
648  EndPgmBlocks.push_back(&MBB);
649  }
650 
651  // Wait for everything at the end of the MBB. If there is only one
652  // successor, we can defer this until the uses there.
653  if (!hasTrivialSuccessor(MBB))
654  Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
655  }
656 
657  if (HaveScalarStores) {
658  // If scalar writes are used, the cache must be flushed or else the next
659  // wave to reuse the same scratch memory can be clobbered.
660  //
661  // Insert s_dcache_wb at wave termination points if there were any scalar
662  // stores, and only if the cache hasn't already been flushed. This could be
663  // improved by looking across blocks for flushes in postdominating blocks
664  // from the stores but an explicitly requested flush is probably very rare.
665  for (MachineBasicBlock *MBB : EndPgmBlocks) {
666  bool SeenDCacheWB = false;
667 
668  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
669  I != E; ++I) {
670  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
671  SeenDCacheWB = true;
672  else if (TII->isScalarStore(*I))
673  SeenDCacheWB = false;
674 
675  // FIXME: It would be better to insert this before a waitcnt if any.
676  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
677  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
678  Changes = true;
679  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
680  }
681  }
682  }
683  }
684 
685  for (MachineInstr *I : RemoveMI)
686  I->eraseFromParent();
687 
688  if (!MFI->isEntryFunction()) {
689  // Wait for any outstanding memory operations that the input registers may
690  // depend on. We can't track them and it's better to to the wait after the
691  // costly call sequence.
692 
693  // TODO: Could insert earlier and schedule more liberally with operations
694  // that only use caller preserved registers.
695  MachineBasicBlock &EntryBB = MF.front();
696  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
697  .addImm(0);
698 
699  Changes = true;
700  }
701 
702  return Changes;
703 }
void push_back(const T &Elt)
Definition: SmallVector.h:212
mop_iterator operands_end()
Definition: MachineInstr.h:327
Interface definition for SIRegisterInfo.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
AMDGPU specific subclass of TargetSubtarget.
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
Definition: Interval.h:37
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
unsigned getReg() const
getReg - Returns the register number.
A debug info location.
Definition: DebugLoc.h:34
char & SIInsertWaitsID
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:293
Reg
All possible values of the reg field in the ModR/M byte.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:290
unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Definition: MachineInstr.h:287
static void increaseCounters(Counters &Dst, const Counters &Src)
helper function for handleOperands
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:639
unsigned const MachineRegisterInfo * MRI
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static bool hasTrivialSuccessor(const MachineBasicBlock &MBB)
Return true if MBB has one successor immediately following, and is its only predecessor.
INITIALIZE_PASS_BEGIN(SIInsertWaits, DEBUG_TYPE, "SI Insert Waits", false, false) INITIALIZE_PASS_END(SIInsertWaits
Represent the analysis usage information of a pass.
static bool countersNonZero(const Counters &Counter)
check whether any of the counters is non-zero
Instruction set architecture version.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
SI Insert Waits
const MachineBasicBlock & front() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
static bool readsVCCZ(const MachineInstr &MI)
unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version)
Iterator for intrusive lists based on ilist_node.
const SIRegisterInfo * getRegisterInfo() const override
MachineOperand class - Representation of each machine instruction operand.
unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version)
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:285
unsigned pred_size() const
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:393
unsigned succ_size() const
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:59
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version)
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
#define DEBUG_TYPE
FunctionPass * createSIInsertWaitsPass()
bool isReg() const
isReg - Tests if this is a MO_Register operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:73
mop_iterator operands_begin()
Definition: MachineInstr.h:326
constexpr char Size[]
Key for Kernel::Arg::Metadata::mSize.
#define DEBUG(X)
Definition: Debug.h:118
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:295
IsaVersion getIsaVersion(const FeatureBitset &Features)