SIInsertWaitcnts.cpp
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
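///
/// For example, given (schematically):
///   v0 = BUFFER_LOAD_DWORD ...   ; starts an asynchronous vector-memory read
///   v1 = V_ADD_F32 v0, ...       ; consumes the loaded value
/// the pass inserts "s_waitcnt vmcnt(0)" before the V_ADD_F32 so the use of v0
/// only executes once the load result has been written back; the exact count
/// depends on how many other operations are still in flight at that point.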
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "AMDGPUSubtarget.h"
28 #include "SIDefines.h"
29 #include "SIInstrInfo.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIRegisterInfo.h"
32 #include "Utils/AMDGPUBaseInfo.h"
33 #include "llvm/ADT/DenseMap.h"
34 #include "llvm/ADT/DenseSet.h"
35 #include "llvm/ADT/PostOrderIterator.h"
36 #include "llvm/ADT/STLExtras.h"
37 #include "llvm/ADT/SmallVector.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineMemOperand.h"
44 #include "llvm/CodeGen/MachineOperand.h"
45 #include "llvm/CodeGen/MachineRegisterInfo.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/DebugCounter.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include <algorithm>
53 #include <cassert>
54 #include <cstdint>
55 #include <cstring>
56 #include <memory>
57 #include <utility>
58 #include <vector>
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "si-insert-waitcnts"
63 
64 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
65  "Force emit s_waitcnt expcnt(0) instrs");
66 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
67  "Force emit s_waitcnt lgkmcnt(0) instrs");
68 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
69  "Force emit s_waitcnt vmcnt(0) instrs");
70 
71 static cl::opt<bool> ForceEmitZeroFlag(
 72  "amdgpu-waitcnt-forcezero",
73  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74  cl::init(false), cl::Hidden);
75 
76 namespace {
77 
78 template <typename EnumT>
79 class enum_iterator
80  : public iterator_facade_base<enum_iterator<EnumT>,
81  std::forward_iterator_tag, const EnumT> {
82  EnumT Value;
83 public:
84  enum_iterator() = default;
85  enum_iterator(EnumT Value) : Value(Value) {}
86 
87  enum_iterator &operator++() {
88  Value = static_cast<EnumT>(Value + 1);
89  return *this;
90  }
91 
92  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93 
94  EnumT operator*() const { return Value; }
95 };
96 
97 // Class of object that encapsulates the latest instruction counter score
98 // associated with the operand. Used for determining whether
99 // an s_waitcnt instruction needs to be emitted.
100 
101 #define CNT_MASK(t) (1u << (t))
102 
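// Summary of what each hardware counter tracks (see WaitEventMaskForInst
// below for the exact event-to-counter mapping):
//   VM_CNT   - vector-memory reads (and writes on targets without vscnt),
//   LGKM_CNT - LDS, GDS, scalar-memory and message operations,
//   EXP_CNT  - exports and GDS/VMEM writes still holding their source GPRs,
//   VS_CNT   - vector-memory writes (targets with a separate vscnt counter).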
103 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
104 
105 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106  return make_range(enum_iterator<InstCounterType>(VM_CNT),
107  enum_iterator<InstCounterType>(NUM_INST_CNTS));
108 }
109 
110 using RegInterval = std::pair<signed, signed>;
111 
112 struct {
113  uint32_t VmcntMax;
114  uint32_t ExpcntMax;
115  uint32_t LgkmcntMax;
116  uint32_t VscntMax;
117  int32_t NumVGPRsMax;
118  int32_t NumSGPRsMax;
119 } HardwareLimits;
120 
121 struct {
122  unsigned VGPR0;
123  unsigned VGPRL;
124  unsigned SGPR0;
125  unsigned SGPRL;
126 } RegisterEncoding;
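// Both structs above are anonymous-namespace globals; they are filled in per
// function by SIInsertWaitcnts::runOnMachineFunction() from the subtarget's
// ISA version and addressable register counts.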
127 
129  VMEM_ACCESS, // vector-memory read & write
130  VMEM_READ_ACCESS, // vector-memory read
131  VMEM_WRITE_ACCESS,// vector-memory write
132  LDS_ACCESS, // lds read & write
133  GDS_ACCESS, // gds read & write
134  SQ_MESSAGE, // send message
135  SMEM_ACCESS, // scalar-memory read & write
136  EXP_GPR_LOCK, // export holding on its data src
137  GDS_GPR_LOCK, // GDS holding on its data and addr src
138  EXP_POS_ACCESS, // write to export position
139  EXP_PARAM_ACCESS, // write to export parameter
140  VMW_GPR_LOCK, // vector-memory write holding on its data src
141  NUM_WAIT_EVENTS,
142 };
143 
144 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
145  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
146  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
147  (1 << SQ_MESSAGE),
148  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
149  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
150  (1 << VMEM_WRITE_ACCESS)
151 };
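// Note: the masks above are indexed by InstCounterType, i.e. they apply to
// VM_CNT, LGKM_CNT, EXP_CNT and VS_CNT in that order, matching eventCounter().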
152 
153 // The mapping is:
154 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
155 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
156 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
157 // We reserve a fixed number of VGPR slots in the scoring tables for
158 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
159 enum RegisterMapping {
 160  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
161  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
162  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
163  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
164  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
165 };
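// For example, with this mapping VGPR3 occupies slot 3 of the score tables,
// the LDS token occupies slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS (256), and
// SGPR5 occupies slot NUM_ALL_VGPRS + 5 (262); see getRegInterval() below.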
166 
167 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
168  switch (T) {
169  case VM_CNT:
170  Wait.VmCnt = std::min(Wait.VmCnt, Count);
171  break;
172  case EXP_CNT:
173  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
174  break;
175  case LGKM_CNT:
176  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
177  break;
178  case VS_CNT:
179  Wait.VsCnt = std::min(Wait.VsCnt, Count);
180  break;
181  default:
182  llvm_unreachable("bad InstCounterType");
183  }
184 }
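// For example, starting from a Waitcnt whose counts are all ~0u (no wait),
// addWait(Wait, VM_CNT, 2) tightens Wait.VmCnt to 2 and a later
// addWait(Wait, VM_CNT, 0) tightens it to 0: combining waits this way only
// ever makes them stronger (smaller counts).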
185 
186 // This object maintains the current score brackets of each wait counter, and
187 // a per-register scoreboard for each wait counter.
188 //
189 // We also maintain the latest score for every event type that can change the
190 // waitcnt in order to know if there are multiple types of events within
191 // the brackets. When multiple types of events happen in the bracket,
192 // the wait count may get decreased out of order, therefore we need to put in
193 // "s_waitcnt 0" before use.
194 class WaitcntBrackets {
195 public:
196  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
197  for (auto T : inst_counter_types())
198  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
199  }
200 
201  static uint32_t getWaitCountMax(InstCounterType T) {
202  switch (T) {
203  case VM_CNT:
204  return HardwareLimits.VmcntMax;
205  case LGKM_CNT:
206  return HardwareLimits.LgkmcntMax;
207  case EXP_CNT:
208  return HardwareLimits.ExpcntMax;
209  case VS_CNT:
210  return HardwareLimits.VscntMax;
211  default:
212  break;
213  }
214  return 0;
215  }
216 
217  uint32_t getScoreLB(InstCounterType T) const {
218  assert(T < NUM_INST_CNTS);
219  if (T >= NUM_INST_CNTS)
220  return 0;
221  return ScoreLBs[T];
222  }
223 
224  uint32_t getScoreUB(InstCounterType T) const {
225  assert(T < NUM_INST_CNTS);
226  if (T >= NUM_INST_CNTS)
227  return 0;
228  return ScoreUBs[T];
229  }
230 
231  // Mapping from event to counter.
232  InstCounterType eventCounter(WaitEventType E) {
233  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
234  return VM_CNT;
235  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
236  return LGKM_CNT;
237  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
238  return VS_CNT;
239  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
240  return EXP_CNT;
241  }
242 
243  uint32_t getRegScore(int GprNo, InstCounterType T) {
244  if (GprNo < NUM_ALL_VGPRS) {
245  return VgprScores[T][GprNo];
246  }
247  assert(T == LGKM_CNT);
248  return SgprScores[GprNo - NUM_ALL_VGPRS];
249  }
250 
251  void clear() {
252  memset(ScoreLBs, 0, sizeof(ScoreLBs));
253  memset(ScoreUBs, 0, sizeof(ScoreUBs));
254  PendingEvents = 0;
255  memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
256  for (auto T : inst_counter_types())
257  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
258  memset(SgprScores, 0, sizeof(SgprScores));
259  }
260 
261  bool merge(const WaitcntBrackets &Other);
262 
263  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
264  const MachineRegisterInfo *MRI,
265  const SIRegisterInfo *TRI, unsigned OpNo,
266  bool Def) const;
267 
268  int32_t getMaxVGPR() const { return VgprUB; }
269  int32_t getMaxSGPR() const { return SgprUB; }
270 
271  bool counterOutOfOrder(InstCounterType T) const;
272  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
273  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
274  void determineWait(InstCounterType T, uint32_t ScoreToWait,
275  AMDGPU::Waitcnt &Wait) const;
276  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
277  void applyWaitcnt(InstCounterType T, unsigned Count);
278  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
279  const MachineRegisterInfo *MRI, WaitEventType E,
280  MachineInstr &MI);
281 
282  bool hasPending() const { return PendingEvents != 0; }
283  bool hasPendingEvent(WaitEventType E) const {
284  return PendingEvents & (1 << E);
285  }
286 
287  bool hasPendingFlat() const {
288  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
289  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
290  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
291  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
292  }
293 
294  void setPendingFlat() {
295  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
296  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
297  }
298 
299  void print(raw_ostream &);
300  void dump() { print(dbgs()); }
301 
302 private:
303  struct MergeInfo {
304  uint32_t OldLB;
305  uint32_t OtherLB;
306  uint32_t MyShift;
307  uint32_t OtherShift;
308  };
309  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
310  uint32_t OtherScore);
311 
312  void setScoreLB(InstCounterType T, uint32_t Val) {
313  assert(T < NUM_INST_CNTS);
314  if (T >= NUM_INST_CNTS)
315  return;
316  ScoreLBs[T] = Val;
317  }
318 
319  void setScoreUB(InstCounterType T, uint32_t Val) {
320  assert(T < NUM_INST_CNTS);
321  if (T >= NUM_INST_CNTS)
322  return;
323  ScoreUBs[T] = Val;
324  if (T == EXP_CNT) {
325  uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
326  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
327  ScoreLBs[T] = UB;
328  }
329  }
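  // Note: for EXP_CNT, setScoreUB() above also advances the lower bound
  // whenever the bracket would become wider than getWaitCountMax(EXP_CNT), so
  // the tracked range never exceeds what the hardware counter can represent.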
330 
331  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
332  if (GprNo < NUM_ALL_VGPRS) {
333  if (GprNo > VgprUB) {
334  VgprUB = GprNo;
335  }
336  VgprScores[T][GprNo] = Val;
337  } else {
338  assert(T == LGKM_CNT);
339  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
340  SgprUB = GprNo - NUM_ALL_VGPRS;
341  }
342  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
343  }
344  }
345 
346  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
347  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
348  unsigned OpNo, uint32_t Val);
349 
350  const GCNSubtarget *ST = nullptr;
351  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
352  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
353  uint32_t PendingEvents = 0;
354  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
355  // Remember the last flat memory operation.
356  uint32_t LastFlat[NUM_INST_CNTS] = {0};
357  // wait_cnt scores for every vgpr.
358  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
359  int32_t VgprUB = 0;
360  int32_t SgprUB = 0;
361  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
362  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
363  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
364 };
365 
366 class SIInsertWaitcnts : public MachineFunctionPass {
367 private:
368  const GCNSubtarget *ST = nullptr;
369  const SIInstrInfo *TII = nullptr;
370  const SIRegisterInfo *TRI = nullptr;
371  const MachineRegisterInfo *MRI = nullptr;
372  AMDGPU::IsaVersion IV;
373 
374  DenseSet<MachineInstr *> TrackedWaitcntSet;
375  DenseSet<MachineInstr *> VCCZBugHandledSet;
376 
377  struct BlockInfo {
378  MachineBasicBlock *MBB;
379  std::unique_ptr<WaitcntBrackets> Incoming;
380  bool Dirty = true;
381 
382  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
383  };
384 
385  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
 386  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
 387 
 388  // ForceEmitZeroWaitcnts: force all waitcnt insts to be s_waitcnt 0
389  // because of amdgpu-waitcnt-forcezero flag
390  bool ForceEmitZeroWaitcnts;
391  bool ForceEmitWaitcnt[NUM_INST_CNTS];
392 
393 public:
394  static char ID;
395 
396  SIInsertWaitcnts() : MachineFunctionPass(ID) {
397  (void)ForceExpCounter;
398  (void)ForceLgkmCounter;
399  (void)ForceVMCounter;
400  }
401 
402  bool runOnMachineFunction(MachineFunction &MF) override;
403 
404  StringRef getPassName() const override {
405  return "SI insert wait instructions";
406  }
407 
408  void getAnalysisUsage(AnalysisUsage &AU) const override {
409  AU.setPreservesCFG();
 410  MachineFunctionPass::getAnalysisUsage(AU);
 411  }
412 
413  bool isForceEmitWaitcnt() const {
414  for (auto T : inst_counter_types())
415  if (ForceEmitWaitcnt[T])
416  return true;
417  return false;
418  }
419 
420  void setForceEmitWaitcnt() {
421 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
422 // For debug builds, get the debug counter info and adjust if need be
423 #ifndef NDEBUG
424  if (DebugCounter::isCounterSet(ForceExpCounter) &&
425  DebugCounter::shouldExecute(ForceExpCounter)) {
426  ForceEmitWaitcnt[EXP_CNT] = true;
427  } else {
428  ForceEmitWaitcnt[EXP_CNT] = false;
429  }
430 
431  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
432  DebugCounter::shouldExecute(ForceLgkmCounter)) {
433  ForceEmitWaitcnt[LGKM_CNT] = true;
434  } else {
435  ForceEmitWaitcnt[LGKM_CNT] = false;
436  }
437 
438  if (DebugCounter::isCounterSet(ForceVMCounter) &&
439  DebugCounter::shouldExecute(ForceVMCounter)) {
440  ForceEmitWaitcnt[VM_CNT] = true;
441  } else {
442  ForceEmitWaitcnt[VM_CNT] = false;
443  }
444 #endif // NDEBUG
445  }
446 
447  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
448  bool generateWaitcntInstBefore(MachineInstr &MI,
449  WaitcntBrackets &ScoreBrackets,
450  MachineInstr *OldWaitcntInstr);
451  void updateEventWaitcntAfter(MachineInstr &Inst,
452  WaitcntBrackets *ScoreBrackets);
453  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
454  WaitcntBrackets &ScoreBrackets);
455 };
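// Overall flow: runOnMachineFunction() visits the blocks in reverse post
// order until a fixed point is reached; insertWaitcntInBlock() walks each
// block, calling generateWaitcntInstBefore() to decide which wait (if any) an
// instruction needs and updateEventWaitcntAfter() to record the events the
// instruction itself produces.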
456 
457 } // end anonymous namespace
458 
459 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
460  const SIInstrInfo *TII,
461  const MachineRegisterInfo *MRI,
462  const SIRegisterInfo *TRI,
463  unsigned OpNo, bool Def) const {
464  const MachineOperand &Op = MI->getOperand(OpNo);
465  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
466  (Def && !Op.isDef()))
467  return {-1, -1};
468 
469  // A use via a PW operand does not need a waitcnt.
470  // A partial write is not a WAW.
471  assert(!Op.getSubReg() || !Op.isUndef());
472 
473  RegInterval Result;
474  const MachineRegisterInfo &MRIA = *MRI;
475 
476  unsigned Reg = TRI->getEncodingValue(Op.getReg());
477 
478  if (TRI->isVGPR(MRIA, Op.getReg())) {
479  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
480  Result.first = Reg - RegisterEncoding.VGPR0;
481  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
482  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
483  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
484  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
485  assert(Result.first >= NUM_ALL_VGPRS &&
486  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
487  }
488  // TODO: Handle TTMP
489  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
490  else
491  return {-1, -1};
492 
493  const MachineInstr &MIA = *MI;
494  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
495  unsigned Size = TRI->getRegSizeInBits(*RC);
496  Result.second = Result.first + (Size / 32);
497 
498  return Result;
499 }
500 
501 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
502  const SIInstrInfo *TII,
503  const SIRegisterInfo *TRI,
504  const MachineRegisterInfo *MRI, unsigned OpNo,
505  uint32_t Val) {
506  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
507  LLVM_DEBUG({
508  const MachineOperand &Opnd = MI->getOperand(OpNo);
509  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
510  });
511  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
512  setRegScore(RegNo, EXP_CNT, Val);
513  }
514 }
515 
516 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
517  const SIRegisterInfo *TRI,
518  const MachineRegisterInfo *MRI,
519  WaitEventType E, MachineInstr &Inst) {
520  const MachineRegisterInfo &MRIA = *MRI;
521  InstCounterType T = eventCounter(E);
522  uint32_t CurrScore = getScoreUB(T) + 1;
523  if (CurrScore == 0)
524  report_fatal_error("InsertWaitcnt score wraparound");
 525  // PendingEvents and ScoreUB need to be updated regardless of whether this
 526  // event changes the score of a register or not.
 527  // Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
528  if (!hasPendingEvent(E)) {
529  if (PendingEvents & WaitEventMaskForInst[T])
530  MixedPendingEvents[T] = true;
531  PendingEvents |= 1 << E;
532  }
533  setScoreUB(T, CurrScore);
534 
535  if (T == EXP_CNT) {
536  // Put score on the source vgprs. If this is a store, just use those
537  // specific register(s).
538  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
539  // All GDS operations must protect their address register (same as
540  // export.)
541  if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
542  Inst.getOpcode() != AMDGPU::DS_CONSUME) {
543  setExpScore(
544  &Inst, TII, TRI, MRI,
545  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
546  CurrScore);
547  }
548  if (Inst.mayStore()) {
 549  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
 550  AMDGPU::OpName::data0) != -1) {
551  setExpScore(
552  &Inst, TII, TRI, MRI,
553  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
554  CurrScore);
555  }
 556  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
 557  AMDGPU::OpName::data1) != -1) {
558  setExpScore(&Inst, TII, TRI, MRI,
 559  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
 560  AMDGPU::OpName::data1),
561  CurrScore);
562  }
563  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
564  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
565  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
566  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
567  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
568  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
569  Inst.getOpcode() != AMDGPU::DS_APPEND &&
570  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
 571  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
 572  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
573  const MachineOperand &Op = Inst.getOperand(I);
574  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
575  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
576  }
577  }
578  }
579  } else if (TII->isFLAT(Inst)) {
580  if (Inst.mayStore()) {
581  setExpScore(
582  &Inst, TII, TRI, MRI,
583  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
584  CurrScore);
585  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
586  setExpScore(
587  &Inst, TII, TRI, MRI,
588  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
589  CurrScore);
590  }
591  } else if (TII->isMIMG(Inst)) {
592  if (Inst.mayStore()) {
593  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
594  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
595  setExpScore(
596  &Inst, TII, TRI, MRI,
597  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
598  CurrScore);
599  }
600  } else if (TII->isMTBUF(Inst)) {
601  if (Inst.mayStore()) {
602  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
603  }
604  } else if (TII->isMUBUF(Inst)) {
605  if (Inst.mayStore()) {
606  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
607  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
608  setExpScore(
609  &Inst, TII, TRI, MRI,
610  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
611  CurrScore);
612  }
613  } else {
614  if (TII->isEXP(Inst)) {
615  // For export the destination registers are really temps that
616  // can be used as the actual source after export patching, so
617  // we need to treat them like sources and set the EXP_CNT
618  // score.
619  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
620  MachineOperand &DefMO = Inst.getOperand(I);
621  if (DefMO.isReg() && DefMO.isDef() &&
622  TRI->isVGPR(MRIA, DefMO.getReg())) {
623  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
624  CurrScore);
625  }
626  }
627  }
628  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
629  MachineOperand &MO = Inst.getOperand(I);
630  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
631  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
632  }
633  }
634  }
635 #if 0 // TODO: check if this is handled by MUBUF code above.
636  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
637  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
638  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
639  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
640  unsigned OpNo;//TODO: find the OpNo for this operand;
641  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
642  for (signed RegNo = Interval.first; RegNo < Interval.second;
643  ++RegNo) {
644  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
645  }
646 #endif
647  } else {
648  // Match the score to the destination registers.
649  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
650  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
651  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
652  continue;
653  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
654  setRegScore(RegNo, T, CurrScore);
655  }
656  }
657  if (TII->isDS(Inst) && Inst.mayStore()) {
658  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
659  }
660  }
661 }
662 
663 void WaitcntBrackets::print(raw_ostream &OS) {
 664  OS << '\n';
665  for (auto T : inst_counter_types()) {
666  uint32_t LB = getScoreLB(T);
667  uint32_t UB = getScoreUB(T);
668 
669  switch (T) {
670  case VM_CNT:
671  OS << " VM_CNT(" << UB - LB << "): ";
672  break;
673  case LGKM_CNT:
674  OS << " LGKM_CNT(" << UB - LB << "): ";
675  break;
676  case EXP_CNT:
677  OS << " EXP_CNT(" << UB - LB << "): ";
678  break;
679  case VS_CNT:
680  OS << " VS_CNT(" << UB - LB << "): ";
681  break;
682  default:
683  OS << " UNKNOWN(" << UB - LB << "): ";
684  break;
685  }
686 
687  if (LB < UB) {
688  // Print vgpr scores.
689  for (int J = 0; J <= getMaxVGPR(); J++) {
690  uint32_t RegScore = getRegScore(J, T);
691  if (RegScore <= LB)
692  continue;
693  uint32_t RelScore = RegScore - LB - 1;
694  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
695  OS << RelScore << ":v" << J << " ";
696  } else {
697  OS << RelScore << ":ds ";
698  }
699  }
700  // Also need to print sgpr scores for lgkm_cnt.
701  if (T == LGKM_CNT) {
702  for (int J = 0; J <= getMaxSGPR(); J++) {
703  uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
704  if (RegScore <= LB)
705  continue;
706  uint32_t RelScore = RegScore - LB - 1;
707  OS << RelScore << ":s" << J << " ";
708  }
709  }
710  }
711  OS << '\n';
712  }
713  OS << '\n';
714 }
715 
716 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
717 /// whether a waitcnt instruction is needed at all.
718 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
719  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
720  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
721  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
722  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
723 }
724 
725 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
726  unsigned &Count) const {
727  const uint32_t LB = getScoreLB(T);
728  const uint32_t UB = getScoreUB(T);
729  if (Count < UB && UB - Count > LB)
730  return true;
731 
732  Count = ~0u;
733  return false;
734 }
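// For example, with LB = 2 and UB = 5 for VM_CNT (three events outstanding),
// a requested vmcnt of 1 is kept, because waiting until only one event
// remains is meaningful, while a requested vmcnt of 3 is rewritten to ~0u
// because everything it would wait for is already below the lower bound.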
735 
736 void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
737  AMDGPU::Waitcnt &Wait) const {
738  // If the score of src_operand falls within the bracket, we need an
739  // s_waitcnt instruction.
740  const uint32_t LB = getScoreLB(T);
741  const uint32_t UB = getScoreUB(T);
742  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
743  if ((T == VM_CNT || T == LGKM_CNT) &&
744  hasPendingFlat() &&
745  !ST->hasFlatLgkmVMemCountInOrder()) {
746  // If there is a pending FLAT operation, and this is a VMem or LGKM
747  // waitcnt and the target can report early completion, then we need
748  // to force a waitcnt 0.
749  addWait(Wait, T, 0);
750  } else if (counterOutOfOrder(T)) {
 751  // The counter can get decremented out-of-order when there
 752  // are multiple types of events in the bracket. So, emit an s_waitcnt
 753  // with a conservative value of 0 for the counter.
754  addWait(Wait, T, 0);
755  } else {
756  addWait(Wait, T, UB - ScoreToWait);
757  }
758  }
759 }
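// For example, with LB = 3 and UB = 7, a register score of 5 yields a
// requested count of UB - 5 = 2: wait until at most two operations newer than
// the one that produced the register are still outstanding. Scores at or
// below LB require no wait at all.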
760 
761 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
762  applyWaitcnt(VM_CNT, Wait.VmCnt);
763  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
764  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
765  applyWaitcnt(VS_CNT, Wait.VsCnt);
766 }
767 
768 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
769  const uint32_t UB = getScoreUB(T);
770  if (Count >= UB)
771  return;
772  if (Count != 0) {
773  if (counterOutOfOrder(T))
774  return;
775  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
776  } else {
777  setScoreLB(T, UB);
778  MixedPendingEvents[T] = false;
779  PendingEvents &= ~WaitEventMaskForInst[T];
780  }
781 }
782 
783 // Where there are multiple types of events in the bracket of a counter,
784 // the decrement may go out of order.
785 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
786  // Scalar memory read always can go out of order.
787  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
788  return true;
789  return MixedPendingEvents[T];
790 }
791 
792 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
793  false)
794 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
795  false)
796 
797 char SIInsertWaitcnts::ID = 0;
798 
799 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
800 
801 FunctionPass *llvm::createSIInsertWaitcntsPass() {
 802  return new SIInsertWaitcnts();
803 }
804 
805 static bool readsVCCZ(const MachineInstr &MI) {
806  unsigned Opc = MI.getOpcode();
807  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
808  !MI.getOperand(1).isUndef();
809 }
810 
811 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
812 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
 813  // Currently all conventions wait, but this may not always be the case.
814  //
815  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
816 // sense to omit the wait and do it in the caller.
817  return true;
818 }
819 
820 /// \returns true if the callee is expected to wait for any outstanding waits
821 /// before returning.
822 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
 823  return true;
824 }
825 
826 /// Generate s_waitcnt instruction to be placed before cur_Inst.
827 /// Instructions of a given type are returned in order,
828 /// but instructions of different types can complete out of order.
829 /// We rely on this in-order completion
830 /// and simply assign a score to the memory access instructions.
831 /// We keep track of the active "score bracket" to determine
832 /// if an access of a memory read requires an s_waitcnt
833 /// and if so what the value of each counter is.
834 /// The "score bracket" is bound by the lower bound and upper bound
835 /// scores (*_score_LB and *_score_ub respectively).
836 bool SIInsertWaitcnts::generateWaitcntInstBefore(
837  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
838  MachineInstr *OldWaitcntInstr) {
839  setForceEmitWaitcnt();
840  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
841 
842  if (MI.isDebugInstr())
843  return false;
844 
845  AMDGPU::Waitcnt Wait;
846 
847  // See if this instruction has a forced S_WAITCNT VM.
848  // TODO: Handle other cases of NeedsWaitcntVmBefore()
849  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
850  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
851  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
852  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
853  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
854  Wait.VmCnt = 0;
855  }
856 
857  // All waits must be resolved at call return.
858  // NOTE: this could be improved with knowledge of all call sites or
859  // with knowledge of the called routines.
860  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
861  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
862  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
863  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
864  }
865  // Resolve vm waits before gs-done.
866  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
867  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
 868  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
 869  AMDGPU::SendMsg::ID_GS_DONE)) {
 870  Wait.VmCnt = 0;
871  }
872 #if 0 // TODO: the following blocks of logic when we have fence.
873  else if (MI.getOpcode() == SC_FENCE) {
874  const unsigned int group_size =
875  context->shader_info->GetMaxThreadGroupSize();
876  // group_size == 0 means thread group size is unknown at compile time
877  const bool group_is_multi_wave =
878  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
879  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
880 
881  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
882  SCRegType src_type = Inst->GetSrcType(i);
883  switch (src_type) {
884  case SCMEM_LDS:
885  if (group_is_multi_wave ||
886  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
887  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
888  ScoreBrackets->getScoreUB(LGKM_CNT));
889  // LDS may have to wait for VM_CNT after buffer load to LDS
890  if (target_info->HasBufferLoadToLDS()) {
891  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
892  ScoreBrackets->getScoreUB(VM_CNT));
893  }
894  }
895  break;
896 
897  case SCMEM_GDS:
898  if (group_is_multi_wave || fence_is_global) {
899  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
900  ScoreBrackets->getScoreUB(EXP_CNT));
901  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
902  ScoreBrackets->getScoreUB(LGKM_CNT));
903  }
904  break;
905 
906  case SCMEM_UAV:
907  case SCMEM_TFBUF:
908  case SCMEM_RING:
909  case SCMEM_SCATTER:
910  if (group_is_multi_wave || fence_is_global) {
911  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
912  ScoreBrackets->getScoreUB(EXP_CNT));
913  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
914  ScoreBrackets->getScoreUB(VM_CNT));
915  }
916  break;
917 
918  case SCMEM_SCRATCH:
919  default:
920  break;
921  }
922  }
923  }
924 #endif
925 
926  // Export & GDS instructions do not read the EXEC mask until after the export
927  // is granted (which can occur well after the instruction is issued).
928  // The shader program must flush all EXP operations on the export-count
929  // before overwriting the EXEC mask.
930  else {
931  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
932  // Export and GDS are tracked individually, either may trigger a waitcnt
933  // for EXEC.
934  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
935  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
936  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
937  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
938  Wait.ExpCnt = 0;
939  }
940  }
941 
942  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
943  // Don't bother waiting on anything except the call address. The function
944  // is going to insert a wait on everything in its prolog. This still needs
945  // to be careful if the call target is a load (e.g. a GOT load).
946  Wait = AMDGPU::Waitcnt();
947 
948  int CallAddrOpIdx =
949  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
950  RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
951  CallAddrOpIdx, false);
952  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
953  ScoreBrackets.determineWait(
954  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
955  }
956  } else {
957  // FIXME: Should not be relying on memoperands.
958  // Look at the source operands of every instruction to see if
959  // any of them results from a previous memory operation that affects
960  // its current usage. If so, an s_waitcnt instruction needs to be
961  // emitted.
962  // If the source operand was defined by a load, add the s_waitcnt
963  // instruction.
964  for (const MachineMemOperand *Memop : MI.memoperands()) {
965  unsigned AS = Memop->getAddrSpace();
966  if (AS != AMDGPUAS::LOCAL_ADDRESS)
967  continue;
968  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
969  // VM_CNT is only relevant to vgpr or LDS.
970  ScoreBrackets.determineWait(
971  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
972  }
973 
974  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
975  const MachineOperand &Op = MI.getOperand(I);
976  const MachineRegisterInfo &MRIA = *MRI;
977  RegInterval Interval =
978  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
979  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
980  if (TRI->isVGPR(MRIA, Op.getReg())) {
981  // VM_CNT is only relevant to vgpr or LDS.
982  ScoreBrackets.determineWait(
983  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
984  }
985  ScoreBrackets.determineWait(
986  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
987  }
988  }
989  // End of for loop that looks at all source operands to decide vm_wait_cnt
990  // and lgk_wait_cnt.
991 
992  // Two cases are handled for destination operands:
993  // 1) If the destination operand was defined by a load, add the s_waitcnt
994  // instruction to guarantee the right WAW order.
 995  // 2) If a destination operand was used by a recent export/store
 996  // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
997  if (MI.mayStore()) {
998  // FIXME: Should not be relying on memoperands.
999  for (const MachineMemOperand *Memop : MI.memoperands()) {
1000  unsigned AS = Memop->getAddrSpace();
1001  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1002  continue;
1003  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1004  ScoreBrackets.determineWait(
1005  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1006  ScoreBrackets.determineWait(
1007  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1008  }
1009  }
1010  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1011  MachineOperand &Def = MI.getOperand(I);
1012  const MachineRegisterInfo &MRIA = *MRI;
1013  RegInterval Interval =
1014  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
1015  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1016  if (TRI->isVGPR(MRIA, Def.getReg())) {
1017  ScoreBrackets.determineWait(
1018  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1019  ScoreBrackets.determineWait(
1020  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1021  }
1022  ScoreBrackets.determineWait(
1023  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1024  }
1025  } // End of for loop that looks at all dest operands.
1026  }
1027  }
1028 
1029  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1030  // occurs before the instruction. Doing it here prevents any additional
1031  // S_WAITCNTs from being emitted if the instruction was marked as
1032  // requiring a WAITCNT beforehand.
1033  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1034  !ST->hasAutoWaitcntBeforeBarrier()) {
1035  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
1036  }
1037 
1038  // TODO: Remove this work-around, enable the assert for Bug 457939
1039  // after fixing the scheduler. Also, the Shader Compiler code is
1040  // independent of target.
1041  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
1042  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1043  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1044  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1045  Wait.LgkmCnt = 0;
1046  }
1047  }
1048 
1049  // Early-out if no wait is indicated.
1050  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1051  bool Modified = false;
1052  if (OldWaitcntInstr) {
1053  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1054  &*II != &MI; II = NextI, ++NextI) {
1055  if (II->isDebugInstr())
1056  continue;
1057 
1058  if (TrackedWaitcntSet.count(&*II)) {
1059  TrackedWaitcntSet.erase(&*II);
1060  II->eraseFromParent();
1061  Modified = true;
1062  } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1063  int64_t Imm = II->getOperand(0).getImm();
1064  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1065  } else {
1066  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1067  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1068  ScoreBrackets.applyWaitcnt(
1069  AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
1070  }
1071  }
1072  }
1073  return Modified;
1074  }
1075 
1076  if (ForceEmitZeroWaitcnts)
1077  Wait = AMDGPU::Waitcnt::allZero(IV);
1078 
1079  if (ForceEmitWaitcnt[VM_CNT])
1080  Wait.VmCnt = 0;
1081  if (ForceEmitWaitcnt[EXP_CNT])
1082  Wait.ExpCnt = 0;
1083  if (ForceEmitWaitcnt[LGKM_CNT])
1084  Wait.LgkmCnt = 0;
1085  if (ForceEmitWaitcnt[VS_CNT])
1086  Wait.VsCnt = 0;
1087 
1088  ScoreBrackets.applyWaitcnt(Wait);
1089 
1090  AMDGPU::Waitcnt OldWait;
1091  bool Modified = false;
1092 
1093  if (OldWaitcntInstr) {
1094  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1095  &*II != &MI; II = NextI, NextI++) {
1096  if (II->isDebugInstr())
1097  continue;
1098 
1099  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1100  unsigned IEnc = II->getOperand(0).getImm();
1101  AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1102  OldWait = OldWait.combined(IWait);
1103  if (!TrackedWaitcntSet.count(&*II))
1104  Wait = Wait.combined(IWait);
1105  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1106  if (IEnc != NewEnc) {
1107  II->getOperand(0).setImm(NewEnc);
1108  Modified = true;
1109  }
1110  Wait.VmCnt = ~0u;
1111  Wait.LgkmCnt = ~0u;
1112  Wait.ExpCnt = ~0u;
1113  } else {
1114  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1115  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1116 
1117  unsigned ICnt = II->getOperand(1).getImm();
1118  OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1119  if (!TrackedWaitcntSet.count(&*II))
1120  Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1121  if (Wait.VsCnt != ICnt) {
1122  II->getOperand(1).setImm(Wait.VsCnt);
1123  Modified = true;
1124  }
1125  Wait.VsCnt = ~0u;
1126  }
1127 
1128  LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1129  << "Old Instr: " << MI << '\n'
1130  << "New Instr: " << *II << '\n');
1131 
1132  if (!Wait.hasWait())
1133  return Modified;
1134  }
1135  }
1136 
1137  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
1138  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1139  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1140  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1141  .addImm(Enc);
1142  TrackedWaitcntSet.insert(SWaitInst);
1143  Modified = true;
1144 
1145  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1146  << "Old Instr: " << MI << '\n'
1147  << "New Instr: " << *SWaitInst << '\n');
1148  }
1149 
1150  if (Wait.VsCnt != ~0u) {
1151  assert(ST->hasVscnt());
1152 
1153  auto SWaitInst =
1154  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1155  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1156  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1157  .addImm(Wait.VsCnt);
1158  TrackedWaitcntSet.insert(SWaitInst);
1159  Modified = true;
1160 
1161  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1162  << "Old Instr: " << MI << '\n'
1163  << "New Instr: " << *SWaitInst << '\n');
1164  }
1165 
1166  return Modified;
1167 }
1168 
1169 // This is a flat memory operation. Check to see if it has memory
1170 // tokens for both LDS and Memory, and if so mark it as a flat.
1171 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1172  if (MI.memoperands_empty())
1173  return true;
1174 
1175  for (const MachineMemOperand *Memop : MI.memoperands()) {
1176  unsigned AS = Memop->getAddrSpace();
 1177  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
 1178  return true;
1179  }
1180 
1181  return false;
1182 }
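// Note that an instruction without memory operands is handled conservatively:
// it is assumed to possibly access LDS, so the caller records it via
// setPendingFlat() and later dependencies force both counters to zero.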
1183 
1184 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1185  WaitcntBrackets *ScoreBrackets) {
1186  // Now look at the instruction opcode. If it is a memory access
1187  // instruction, update the upper-bound of the appropriate counter's
1188  // bracket and the destination operand scores.
1189  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1190  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1191  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1192  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1193  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1194  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1195  } else {
1196  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1197  }
1198  } else if (TII->isFLAT(Inst)) {
1199  assert(Inst.mayLoad() || Inst.mayStore());
1200 
1201  if (TII->usesVM_CNT(Inst)) {
1202  if (!ST->hasVscnt())
1203  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1204  else if (Inst.mayLoad() &&
1205  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
1206  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1207  else
1208  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1209  }
1210 
1211  if (TII->usesLGKM_CNT(Inst)) {
1212  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1213 
1214  // This is a flat memory operation, so note it - it will require
1215  // that both the VM and LGKM be flushed to zero if it is pending when
1216  // a VM or LGKM dependency occurs.
1217  if (mayAccessLDSThroughFlat(Inst))
1218  ScoreBrackets->setPendingFlat();
1219  }
1220  } else if (SIInstrInfo::isVMEM(Inst) &&
1221  // TODO: get a better carve out.
1222  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1223  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1224  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1225  Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1226  Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1227  if (!ST->hasVscnt())
1228  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1229  else if ((Inst.mayLoad() &&
1230  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
1231  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1232  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1233  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1234  else if (Inst.mayStore())
1235  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1236 
1237  if (ST->vmemWriteNeedsExpWaitcnt() &&
1238  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1239  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1240  }
1241  } else if (TII->isSMRD(Inst)) {
1242  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1243  } else if (Inst.isCall()) {
1244  if (callWaitsOnFunctionReturn(Inst)) {
1245  // Act as a wait on everything
1246  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
1247  } else {
 1248  // May need to wait for anything.
1249  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1250  }
1251  } else {
1252  switch (Inst.getOpcode()) {
1253  case AMDGPU::S_SENDMSG:
1254  case AMDGPU::S_SENDMSGHALT:
1255  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1256  break;
1257  case AMDGPU::EXP:
1258  case AMDGPU::EXP_DONE: {
1259  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1260  if (Imm >= 32 && Imm <= 63)
1261  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1262  else if (Imm >= 12 && Imm <= 15)
1263  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1264  else
1265  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1266  break;
1267  }
1268  case AMDGPU::S_MEMTIME:
1269  case AMDGPU::S_MEMREALTIME:
1270  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1271  break;
1272  default:
1273  break;
1274  }
1275  }
1276 }
1277 
1278 bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1279  uint32_t OtherScore) {
1280  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1281  uint32_t OtherShifted =
1282  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1283  Score = std::max(MyShifted, OtherShifted);
1284  return OtherShifted > MyShifted;
1285 }
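// mergeScore() rebases the two incoming scores onto a common scale using the
// precomputed shifts: a score at or below its old lower bound is already
// satisfied and collapses to 0, anything else is shifted up, and the merged
// score is the maximum of the two. The return value reports whether the other
// block's score was the strictly larger one.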
1286 
1287 /// Merge the pending events and associated score brackets of \p Other into
1288 /// this brackets status.
1289 ///
1290 /// Returns whether the merge resulted in a change that requires tighter waits
1291 /// (i.e. the merged brackets strictly dominate the original brackets).
1292 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1293  bool StrictDom = false;
1294 
1295  for (auto T : inst_counter_types()) {
1296  // Merge event flags for this counter
1297  const bool OldOutOfOrder = counterOutOfOrder(T);
1298  const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1299  const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1300  if (OtherEvents & ~OldEvents)
1301  StrictDom = true;
1302  if (Other.MixedPendingEvents[T] ||
1303  (OldEvents && OtherEvents && OldEvents != OtherEvents))
1304  MixedPendingEvents[T] = true;
1305  PendingEvents |= OtherEvents;
1306 
1307  // Merge scores for this counter
1308  const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1309  const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1310  MergeInfo M;
1311  M.OldLB = ScoreLBs[T];
1312  M.OtherLB = Other.ScoreLBs[T];
1313  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1314  M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1315 
1316  const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1317  if (NewUB < ScoreUBs[T])
1318  report_fatal_error("waitcnt score overflow");
1319  ScoreUBs[T] = NewUB;
1320  ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1321 
1322  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1323 
1324  bool RegStrictDom = false;
1325  for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1326  J++) {
1327  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1328  }
1329 
1330  if (T == LGKM_CNT) {
1331  for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1332  J != E; J++) {
1333  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1334  }
1335  }
1336 
1337  if (RegStrictDom && !OldOutOfOrder)
1338  StrictDom = true;
1339  }
1340 
1341  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1342  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1343 
1344  return StrictDom;
1345 }
1346 
1347 // Generate s_waitcnt instructions where needed.
1348 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1349  MachineBasicBlock &Block,
1350  WaitcntBrackets &ScoreBrackets) {
1351  bool Modified = false;
1352 
1353  LLVM_DEBUG({
1354  dbgs() << "*** Block" << Block.getNumber() << " ***";
1355  ScoreBrackets.dump();
1356  });
1357 
1358  // Walk over the instructions.
1359  MachineInstr *OldWaitcntInstr = nullptr;
1360 
1361  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1362  Iter != E;) {
1363  MachineInstr &Inst = *Iter;
1364 
1365  // Track pre-existing waitcnts from earlier iterations.
1366  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1367  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1368  Inst.getOperand(0).isReg() &&
1369  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1370  if (!OldWaitcntInstr)
1371  OldWaitcntInstr = &Inst;
1372  ++Iter;
1373  continue;
1374  }
1375 
1376  bool VCCZBugWorkAround = false;
1377  if (readsVCCZ(Inst) &&
1378  (!VCCZBugHandledSet.count(&Inst))) {
1379  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1380  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1381  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1382  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1383  VCCZBugWorkAround = true;
1384  }
1385  }
1386 
1387  // Generate an s_waitcnt instruction to be placed before
1388  // cur_Inst, if needed.
1389  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1390  OldWaitcntInstr = nullptr;
1391 
1392  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1393 
1394 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1395  // If this instruction generates a S_SETVSKIP because it is an
1396  // indexed resource, and we are on Tahiti, then it will also force
1397  // an S_WAITCNT vmcnt(0)
1398  if (RequireCheckResourceType(Inst, context)) {
1399  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1400  ScoreBrackets->setScoreLB(VM_CNT,
1401  ScoreBrackets->getScoreUB(VM_CNT));
1402  }
1403 #endif
1404 
1405  LLVM_DEBUG({
1406  Inst.print(dbgs());
1407  ScoreBrackets.dump();
1408  });
1409 
1410  // Check to see if this is a GWS instruction. If so, and if this is CI or
1411  // VI, then the generated code sequence will include an S_WAITCNT 0.
1412  // TODO: Are these the only GWS instructions?
1413  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1414  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1415  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1416  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1417  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1418  // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1419  ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1420  }
1421 
1422  // TODO: Remove this work-around after fixing the scheduler and enable the
1423  // assert above.
1424  if (VCCZBugWorkAround) {
1425  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1426  // bit is updated, so we can restore the bit by reading the value of
1427  // vcc and then writing it back to the register.
1428  BuildMI(Block, Inst, Inst.getDebugLoc(),
1429  TII->get(AMDGPU::S_MOV_B64),
1430  AMDGPU::VCC)
1431  .addReg(AMDGPU::VCC);
1432  VCCZBugHandledSet.insert(&Inst);
1433  Modified = true;
1434  }
1435 
1436  ++Iter;
1437  }
1438 
1439  return Modified;
1440 }
1441 
1442 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1443  ST = &MF.getSubtarget<GCNSubtarget>();
1444  TII = ST->getInstrInfo();
1445  TRI = &TII->getRegisterInfo();
1446  MRI = &MF.getRegInfo();
1447  IV = AMDGPU::getIsaVersion(ST->getCPU());
 1448  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 1449 
1450  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1451  for (auto T : inst_counter_types())
1452  ForceEmitWaitcnt[T] = false;
1453 
1454  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1455  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1456  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1457  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1458 
1459  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1460  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1461  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1462  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1463 
1464  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1465  RegisterEncoding.VGPRL =
1466  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1467  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1468  RegisterEncoding.SGPRL =
1469  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1470 
1471  TrackedWaitcntSet.clear();
1472  VCCZBugHandledSet.clear();
1473  RpotIdxMap.clear();
1474  BlockInfos.clear();
1475 
1476  // Keep iterating over the blocks in reverse post order, inserting and
 1477  // updating s_waitcnt where needed, until a fixed point is reached.
1478  for (MachineBasicBlock *MBB :
 1479  ReversePostOrderTraversal<MachineFunction *>(&MF)) {
 1480  RpotIdxMap[MBB] = BlockInfos.size();
1481  BlockInfos.emplace_back(MBB);
1482  }
1483 
1484  std::unique_ptr<WaitcntBrackets> Brackets;
1485  bool Modified = false;
1486  bool Repeat;
1487  do {
1488  Repeat = false;
1489 
1490  for (BlockInfo &BI : BlockInfos) {
1491  if (!BI.Dirty)
1492  continue;
1493 
1494  unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1495 
1496  if (BI.Incoming) {
1497  if (!Brackets)
1498  Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1499  else
1500  *Brackets = *BI.Incoming;
1501  } else {
1502  if (!Brackets)
1503  Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1504  else
1505  Brackets->clear();
1506  }
1507 
1508  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1509  BI.Dirty = false;
1510 
1511  if (Brackets->hasPending()) {
1512  BlockInfo *MoveBracketsToSucc = nullptr;
1513  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1514  unsigned SuccIdx = RpotIdxMap[Succ];
1515  BlockInfo &SuccBI = BlockInfos[SuccIdx];
1516  if (!SuccBI.Incoming) {
1517  SuccBI.Dirty = true;
1518  if (SuccIdx <= Idx)
1519  Repeat = true;
1520  if (!MoveBracketsToSucc) {
1521  MoveBracketsToSucc = &SuccBI;
1522  } else {
1523  SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1524  }
1525  } else if (SuccBI.Incoming->merge(*Brackets)) {
1526  SuccBI.Dirty = true;
1527  if (SuccIdx <= Idx)
1528  Repeat = true;
1529  }
1530  }
1531  if (MoveBracketsToSucc)
1532  MoveBracketsToSucc->Incoming = std::move(Brackets);
1533  }
1534  }
1535  } while (Repeat);
1536 
 1537  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
 1538 
1539  bool HaveScalarStores = false;
1540 
1541  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1542  ++BI) {
1543  MachineBasicBlock &MBB = *BI;
1544 
1545  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1546  ++I) {
1547  if (!HaveScalarStores && TII->isScalarStore(*I))
1548  HaveScalarStores = true;
1549 
1550  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1551  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1552  EndPgmBlocks.push_back(&MBB);
1553  }
1554  }
1555 
1556  if (HaveScalarStores) {
1557  // If scalar writes are used, the cache must be flushed or else the next
1558  // wave to reuse the same scratch memory can be clobbered.
1559  //
1560  // Insert s_dcache_wb at wave termination points if there were any scalar
1561  // stores, and only if the cache hasn't already been flushed. This could be
1562  // improved by looking across blocks for flushes in postdominating blocks
1563  // from the stores but an explicitly requested flush is probably very rare.
1564  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1565  bool SeenDCacheWB = false;
1566 
1567  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1568  ++I) {
1569  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1570  SeenDCacheWB = true;
1571  else if (TII->isScalarStore(*I))
1572  SeenDCacheWB = false;
1573 
1574  // FIXME: It would be better to insert this before a waitcnt if any.
1575  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1576  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1577  !SeenDCacheWB) {
1578  Modified = true;
1579  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1580  }
1581  }
1582  }
1583  }
1584 
1585  if (!MFI->isEntryFunction()) {
1586  // Wait for any outstanding memory operations that the input registers may
 1587  // depend on. We can't track them and it's better to do the wait after the
1588  // costly call sequence.
1589 
1590  // TODO: Could insert earlier and schedule more liberally with operations
1591  // that only use caller preserved registers.
1592  MachineBasicBlock &EntryBB = MF.front();
1593  if (ST->hasVscnt())
1594  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
1595  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1596  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1597  .addImm(0);
1598  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1599  .addImm(0);
1600 
1601  Modified = true;
1602  }
1603 
1604  return Modified;
1605 }
Definition: CommandLine.h:432
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:517
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Represent the analysis usage information of a pass.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
Definition: ilist_node.h:81
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:73
RegisterMapping
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
const MachineBasicBlock & front() const
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
bool isDebugInstr() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:501
IsaVersion getIsaVersion(StringRef GPU)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
char & SIInsertWaitcntsID
Iterator for intrusive lists based on ilist_node.
MachineOperand class - Representation of each machine instruction operand.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:301
int64_t getImm() const
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
A range adaptor for a pair of iterators.
static Waitcnt allZero(const IsaVersion &Version)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
static Waitcnt allZeroExceptVsCnt()
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:211
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:63
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_END(SIInsertWaitcnts
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:441
#define DEBUG_TYPE
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:337
uint32_t Size
Definition: Profile.cpp:46
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:808
bool memoperands_empty() const
Return true if we don&#39;t have any memory operands which described the memory access done by this instr...
Definition: MachineInstr.h:547
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
LLVM Value Representation.
Definition: Value.h:72
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getLgkmcntBitMask(const IsaVersion &Version)
ProcessInfo Wait(const ProcessInfo &PI, unsigned SecondsToWait, bool WaitUntilTerminates, std::string *ErrMsg=nullptr)
This function waits for the process specified by PI to finish.
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1966
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
WaitEventType
unsigned getVmcntBitMask(const IsaVersion &Version)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
const SIRegisterInfo * getRegisterInfo() const override