SIInsertWaitcnts.cpp (LLVM 9.0.0svn)
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
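/// For example, if an SMEM load and two LDS loads are in flight and the next
/// instruction only needs the older of the two LDS results, a per-event
/// timeline could make do with lgkmcnt(1), whereas the per-counter timeline
/// emits lgkmcnt(0) because SMEM and LDS events share LGKM_CNT.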
23 //
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPU.h"
27 #include "AMDGPUSubtarget.h"
28 #include "SIDefines.h"
29 #include "SIInstrInfo.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIRegisterInfo.h"
32 #include "Utils/AMDGPUBaseInfo.h"
33 #include "llvm/ADT/DenseMap.h"
34 #include "llvm/ADT/DenseSet.h"
35 #include "llvm/ADT/PostOrderIterator.h"
36 #include "llvm/ADT/STLExtras.h"
37 #include "llvm/ADT/SmallVector.h"
38 #include "llvm/CodeGen/MachineBasicBlock.h"
39 #include "llvm/CodeGen/MachineFunction.h"
40 #include "llvm/CodeGen/MachineFunctionPass.h"
41 #include "llvm/CodeGen/MachineInstr.h"
42 #include "llvm/CodeGen/MachineInstrBuilder.h"
43 #include "llvm/CodeGen/MachineMemOperand.h"
44 #include "llvm/CodeGen/MachineOperand.h"
45 #include "llvm/CodeGen/MachineRegisterInfo.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/DebugCounter.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Support/raw_ostream.h"
52 #include <algorithm>
53 #include <cassert>
54 #include <cstdint>
55 #include <cstring>
56 #include <memory>
57 #include <utility>
58 #include <vector>
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "si-insert-waitcnts"
63 
64 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
65  "Force emit s_waitcnt expcnt(0) instrs");
66 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
67  "Force emit s_waitcnt lgkmcnt(0) instrs");
68 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
69  "Force emit s_waitcnt vmcnt(0) instrs");
70 
72  "amdgpu-waitcnt-forcezero",
73  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74  cl::init(0), cl::Hidden);
75 
76 namespace {
77 
78 template <typename EnumT>
79 class enum_iterator
80  : public iterator_facade_base<enum_iterator<EnumT>,
81  std::forward_iterator_tag, const EnumT> {
82  EnumT Value;
83 public:
84  enum_iterator() = default;
85  enum_iterator(EnumT Value) : Value(Value) {}
86 
87  enum_iterator &operator++() {
88  Value = static_cast<EnumT>(Value + 1);
89  return *this;
90  }
91 
92  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93 
94  EnumT operator*() const { return Value; }
95 };
96 
97 // Class of object that encapsulates the latest instruction counter score
98 // associated with the operand. Used for determining whether an
99 // s_waitcnt instruction needs to be emitted.
100 
101 #define CNT_MASK(t) (1u << (t))
102 
103 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
104 
105 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106  return make_range(enum_iterator<InstCounterType>(VM_CNT),
107  enum_iterator<InstCounterType>(NUM_INST_CNTS));
108 }
109 
110 using RegInterval = std::pair<signed, signed>;
111 
112 struct {
113  uint32_t VmcntMax;
114  uint32_t ExpcntMax;
115  uint32_t LgkmcntMax;
116  int32_t NumVGPRsMax;
117  int32_t NumSGPRsMax;
118 } HardwareLimits;
119 
120 struct {
121  unsigned VGPR0;
122  unsigned VGPRL;
123  unsigned SGPR0;
124  unsigned SGPRL;
125 } RegisterEncoding;
126 
127 enum WaitEventType {
128  VMEM_ACCESS, // vector-memory read & write
129  LDS_ACCESS, // lds read & write
130  GDS_ACCESS, // gds read & write
131  SQ_MESSAGE, // send message
132  SMEM_ACCESS, // scalar-memory read & write
133  EXP_GPR_LOCK, // export holding on its data src
134  GDS_GPR_LOCK, // GDS holding on its data and addr src
135  EXP_POS_ACCESS, // write to export position
136  EXP_PARAM_ACCESS, // write to export parameter
137  VMW_GPR_LOCK, // vector-memory write holding on its data src
138  NUM_WAIT_EVENTS,
139 };
140 
141 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
142  (1 << VMEM_ACCESS),
143  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
144  (1 << SQ_MESSAGE),
145  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
146  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
147 };
148 
149 // The mapping is:
150 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
151 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
152 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
153 // We reserve a fixed number of VGPR slots in the scoring tables for
154 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
155 enum RegisterMapping {
156  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
157  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
158  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
159  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
160  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
161 };
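// Illustrative slot assignments under this mapping: a VGPR with encoding
// RegisterEncoding.VGPR0 + 5 (v5) uses slot 5, the LDS token uses slot
// SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS == 256, and an SGPR with encoding
// RegisterEncoding.SGPR0 + 3 (s3) uses slot NUM_ALL_VGPRS + 3 == 260.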
162 
163 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
164  switch (T) {
165  case VM_CNT:
166  Wait.VmCnt = std::min(Wait.VmCnt, Count);
167  break;
168  case EXP_CNT:
169  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
170  break;
171  case LGKM_CNT:
172  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
173  break;
174  default:
175  llvm_unreachable("bad InstCounterType");
176  }
177 }
178 
179 // This object maintains the current score brackets of each wait counter, and
180 // a per-register scoreboard for each wait counter.
181 //
182 // We also maintain the latest score for every event type that can change the
183 // waitcnt in order to know whether multiple types of events are within
184 // the brackets. When multiple event types happen within a bracket, the
185 // wait count may be decremented out of order, so we need to put in an
186 // "s_waitcnt 0" before use.
187 class WaitcntBrackets {
188 public:
189  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
190  for (auto T : inst_counter_types())
191  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
192  }
193 
194  static uint32_t getWaitCountMax(InstCounterType T) {
195  switch (T) {
196  case VM_CNT:
197  return HardwareLimits.VmcntMax;
198  case LGKM_CNT:
199  return HardwareLimits.LgkmcntMax;
200  case EXP_CNT:
201  return HardwareLimits.ExpcntMax;
202  default:
203  break;
204  }
205  return 0;
206  }
207 
208  uint32_t getScoreLB(InstCounterType T) const {
209  assert(T < NUM_INST_CNTS);
210  if (T >= NUM_INST_CNTS)
211  return 0;
212  return ScoreLBs[T];
213  }
214 
215  uint32_t getScoreUB(InstCounterType T) const {
216  assert(T < NUM_INST_CNTS);
217  if (T >= NUM_INST_CNTS)
218  return 0;
219  return ScoreUBs[T];
220  }
221 
222  // Mapping from event to counter.
223  InstCounterType eventCounter(WaitEventType E) {
224  if (E == VMEM_ACCESS)
225  return VM_CNT;
226  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
227  return LGKM_CNT;
228  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
229  return EXP_CNT;
230  }
231 
232  uint32_t getRegScore(int GprNo, InstCounterType T) {
233  if (GprNo < NUM_ALL_VGPRS) {
234  return VgprScores[T][GprNo];
235  }
236  assert(T == LGKM_CNT);
237  return SgprScores[GprNo - NUM_ALL_VGPRS];
238  }
239 
240  void clear() {
241  memset(ScoreLBs, 0, sizeof(ScoreLBs));
242  memset(ScoreUBs, 0, sizeof(ScoreUBs));
243  PendingEvents = 0;
244  memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
245  for (auto T : inst_counter_types())
246  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
247  memset(SgprScores, 0, sizeof(SgprScores));
248  }
249 
250  bool merge(const WaitcntBrackets &Other);
251 
252  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
253  const MachineRegisterInfo *MRI,
254  const SIRegisterInfo *TRI, unsigned OpNo,
255  bool Def) const;
256 
257  int32_t getMaxVGPR() const { return VgprUB; }
258  int32_t getMaxSGPR() const { return SgprUB; }
259 
260  bool counterOutOfOrder(InstCounterType T) const;
261  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
262  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
263  void determineWait(InstCounterType T, uint32_t ScoreToWait,
264  AMDGPU::Waitcnt &Wait) const;
265  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
266  void applyWaitcnt(InstCounterType T, unsigned Count);
267  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
268  const MachineRegisterInfo *MRI, WaitEventType E,
269  MachineInstr &MI);
270 
271  bool hasPending() const { return PendingEvents != 0; }
272  bool hasPendingEvent(WaitEventType E) const {
273  return PendingEvents & (1 << E);
274  }
275 
276  bool hasPendingFlat() const {
277  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
278  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
279  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
280  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
281  }
282 
283  void setPendingFlat() {
284  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
285  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
286  }
287 
288  void print(raw_ostream &);
289  void dump() { print(dbgs()); }
290 
291 private:
292  struct MergeInfo {
293  uint32_t OldLB;
294  uint32_t OtherLB;
295  uint32_t MyShift;
296  uint32_t OtherShift;
297  };
298  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
299  uint32_t OtherScore);
300 
301  void setScoreLB(InstCounterType T, uint32_t Val) {
302  assert(T < NUM_INST_CNTS);
303  if (T >= NUM_INST_CNTS)
304  return;
305  ScoreLBs[T] = Val;
306  }
307 
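 // Note: for EXP_CNT the lower bound is pulled up so the bracket never spans
 // more than getWaitCountMax(EXP_CNT) events; scores older than that window
 // can no longer be distinguished once the hardware counter would saturate.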
308  void setScoreUB(InstCounterType T, uint32_t Val) {
309  assert(T < NUM_INST_CNTS);
310  if (T >= NUM_INST_CNTS)
311  return;
312  ScoreUBs[T] = Val;
313  if (T == EXP_CNT) {
314  uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
315  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
316  ScoreLBs[T] = UB;
317  }
318  }
319 
320  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
321  if (GprNo < NUM_ALL_VGPRS) {
322  if (GprNo > VgprUB) {
323  VgprUB = GprNo;
324  }
325  VgprScores[T][GprNo] = Val;
326  } else {
327  assert(T == LGKM_CNT);
328  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
329  SgprUB = GprNo - NUM_ALL_VGPRS;
330  }
331  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
332  }
333  }
334 
335  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
336  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
337  unsigned OpNo, uint32_t Val);
338 
339  const GCNSubtarget *ST = nullptr;
340  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
341  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
342  uint32_t PendingEvents = 0;
343  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
344  // Remember the last flat memory operation.
345  uint32_t LastFlat[NUM_INST_CNTS] = {0};
346  // wait_cnt scores for every vgpr.
347  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
348  int32_t VgprUB = 0;
349  int32_t SgprUB = 0;
350  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
351  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
352  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
353 };
354 
355 class SIInsertWaitcnts : public MachineFunctionPass {
356 private:
357  const GCNSubtarget *ST = nullptr;
358  const SIInstrInfo *TII = nullptr;
359  const SIRegisterInfo *TRI = nullptr;
360  const MachineRegisterInfo *MRI = nullptr;
361  AMDGPU::IsaVersion IV;
362 
363  DenseSet<MachineInstr *> TrackedWaitcntSet;
364  DenseSet<MachineInstr *> VCCZBugHandledSet;
365 
366  struct BlockInfo {
367  MachineBasicBlock *MBB;
368  std::unique_ptr<WaitcntBrackets> Incoming;
369  bool Dirty = true;
370 
371  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
372  };
373 
374  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
375  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
376 
377  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
378  // because of amdgpu-waitcnt-forcezero flag
379  bool ForceEmitZeroWaitcnts;
380  bool ForceEmitWaitcnt[NUM_INST_CNTS];
381 
382 public:
383  static char ID;
384 
385  SIInsertWaitcnts() : MachineFunctionPass(ID) {
386  (void)ForceExpCounter;
387  (void)ForceLgkmCounter;
388  (void)ForceVMCounter;
389  }
390 
391  bool runOnMachineFunction(MachineFunction &MF) override;
392 
393  StringRef getPassName() const override {
394  return "SI insert wait instructions";
395  }
396 
397  void getAnalysisUsage(AnalysisUsage &AU) const override {
398  AU.setPreservesCFG();
399  MachineFunctionPass::getAnalysisUsage(AU);
400  }
401 
402  bool isForceEmitWaitcnt() const {
403  for (auto T : inst_counter_types())
404  if (ForceEmitWaitcnt[T])
405  return true;
406  return false;
407  }
408 
409  void setForceEmitWaitcnt() {
410 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
411 // For debug builds, get the debug counter info and adjust if need be
412 #ifndef NDEBUG
413  if (DebugCounter::isCounterSet(ForceExpCounter) &&
414  DebugCounter::shouldExecute(ForceExpCounter)) {
415  ForceEmitWaitcnt[EXP_CNT] = true;
416  } else {
417  ForceEmitWaitcnt[EXP_CNT] = false;
418  }
419 
420  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
421  DebugCounter::shouldExecute(ForceLgkmCounter)) {
422  ForceEmitWaitcnt[LGKM_CNT] = true;
423  } else {
424  ForceEmitWaitcnt[LGKM_CNT] = false;
425  }
426 
427  if (DebugCounter::isCounterSet(ForceVMCounter) &&
428  DebugCounter::shouldExecute(ForceVMCounter)) {
429  ForceEmitWaitcnt[VM_CNT] = true;
430  } else {
431  ForceEmitWaitcnt[VM_CNT] = false;
432  }
433 #endif // NDEBUG
434  }
435 
436  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
437  bool generateWaitcntInstBefore(MachineInstr &MI,
438  WaitcntBrackets &ScoreBrackets,
439  MachineInstr *OldWaitcntInstr);
440  void updateEventWaitcntAfter(MachineInstr &Inst,
441  WaitcntBrackets *ScoreBrackets);
442  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
443  WaitcntBrackets &ScoreBrackets);
444 };
445 
446 } // end anonymous namespace
447 
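// getRegInterval returns a half-open range of scoreboard slots; for example,
// an operand in a 64-bit VGPR class starting at v4 covers slots [4, 6).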
448 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
449  const SIInstrInfo *TII,
450  const MachineRegisterInfo *MRI,
451  const SIRegisterInfo *TRI,
452  unsigned OpNo, bool Def) const {
453  const MachineOperand &Op = MI->getOperand(OpNo);
454  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
455  (Def && !Op.isDef()))
456  return {-1, -1};
457 
458  // A use via a partial-write (PW) operand does not need a waitcnt.
459  // A partial write is not a WAW hazard.
460  assert(!Op.getSubReg() || !Op.isUndef());
461 
462  RegInterval Result;
463  const MachineRegisterInfo &MRIA = *MRI;
464 
465  unsigned Reg = TRI->getEncodingValue(Op.getReg());
466 
467  if (TRI->isVGPR(MRIA, Op.getReg())) {
468  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
469  Result.first = Reg - RegisterEncoding.VGPR0;
470  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
471  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
472  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
473  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
474  assert(Result.first >= NUM_ALL_VGPRS &&
475  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
476  }
477  // TODO: Handle TTMP
478  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
479  else
480  return {-1, -1};
481 
482  const MachineInstr &MIA = *MI;
483  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
484  unsigned Size = TRI->getRegSizeInBits(*RC);
485  Result.second = Result.first + (Size / 32);
486 
487  return Result;
488 }
489 
490 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
491  const SIInstrInfo *TII,
492  const SIRegisterInfo *TRI,
493  const MachineRegisterInfo *MRI, unsigned OpNo,
494  uint32_t Val) {
495  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
496  LLVM_DEBUG({
497  const MachineOperand &Opnd = MI->getOperand(OpNo);
498  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
499  });
500  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
501  setRegScore(RegNo, EXP_CNT, Val);
502  }
503 }
504 
505 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
506  const SIRegisterInfo *TRI,
507  const MachineRegisterInfo *MRI,
508  WaitEventType E, MachineInstr &Inst) {
509  const MachineRegisterInfo &MRIA = *MRI;
510  InstCounterType T = eventCounter(E);
511  uint32_t CurrScore = getScoreUB(T) + 1;
512  if (CurrScore == 0)
513  report_fatal_error("InsertWaitcnt score wraparound");
514  // PendingEvents and ScoreUB need to be updated regardless of whether this
515  // event changes the score of a register or not.
516  // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
517  if (!hasPendingEvent(E)) {
518  if (PendingEvents & WaitEventMaskForInst[T])
519  MixedPendingEvents[T] = true;
520  PendingEvents |= 1 << E;
521  }
522  setScoreUB(T, CurrScore);
523 
524  if (T == EXP_CNT) {
525  // Put score on the source vgprs. If this is a store, just use those
526  // specific register(s).
527  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
528  // All GDS operations must protect their address register (same as
529  // export.)
530  if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
531  Inst.getOpcode() != AMDGPU::DS_CONSUME) {
532  setExpScore(
533  &Inst, TII, TRI, MRI,
534  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
535  CurrScore);
536  }
537  if (Inst.mayStore()) {
538  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
539  AMDGPU::OpName::data0) != -1) {
540  setExpScore(
541  &Inst, TII, TRI, MRI,
542  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
543  CurrScore);
544  }
545  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
546  AMDGPU::OpName::data1) != -1) {
547  setExpScore(&Inst, TII, TRI, MRI,
548  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
549  AMDGPU::OpName::data1),
550  CurrScore);
551  }
552  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
553  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
554  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
555  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
556  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
557  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
558  Inst.getOpcode() != AMDGPU::DS_APPEND &&
559  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
560  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
561  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
562  const MachineOperand &Op = Inst.getOperand(I);
563  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
564  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
565  }
566  }
567  }
568  } else if (TII->isFLAT(Inst)) {
569  if (Inst.mayStore()) {
570  setExpScore(
571  &Inst, TII, TRI, MRI,
572  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
573  CurrScore);
574  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
575  setExpScore(
576  &Inst, TII, TRI, MRI,
577  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
578  CurrScore);
579  }
580  } else if (TII->isMIMG(Inst)) {
581  if (Inst.mayStore()) {
582  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
583  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
584  setExpScore(
585  &Inst, TII, TRI, MRI,
586  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
587  CurrScore);
588  }
589  } else if (TII->isMTBUF(Inst)) {
590  if (Inst.mayStore()) {
591  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
592  }
593  } else if (TII->isMUBUF(Inst)) {
594  if (Inst.mayStore()) {
595  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
596  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
597  setExpScore(
598  &Inst, TII, TRI, MRI,
599  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
600  CurrScore);
601  }
602  } else {
603  if (TII->isEXP(Inst)) {
604  // For export the destination registers are really temps that
605  // can be used as the actual source after export patching, so
606  // we need to treat them like sources and set the EXP_CNT
607  // score.
608  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
609  MachineOperand &DefMO = Inst.getOperand(I);
610  if (DefMO.isReg() && DefMO.isDef() &&
611  TRI->isVGPR(MRIA, DefMO.getReg())) {
612  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
613  CurrScore);
614  }
615  }
616  }
617  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
618  MachineOperand &MO = Inst.getOperand(I);
619  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
620  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
621  }
622  }
623  }
624 #if 0 // TODO: check if this is handled by MUBUF code above.
625  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
626  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
627  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
628  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
629  unsigned OpNo;//TODO: find the OpNo for this operand;
630  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
631  for (signed RegNo = Interval.first; RegNo < Interval.second;
632  ++RegNo) {
633  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
634  }
635 #endif
636  } else {
637  // Match the score to the destination registers.
638  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
639  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
640  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
641  continue;
642  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
643  setRegScore(RegNo, T, CurrScore);
644  }
645  }
646  if (TII->isDS(Inst) && Inst.mayStore()) {
647  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
648  }
649  }
650 }
651 
652 void WaitcntBrackets::print(raw_ostream &OS) {
653  OS << '\n';
654  for (auto T : inst_counter_types()) {
655  uint32_t LB = getScoreLB(T);
656  uint32_t UB = getScoreUB(T);
657 
658  switch (T) {
659  case VM_CNT:
660  OS << " VM_CNT(" << UB - LB << "): ";
661  break;
662  case LGKM_CNT:
663  OS << " LGKM_CNT(" << UB - LB << "): ";
664  break;
665  case EXP_CNT:
666  OS << " EXP_CNT(" << UB - LB << "): ";
667  break;
668  default:
669  OS << " UNKNOWN(" << UB - LB << "): ";
670  break;
671  }
672 
673  if (LB < UB) {
674  // Print vgpr scores.
675  for (int J = 0; J <= getMaxVGPR(); J++) {
676  uint32_t RegScore = getRegScore(J, T);
677  if (RegScore <= LB)
678  continue;
679  uint32_t RelScore = RegScore - LB - 1;
680  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
681  OS << RelScore << ":v" << J << " ";
682  } else {
683  OS << RelScore << ":ds ";
684  }
685  }
686  // Also need to print sgpr scores for lgkm_cnt.
687  if (T == LGKM_CNT) {
688  for (int J = 0; J <= getMaxSGPR(); J++) {
689  uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
690  if (RegScore <= LB)
691  continue;
692  uint32_t RelScore = RegScore - LB - 1;
693  OS << RelScore << ":s" << J << " ";
694  }
695  }
696  }
697  OS << '\n';
698  }
699  OS << '\n';
700 }
701 
702 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
703 /// whether a waitcnt instruction is needed at all.
704 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
705  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
706  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
707  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
708 }
709 
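// Per-counter variant: returns true if a wait with immediate Count is still
// required for counter T; otherwise the requested wait is already covered by
// the score bracket, so Count is reset to ~0u (no wait) and false is returned.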
710 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
711  unsigned &Count) const {
712  const uint32_t LB = getScoreLB(T);
713  const uint32_t UB = getScoreUB(T);
714  if (Count < UB && UB - Count > LB)
715  return true;
716 
717  Count = ~0u;
718  return false;
719 }
720 
721 void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
722  AMDGPU::Waitcnt &Wait) const {
723  // If the score of src_operand falls within the bracket, we need an
724  // s_waitcnt instruction.
725  const uint32_t LB = getScoreLB(T);
726  const uint32_t UB = getScoreUB(T);
727  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
728  if ((T == VM_CNT || T == LGKM_CNT) &&
729  hasPendingFlat() &&
730  !ST->hasFlatLgkmVMemCountInOrder()) {
731  // If there is a pending FLAT operation, and this is a VMem or LGKM
732  // waitcnt and the target can report early completion, then we need
733  // to force a waitcnt 0.
734  addWait(Wait, T, 0);
735  } else if (counterOutOfOrder(T)) {
736  // The counter can get decremented out of order when there
737  // are multiple event types in the bracket, so also emit an s_waitcnt
738  // with a conservative value of 0 for this counter.
739  addWait(Wait, T, 0);
740  } else {
741  addWait(Wait, T, UB - ScoreToWait);
742  }
743  }
744 }
745 
746 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
747  applyWaitcnt(VM_CNT, Wait.VmCnt);
748  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
749  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
750 }
751 
752 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
753  const uint32_t UB = getScoreUB(T);
754  if (Count >= UB)
755  return;
756  if (Count != 0) {
757  if (counterOutOfOrder(T))
758  return;
759  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
760  } else {
761  setScoreLB(T, UB);
762  MixedPendingEvents[T] = false;
763  PendingEvents &= ~WaitEventMaskForInst[T];
764  }
765 }
766 
767 // Where there are multiple types of event in the bracket of a counter,
768 // the decrement may go out of order.
769 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
770  // Scalar memory read always can go out of order.
771  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
772  return true;
773  return MixedPendingEvents[T];
774 }
775 
776 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
777  false)
778 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
779  false)
780 
781 char SIInsertWaitcnts::ID = 0;
782 
783 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
784 
785 FunctionPass *llvm::createSIInsertWaitcntsPass() {
786  return new SIInsertWaitcnts();
787 }
788 
789 static bool readsVCCZ(const MachineInstr &MI) {
790  unsigned Opc = MI.getOpcode();
791  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
792  !MI.getOperand(1).isUndef();
793 }
794 
795 /// Generate an s_waitcnt instruction to be placed before cur_Inst, if needed.
796 /// Instructions of a given type complete in order,
797 /// but instructions of different types can complete out of order.
798 /// We rely on this in-order completion
799 /// and simply assign a score to each memory access instruction.
800 /// We keep track of the active "score bracket" to determine
801 /// whether a memory access requires an s_waitcnt
802 /// and, if so, what the value of each counter should be.
803 /// The "score bracket" is bounded by the lower-bound and upper-bound
804 /// scores (*_score_LB and *_score_ub respectively).
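/// For example, if the VM_CNT bracket is (LB = 3, UB = 7] and the operand being
/// read last received score 5, then four vector-memory events are outstanding
/// and only the two with scores 6 and 7 are younger than the operand, so an
/// s_waitcnt vmcnt(2) (i.e. UB - score) suffices when vmcnt decrements in order.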
805 bool SIInsertWaitcnts::generateWaitcntInstBefore(
806  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
807  MachineInstr *OldWaitcntInstr) {
808  setForceEmitWaitcnt();
809  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
810 
811  if (MI.isDebugInstr())
812  return false;
813 
814  AMDGPU::Waitcnt Wait;
815 
816  // See if this instruction has a forced S_WAITCNT VM.
817  // TODO: Handle other cases of NeedsWaitcntVmBefore()
818  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
819  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
820  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
821  Wait.VmCnt = 0;
822  }
823 
824  // All waits must be resolved at call return.
825  // NOTE: this could be improved with knowledge of all call sites or
826  // with knowledge of the called routines.
827  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
828  MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
829  Wait = AMDGPU::Waitcnt::allZero();
830  }
831  // Resolve vm waits before gs-done.
832  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
833  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
834  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
835  AMDGPU::SendMsg::ID_GS_DONE)) {
836  Wait.VmCnt = 0;
837  }
838 #if 0 // TODO: the following blocks of logic when we have fence.
839  else if (MI.getOpcode() == SC_FENCE) {
840  const unsigned int group_size =
841  context->shader_info->GetMaxThreadGroupSize();
842  // group_size == 0 means thread group size is unknown at compile time
843  const bool group_is_multi_wave =
844  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
845  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
846 
847  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
848  SCRegType src_type = Inst->GetSrcType(i);
849  switch (src_type) {
850  case SCMEM_LDS:
851  if (group_is_multi_wave ||
852  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
853  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
854  ScoreBrackets->getScoreUB(LGKM_CNT));
855  // LDS may have to wait for VM_CNT after buffer load to LDS
856  if (target_info->HasBufferLoadToLDS()) {
857  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
858  ScoreBrackets->getScoreUB(VM_CNT));
859  }
860  }
861  break;
862 
863  case SCMEM_GDS:
864  if (group_is_multi_wave || fence_is_global) {
865  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
866  ScoreBrackets->getScoreUB(EXP_CNT));
867  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
868  ScoreBrackets->getScoreUB(LGKM_CNT));
869  }
870  break;
871 
872  case SCMEM_UAV:
873  case SCMEM_TFBUF:
874  case SCMEM_RING:
875  case SCMEM_SCATTER:
876  if (group_is_multi_wave || fence_is_global) {
877  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
878  ScoreBrackets->getScoreUB(EXP_CNT));
879  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
880  ScoreBrackets->getScoreUB(VM_CNT));
881  }
882  break;
883 
884  case SCMEM_SCRATCH:
885  default:
886  break;
887  }
888  }
889  }
890 #endif
891 
892  // Export & GDS instructions do not read the EXEC mask until after the export
893  // is granted (which can occur well after the instruction is issued).
894  // The shader program must flush all EXP operations on the export-count
895  // before overwriting the EXEC mask.
896  else {
897  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
898  // Export and GDS are tracked individually, either may trigger a waitcnt
899  // for EXEC.
900  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
901  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
902  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
903  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
904  Wait.ExpCnt = 0;
905  }
906  }
907 
908 #if 0 // TODO: the following code to handle CALL.
909  // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
910  // However, there is a problem with EXP_CNT, because the call cannot
911  // easily tell if a register is used in the function, and if it did, then
912  // the referring instruction would have to have an S_WAITCNT, which is
913  // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
914  // before the call.
915  if (MI.getOpcode() == SC_CALL) {
916  if (ScoreBrackets->getScoreUB(EXP_CNT) >
917  ScoreBrackets->getScoreLB(EXP_CNT)) {
918  ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
919  EmitWaitcnt |= CNT_MASK(EXP_CNT);
920  }
921  }
922 #endif
923 
924  // FIXME: Should not be relying on memoperands.
925  // Look at the source operands of every instruction to see if
926  // any of them results from a previous memory operation that affects
927  // its current usage. If so, an s_waitcnt instruction needs to be
928  // emitted.
929  // If the source operand was defined by a load, add the s_waitcnt
930  // instruction.
931  for (const MachineMemOperand *Memop : MI.memoperands()) {
932  unsigned AS = Memop->getAddrSpace();
933  if (AS != AMDGPUAS::LOCAL_ADDRESS)
934  continue;
935  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
936  // VM_CNT is only relevant to vgpr or LDS.
937  ScoreBrackets.determineWait(
938  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
939  }
940 
941  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
942  const MachineOperand &Op = MI.getOperand(I);
943  const MachineRegisterInfo &MRIA = *MRI;
944  RegInterval Interval =
945  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
946  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
947  if (TRI->isVGPR(MRIA, Op.getReg())) {
948  // VM_CNT is only relevant to vgpr or LDS.
949  ScoreBrackets.determineWait(
950  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
951  }
952  ScoreBrackets.determineWait(
953  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
954  }
955  }
956  // End of for loop that looks at all source operands to decide vm_wait_cnt
957  // and lgk_wait_cnt.
958 
959  // Two cases are handled for destination operands:
960  // 1) If the destination operand was defined by a load, add the s_waitcnt
961  // instruction to guarantee the right WAW order.
962  // 2) If a destination operand was used by a recent export/store instruction,
963  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
964  if (MI.mayStore()) {
965  // FIXME: Should not be relying on memoperands.
966  for (const MachineMemOperand *Memop : MI.memoperands()) {
967  unsigned AS = Memop->getAddrSpace();
968  if (AS != AMDGPUAS::LOCAL_ADDRESS)
969  continue;
970  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
971  ScoreBrackets.determineWait(
972  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
973  ScoreBrackets.determineWait(
974  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
975  }
976  }
977  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
978  const MachineOperand &Def = MI.getOperand(I);
979  const MachineRegisterInfo &MRIA = *MRI;
980  RegInterval Interval =
981  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
982  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
983  if (TRI->isVGPR(MRIA, Def.getReg())) {
984  ScoreBrackets.determineWait(
985  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
986  ScoreBrackets.determineWait(
987  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
988  }
989  ScoreBrackets.determineWait(
990  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
991  }
992  } // End of for loop that looks at all dest operands.
993  }
994 
995  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
996  // occurs before the instruction. Doing it here prevents any additional
997  // S_WAITCNTs from being emitted if the instruction was marked as
998  // requiring a WAITCNT beforehand.
999  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1000  !ST->hasAutoWaitcntBeforeBarrier()) {
1001  Wait = AMDGPU::Waitcnt::allZero();
1002  }
1003 
1004  // TODO: Remove this work-around, enable the assert for Bug 457939
1005  // after fixing the scheduler. Also, the Shader Compiler code is
1006  // independent of target.
1007  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
1008  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1009  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1010  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1011  Wait.LgkmCnt = 0;
1012  }
1013  }
1014 
1015  // Early-out if no wait is indicated.
1016  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1017  bool Modified = false;
1018  if (OldWaitcntInstr) {
1019  if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1020  TrackedWaitcntSet.erase(OldWaitcntInstr);
1021  OldWaitcntInstr->eraseFromParent();
1022  Modified = true;
1023  } else {
1024  int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1025  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1026  }
1027  Modified = true;
1028  }
1029  return Modified;
1030  }
1031 
1032  if (ForceEmitZeroWaitcnts)
1033  Wait = AMDGPU::Waitcnt::allZero();
1034 
1035  if (ForceEmitWaitcnt[VM_CNT])
1036  Wait.VmCnt = 0;
1037  if (ForceEmitWaitcnt[EXP_CNT])
1038  Wait.ExpCnt = 0;
1039  if (ForceEmitWaitcnt[LGKM_CNT])
1040  Wait.LgkmCnt = 0;
1041 
1042  ScoreBrackets.applyWaitcnt(Wait);
1043 
1044  AMDGPU::Waitcnt OldWait;
1045  if (OldWaitcntInstr) {
1046  OldWait =
1047  AMDGPU::decodeWaitcnt(IV, OldWaitcntInstr->getOperand(0).getImm());
1048  }
1049  if (OldWait.dominates(Wait))
1050  return false;
1051 
1052  if (OldWaitcntInstr && !TrackedWaitcntSet.count(OldWaitcntInstr))
1053  Wait = Wait.combined(OldWait);
1054 
1055  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1056  if (OldWaitcntInstr) {
1057  OldWaitcntInstr->getOperand(0).setImm(Enc);
1058 
1059  LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1060  << "Old Instr: " << MI << '\n'
1061  << "New Instr: " << *OldWaitcntInstr << '\n');
1062  } else {
1063  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1064  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1065  .addImm(Enc);
1066  TrackedWaitcntSet.insert(SWaitInst);
1067 
1068  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1069  << "Old Instr: " << MI << '\n'
1070  << "New Instr: " << *SWaitInst << '\n');
1071  }
1072 
1073  return true;
1074 }
1075 
1076 // This is a flat memory operation. Check to see if it has memory
1077 // tokens for both LDS and Memory, and if so mark it as a flat.
1078 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1079  if (MI.memoperands_empty())
1080  return true;
1081 
1082  for (const MachineMemOperand *Memop : MI.memoperands()) {
1083  unsigned AS = Memop->getAddrSpace();
1084  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1085  return true;
1086  }
1087 
1088  return false;
1089 }
1090 
1091 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1092  WaitcntBrackets *ScoreBrackets) {
1093  // Now look at the instruction opcode. If it is a memory access
1094  // instruction, update the upper-bound of the appropriate counter's
1095  // bracket and the destination operand scores.
1096  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1097  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1098  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1099  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1100  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1101  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1102  } else {
1103  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1104  }
1105  } else if (TII->isFLAT(Inst)) {
1106  assert(Inst.mayLoad() || Inst.mayStore());
1107 
1108  if (TII->usesVM_CNT(Inst))
1109  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1110 
1111  if (TII->usesLGKM_CNT(Inst)) {
1112  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1113 
1114  // This is a flat memory operation, so note it - it will require
1115  // that both the VM and LGKM be flushed to zero if it is pending when
1116  // a VM or LGKM dependency occurs.
1117  if (mayAccessLDSThroughFlat(Inst))
1118  ScoreBrackets->setPendingFlat();
1119  }
1120  } else if (SIInstrInfo::isVMEM(Inst) &&
1121  // TODO: get a better carve out.
1122  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1123  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1124  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1125  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1126  if (ST->vmemWriteNeedsExpWaitcnt() &&
1127  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1128  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1129  }
1130  } else if (TII->isSMRD(Inst)) {
1131  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1132  } else {
1133  switch (Inst.getOpcode()) {
1134  case AMDGPU::S_SENDMSG:
1135  case AMDGPU::S_SENDMSGHALT:
1136  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1137  break;
1138  case AMDGPU::EXP:
1139  case AMDGPU::EXP_DONE: {
1140  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1141  if (Imm >= 32 && Imm <= 63)
1142  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1143  else if (Imm >= 12 && Imm <= 15)
1144  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1145  else
1146  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1147  break;
1148  }
1149  case AMDGPU::S_MEMTIME:
1150  case AMDGPU::S_MEMREALTIME:
1151  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1152  break;
1153  default:
1154  break;
1155  }
1156  }
1157 }
1158 
1159 bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1160  uint32_t OtherScore) {
1161  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1162  uint32_t OtherShifted =
1163  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1164  Score = std::max(MyShifted, OtherShifted);
1165  return OtherShifted > MyShifted;
1166 }
1167 
1168 /// Merge the pending events and associated score brackets of \p Other into
1169 /// this brackets status.
1170 ///
1171 /// Returns whether the merge resulted in a change that requires tighter waits
1172 /// (i.e. the merged brackets strictly dominate the original brackets).
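/// A small worked example of the score relabeling done below: if this bracket
/// has (ScoreLB = 0, ScoreUB = 2) and Other has (ScoreLB = 0, ScoreUB = 5),
/// then MyShift = 3 and OtherShift = 0, so our pending scores 1..2 are
/// relabeled to 4..5, Other's scores 1..5 keep their values, and the merged
/// bracket becomes (ScoreLB = 0, ScoreUB = 5).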
1173 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1174  bool StrictDom = false;
1175 
1176  for (auto T : inst_counter_types()) {
1177  // Merge event flags for this counter
1178  const bool OldOutOfOrder = counterOutOfOrder(T);
1179  const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1180  const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1181  if (OtherEvents & ~OldEvents)
1182  StrictDom = true;
1183  if (Other.MixedPendingEvents[T] ||
1184  (OldEvents && OtherEvents && OldEvents != OtherEvents))
1185  MixedPendingEvents[T] = true;
1186  PendingEvents |= OtherEvents;
1187 
1188  // Merge scores for this counter
1189  const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1190  const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1191  MergeInfo M;
1192  M.OldLB = ScoreLBs[T];
1193  M.OtherLB = Other.ScoreLBs[T];
1194  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1195  M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1196 
1197  const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1198  if (NewUB < ScoreUBs[T])
1199  report_fatal_error("waitcnt score overflow");
1200  ScoreUBs[T] = NewUB;
1201  ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1202 
1203  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1204 
1205  bool RegStrictDom = false;
1206  for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1207  J++) {
1208  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1209  }
1210 
1211  if (T == LGKM_CNT) {
1212  for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1213  J != E; J++) {
1214  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1215  }
1216  }
1217 
1218  if (RegStrictDom && !OldOutOfOrder)
1219  StrictDom = true;
1220  }
1221 
1222  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1223  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1224 
1225  return StrictDom;
1226 }
1227 
1228 // Generate s_waitcnt instructions where needed.
1229 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1230  MachineBasicBlock &Block,
1231  WaitcntBrackets &ScoreBrackets) {
1232  bool Modified = false;
1233 
1234  LLVM_DEBUG({
1235  dbgs() << "*** Block" << Block.getNumber() << " ***";
1236  ScoreBrackets.dump();
1237  });
1238 
1239  // Walk over the instructions.
1240  MachineInstr *OldWaitcntInstr = nullptr;
1241 
1242  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1243  Iter != E;) {
1244  MachineInstr &Inst = *Iter;
1245 
1246  // Remove any previously existing waitcnts.
1247  if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1248  if (OldWaitcntInstr) {
1249  if (TrackedWaitcntSet.count(OldWaitcntInstr)) {
1250  TrackedWaitcntSet.erase(OldWaitcntInstr);
1251  OldWaitcntInstr->eraseFromParent();
1252  OldWaitcntInstr = nullptr;
1253  } else if (!TrackedWaitcntSet.count(&Inst)) {
1254  // Two successive s_waitcnt's, both of which are pre-existing and
1255  // are therefore preserved.
1256  int64_t Imm = OldWaitcntInstr->getOperand(0).getImm();
1257  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1258  } else {
1259  ++Iter;
1260  Inst.eraseFromParent();
1261  Modified = true;
1262  continue;
1263  }
1264  }
1265 
1266  OldWaitcntInstr = &Inst;
1267  ++Iter;
1268  continue;
1269  }
1270 
1271  bool VCCZBugWorkAround = false;
1272  if (readsVCCZ(Inst) &&
1273  (!VCCZBugHandledSet.count(&Inst))) {
1274  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1275  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1276  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1277  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1278  VCCZBugWorkAround = true;
1279  }
1280  }
1281 
1282  // Generate an s_waitcnt instruction to be placed before
1283  // cur_Inst, if needed.
1284  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1285  OldWaitcntInstr = nullptr;
1286 
1287  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1288 
1289 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1290  // If this instruction generates a S_SETVSKIP because it is an
1291  // indexed resource, and we are on Tahiti, then it will also force
1292  // an S_WAITCNT vmcnt(0)
1293  if (RequireCheckResourceType(Inst, context)) {
1294  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1295  ScoreBrackets->setScoreLB(VM_CNT,
1296  ScoreBrackets->getScoreUB(VM_CNT));
1297  }
1298 #endif
1299 
1300  LLVM_DEBUG({
1301  Inst.print(dbgs());
1302  ScoreBrackets.dump();
1303  });
1304 
1305  // Check to see if this is a GWS instruction. If so, and if this is CI or
1306  // VI, then the generated code sequence will include an S_WAITCNT 0.
1307  // TODO: Are these the only GWS instructions?
1308  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1309  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1310  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1311  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1312  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1313  // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1314  ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZero());
1315  }
1316 
1317  // TODO: Remove this work-around after fixing the scheduler and enable the
1318  // assert above.
1319  if (VCCZBugWorkAround) {
1320  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1321  // bit is updated, so we can restore the bit by reading the value of
1322  // vcc and then writing it back to the register.
1323  BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1324  AMDGPU::VCC)
1325  .addReg(AMDGPU::VCC);
1326  VCCZBugHandledSet.insert(&Inst);
1327  Modified = true;
1328  }
1329 
1330  ++Iter;
1331  }
1332 
1333  return Modified;
1334 }
1335 
1336 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1337  ST = &MF.getSubtarget<GCNSubtarget>();
1338  TII = ST->getInstrInfo();
1339  TRI = &TII->getRegisterInfo();
1340  MRI = &MF.getRegInfo();
1341  IV = AMDGPU::getIsaVersion(ST->getCPU());
1342  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1343 
1344  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1345  for (auto T : inst_counter_types())
1346  ForceEmitWaitcnt[T] = false;
1347 
1348  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1349  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1350  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1351 
1352  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1353  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1354  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1355  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1356 
1357  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1358  RegisterEncoding.VGPRL =
1359  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1360  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1361  RegisterEncoding.SGPRL =
1362  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1363 
1364  TrackedWaitcntSet.clear();
1365  VCCZBugHandledSet.clear();
1366  RpotIdxMap.clear();
1367  BlockInfos.clear();
1368 
1369  // Keep iterating over the blocks in reverse post order, inserting and
1370  // updating s_waitcnt where needed, until a fix point is reached.
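 // For example, the bottom block of a loop can feed new pending events back
 // to the loop header; the header has a smaller reverse post-order index, so
 // a merge that changes its incoming state marks it dirty and sets Repeat,
 // forcing another sweep over the blocks.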
1371  for (MachineBasicBlock *MBB :
1372  ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1373  RpotIdxMap[MBB] = BlockInfos.size();
1374  BlockInfos.emplace_back(MBB);
1375  }
1376 
1377  std::unique_ptr<WaitcntBrackets> Brackets;
1378  bool Modified = false;
1379  bool Repeat;
1380  do {
1381  Repeat = false;
1382 
1383  for (BlockInfo &BI : BlockInfos) {
1384  if (!BI.Dirty)
1385  continue;
1386 
1387  unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1388 
1389  if (BI.Incoming) {
1390  if (!Brackets)
1391  Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1392  else
1393  *Brackets = *BI.Incoming;
1394  } else {
1395  if (!Brackets)
1396  Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1397  else
1398  Brackets->clear();
1399  }
1400 
1401  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1402  BI.Dirty = false;
1403 
1404  if (Brackets->hasPending()) {
1405  BlockInfo *MoveBracketsToSucc = nullptr;
1406  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1407  unsigned SuccIdx = RpotIdxMap[Succ];
1408  BlockInfo &SuccBI = BlockInfos[SuccIdx];
1409  if (!SuccBI.Incoming) {
1410  SuccBI.Dirty = true;
1411  if (SuccIdx <= Idx)
1412  Repeat = true;
1413  if (!MoveBracketsToSucc) {
1414  MoveBracketsToSucc = &SuccBI;
1415  } else {
1416  SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1417  }
1418  } else if (SuccBI.Incoming->merge(*Brackets)) {
1419  SuccBI.Dirty = true;
1420  if (SuccIdx <= Idx)
1421  Repeat = true;
1422  }
1423  }
1424  if (MoveBracketsToSucc)
1425  MoveBracketsToSucc->Incoming = std::move(Brackets);
1426  }
1427  }
1428  } while (Repeat);
1429 
1430  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1431 
1432  bool HaveScalarStores = false;
1433 
1434  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1435  ++BI) {
1436  MachineBasicBlock &MBB = *BI;
1437 
1438  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1439  ++I) {
1440  if (!HaveScalarStores && TII->isScalarStore(*I))
1441  HaveScalarStores = true;
1442 
1443  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1444  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1445  EndPgmBlocks.push_back(&MBB);
1446  }
1447  }
1448 
1449  if (HaveScalarStores) {
1450  // If scalar writes are used, the cache must be flushed or else the next
1451  // wave to reuse the same scratch memory can be clobbered.
1452  //
1453  // Insert s_dcache_wb at wave termination points if there were any scalar
1454  // stores, and only if the cache hasn't already been flushed. This could be
1455  // improved by looking across blocks for flushes in postdominating blocks
1456  // from the stores but an explicitly requested flush is probably very rare.
1457  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1458  bool SeenDCacheWB = false;
1459 
1460  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1461  ++I) {
1462  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1463  SeenDCacheWB = true;
1464  else if (TII->isScalarStore(*I))
1465  SeenDCacheWB = false;
1466 
1467  // FIXME: It would be better to insert this before a waitcnt if any.
1468  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1469  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1470  !SeenDCacheWB) {
1471  Modified = true;
1472  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1473  }
1474  }
1475  }
1476  }
1477 
1478  if (!MFI->isEntryFunction()) {
1479  // Wait for any outstanding memory operations that the input registers may
1480  // depend on. We can't track them and it's better to the wait after the
1481  // costly call sequence.
1482 
1483  // TODO: Could insert earlier and schedule more liberally with operations
1484  // that only use caller preserved registers.
1485  MachineBasicBlock &EntryBB = MF.front();
1486  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1487  .addImm(0);
1488 
1489  Modified = true;
1490  }
1491 
1492  return Modified;
1493 }