1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
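//
// Illustrative example (hypothetical instruction sequence, not taken from a
// real shader): given
//
//   buffer_load_dword v0, ...      ; issued asynchronously, counted by vmcnt
//   v_add_f32 v1, v0, v0           ; reads the loaded value
//
// the pass inserts "s_waitcnt vmcnt(0)" before the v_add_f32 so that v0
// already holds the loaded data when it is read.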
25 
26 #include "AMDGPU.h"
27 #include "AMDGPUSubtarget.h"
28 #include "SIDefines.h"
29 #include "SIInstrInfo.h"
30 #include "SIMachineFunctionInfo.h"
31 #include "SIRegisterInfo.h"
32 #include "Utils/AMDGPUBaseInfo.h"
33 #include "llvm/ADT/DenseMap.h"
34 #include "llvm/ADT/DenseSet.h"
36 #include "llvm/ADT/STLExtras.h"
37 #include "llvm/ADT/SmallVector.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
52 #include <algorithm>
53 #include <cassert>
54 #include <cstdint>
55 #include <cstring>
56 #include <memory>
57 #include <utility>
58 #include <vector>
59 
60 using namespace llvm;
61 
62 #define DEBUG_TYPE "si-insert-waitcnts"
63 
64 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
65  "Force emit s_waitcnt expcnt(0) instrs");
66 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
67  "Force emit s_waitcnt lgkmcnt(0) instrs");
68 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
69  "Force emit s_waitcnt vmcnt(0) instrs");
70 
71 static cl::opt<bool> ForceEmitZeroFlag(
72  "amdgpu-waitcnt-forcezero",
73  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
74  cl::init(false), cl::Hidden);
75 
76 namespace {
77 
78 template <typename EnumT>
79 class enum_iterator
80  : public iterator_facade_base<enum_iterator<EnumT>,
81  std::forward_iterator_tag, const EnumT> {
82  EnumT Value;
83 public:
84  enum_iterator() = default;
85  enum_iterator(EnumT Value) : Value(Value) {}
86 
87  enum_iterator &operator++() {
88  Value = static_cast<EnumT>(Value + 1);
89  return *this;
90  }
91 
92  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
93 
94  EnumT operator*() const { return Value; }
95 };
96 
97 // Class of object that encapsulates the latest instruction counter score
98 // associated with an operand. Used for determining whether an
99 // s_waitcnt instruction needs to be emitted.
100 
101 #define CNT_MASK(t) (1u << (t))
102 
103 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
104 
105 iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
106  return make_range(enum_iterator<InstCounterType>(VM_CNT),
107  enum_iterator<InstCounterType>(NUM_INST_CNTS));
108 }
109 
110 using RegInterval = std::pair<signed, signed>;
111 
112 struct {
113  uint32_t VmcntMax;
114  uint32_t ExpcntMax;
115  uint32_t LgkmcntMax;
116  uint32_t VscntMax;
117  int32_t NumVGPRsMax;
118  int32_t NumSGPRsMax;
119 } HardwareLimits;
120 
121 struct {
122  unsigned VGPR0;
123  unsigned VGPRL;
124  unsigned SGPR0;
125  unsigned SGPRL;
126 } RegisterEncoding;
127 
128 enum WaitEventType {
129  VMEM_ACCESS, // vector-memory read & write
130  VMEM_READ_ACCESS, // vector-memory read
131  VMEM_WRITE_ACCESS,// vector-memory write
132  LDS_ACCESS, // lds read & write
133  GDS_ACCESS, // gds read & write
134  SQ_MESSAGE, // send message
135  SMEM_ACCESS, // scalar-memory read & write
136  EXP_GPR_LOCK, // export holding on its data src
137  GDS_GPR_LOCK, // GDS holding on its data and addr src
138  EXP_POS_ACCESS, // write to export position
139  EXP_PARAM_ACCESS, // write to export parameter
140  VMW_GPR_LOCK, // vector-memory write holding on its data src
141  NUM_WAIT_EVENTS,
142 };
143 
144 static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
145  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
146  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
147  (1 << SQ_MESSAGE),
148  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
149  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
150  (1 << VMEM_WRITE_ACCESS)
151 };
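// Example of how these masks are consulted (see eventCounter() below): a
// VMEM_READ_ACCESS event has its bit set in WaitEventMaskForInst[VM_CNT] and
// is therefore tracked by vmcnt, while an SMEM_ACCESS event falls under
// WaitEventMaskForInst[LGKM_CNT] and is tracked by lgkmcnt.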
152 
153 // The mapping is:
154 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
155 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
156 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
157 // We reserve a fixed number of VGPR slots in the scoring tables for
158 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
159 enum {
160  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
161  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
162  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
163  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
164  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
165 };
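// Worked example of the flat scoreboard index space defined above:
//   VGPR7          -> slot 7
//   the LDS token  -> slot SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256
//   SGPR3          -> slot NUM_ALL_VGPRS + 3 = 260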
166 
167 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
168  switch (T) {
169  case VM_CNT:
170  Wait.VmCnt = std::min(Wait.VmCnt, Count);
171  break;
172  case EXP_CNT:
173  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
174  break;
175  case LGKM_CNT:
176  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
177  break;
178  case VS_CNT:
179  Wait.VsCnt = std::min(Wait.VsCnt, Count);
180  break;
181  default:
182  llvm_unreachable("bad InstCounterType");
183  }
184 }
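// For example, addWait(Wait, VM_CNT, 2) lowers Wait.VmCnt to at most 2; taking
// the minimum keeps the strictest (smallest) count when several required waits
// are folded into a single s_waitcnt.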
185 
186 // This object maintains the current score brackets of each wait counter, and
187 // a per-register scoreboard for each wait counter.
188 //
189 // We also maintain the latest score for every event type that can change the
190 // waitcnt, in order to know whether multiple types of events are pending
191 // within the brackets. When multiple event types are pending in a bracket,
192 // the wait count may get decremented out of order, so we have to emit a
193 // conservative "s_waitcnt 0" before use.
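//
// Illustrative example: after three VMEM loads the VM_CNT bracket is
// LB = 0, UB = 3 and the destination VGPRs of the loads carry scores 1, 2
// and 3. Waiting for the first load's result then requires
// vmcnt(UB - score) = vmcnt(2), i.e. at most two loads may still be
// outstanding (see determineWait() below).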
194 class WaitcntBrackets {
195 public:
196  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
197  for (auto T : inst_counter_types())
198  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
199  }
200 
201  static uint32_t getWaitCountMax(InstCounterType T) {
202  switch (T) {
203  case VM_CNT:
204  return HardwareLimits.VmcntMax;
205  case LGKM_CNT:
206  return HardwareLimits.LgkmcntMax;
207  case EXP_CNT:
208  return HardwareLimits.ExpcntMax;
209  case VS_CNT:
210  return HardwareLimits.VscntMax;
211  default:
212  break;
213  }
214  return 0;
215  }
216 
217  uint32_t getScoreLB(InstCounterType T) const {
218  assert(T < NUM_INST_CNTS);
219  if (T >= NUM_INST_CNTS)
220  return 0;
221  return ScoreLBs[T];
222  }
223 
224  uint32_t getScoreUB(InstCounterType T) const {
225  assert(T < NUM_INST_CNTS);
226  if (T >= NUM_INST_CNTS)
227  return 0;
228  return ScoreUBs[T];
229  }
230 
231  // Mapping from event to counter.
232  InstCounterType eventCounter(WaitEventType E) {
233  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
234  return VM_CNT;
235  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
236  return LGKM_CNT;
237  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
238  return VS_CNT;
239  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
240  return EXP_CNT;
241  }
242 
243  uint32_t getRegScore(int GprNo, InstCounterType T) {
244  if (GprNo < NUM_ALL_VGPRS) {
245  return VgprScores[T][GprNo];
246  }
247  assert(T == LGKM_CNT);
248  return SgprScores[GprNo - NUM_ALL_VGPRS];
249  }
250 
251  void clear() {
252  memset(ScoreLBs, 0, sizeof(ScoreLBs));
253  memset(ScoreUBs, 0, sizeof(ScoreUBs));
254  PendingEvents = 0;
255  memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
256  for (auto T : inst_counter_types())
257  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
258  memset(SgprScores, 0, sizeof(SgprScores));
259  }
260 
261  bool merge(const WaitcntBrackets &Other);
262 
263  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
264  const MachineRegisterInfo *MRI,
265  const SIRegisterInfo *TRI, unsigned OpNo,
266  bool Def) const;
267 
268  int32_t getMaxVGPR() const { return VgprUB; }
269  int32_t getMaxSGPR() const { return SgprUB; }
270 
271  bool counterOutOfOrder(InstCounterType T) const;
272  bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
273  bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
274  void determineWait(InstCounterType T, uint32_t ScoreToWait,
275  AMDGPU::Waitcnt &Wait) const;
276  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
277  void applyWaitcnt(InstCounterType T, unsigned Count);
278  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
279  const MachineRegisterInfo *MRI, WaitEventType E,
280  MachineInstr &MI);
281 
282  bool hasPending() const { return PendingEvents != 0; }
283  bool hasPendingEvent(WaitEventType E) const {
284  return PendingEvents & (1 << E);
285  }
286 
287  bool hasPendingFlat() const {
288  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
289  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
290  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
291  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
292  }
293 
294  void setPendingFlat() {
295  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
296  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
297  }
298 
299  void print(raw_ostream &);
300  void dump() { print(dbgs()); }
301 
302 private:
303  struct MergeInfo {
304  uint32_t OldLB;
305  uint32_t OtherLB;
306  uint32_t MyShift;
307  uint32_t OtherShift;
308  };
309  static bool mergeScore(const MergeInfo &M, uint32_t &Score,
310  uint32_t OtherScore);
311 
312  void setScoreLB(InstCounterType T, uint32_t Val) {
313  assert(T < NUM_INST_CNTS);
314  if (T >= NUM_INST_CNTS)
315  return;
316  ScoreLBs[T] = Val;
317  }
318 
319  void setScoreUB(InstCounterType T, uint32_t Val) {
320  assert(T < NUM_INST_CNTS);
321  if (T >= NUM_INST_CNTS)
322  return;
323  ScoreUBs[T] = Val;
324  if (T == EXP_CNT) {
325  uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
326  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
327  ScoreLBs[T] = UB;
328  }
329  }
330 
331  void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
332  if (GprNo < NUM_ALL_VGPRS) {
333  if (GprNo > VgprUB) {
334  VgprUB = GprNo;
335  }
336  VgprScores[T][GprNo] = Val;
337  } else {
338  assert(T == LGKM_CNT);
339  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
340  SgprUB = GprNo - NUM_ALL_VGPRS;
341  }
342  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
343  }
344  }
345 
346  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
347  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
348  unsigned OpNo, uint32_t Val);
349 
350  const GCNSubtarget *ST = nullptr;
351  uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
352  uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
353  uint32_t PendingEvents = 0;
354  bool MixedPendingEvents[NUM_INST_CNTS] = {false};
355  // Remember the last flat memory operation.
356  uint32_t LastFlat[NUM_INST_CNTS] = {0};
357  // wait_cnt scores for every vgpr.
358  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
359  int32_t VgprUB = 0;
360  int32_t SgprUB = 0;
361  uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
362  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
363  uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
364 };
365 
366 class SIInsertWaitcnts : public MachineFunctionPass {
367 private:
368  const GCNSubtarget *ST = nullptr;
369  const SIInstrInfo *TII = nullptr;
370  const SIRegisterInfo *TRI = nullptr;
371  const MachineRegisterInfo *MRI = nullptr;
372  AMDGPU::IsaVersion IV;
373 
374  DenseSet<MachineInstr *> TrackedWaitcntSet;
375  DenseSet<MachineInstr *> VCCZBugHandledSet;
376 
377  struct BlockInfo {
378  MachineBasicBlock *MBB;
379  std::unique_ptr<WaitcntBrackets> Incoming;
380  bool Dirty = true;
381 
382  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
383  };
384 
385  std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
386  DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
387 
388  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
389  // because of amdgpu-waitcnt-forcezero flag
390  bool ForceEmitZeroWaitcnts;
391  bool ForceEmitWaitcnt[NUM_INST_CNTS];
392 
393 public:
394  static char ID;
395 
396  SIInsertWaitcnts() : MachineFunctionPass(ID) {
397  (void)ForceExpCounter;
398  (void)ForceLgkmCounter;
399  (void)ForceVMCounter;
400  }
401 
402  bool runOnMachineFunction(MachineFunction &MF) override;
403 
404  StringRef getPassName() const override {
405  return "SI insert wait instructions";
406  }
407 
408  void getAnalysisUsage(AnalysisUsage &AU) const override {
409  AU.setPreservesCFG();
410  MachineFunctionPass::getAnalysisUsage(AU);
411  }
412 
413  bool isForceEmitWaitcnt() const {
414  for (auto T : inst_counter_types())
415  if (ForceEmitWaitcnt[T])
416  return true;
417  return false;
418  }
419 
420  void setForceEmitWaitcnt() {
421 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
422 // For debug builds, get the debug counter info and adjust if need be
423 #ifndef NDEBUG
424  if (DebugCounter::isCounterSet(ForceExpCounter) &&
425  DebugCounter::shouldExecute(ForceExpCounter)) {
426  ForceEmitWaitcnt[EXP_CNT] = true;
427  } else {
428  ForceEmitWaitcnt[EXP_CNT] = false;
429  }
430 
431  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
432  DebugCounter::shouldExecute(ForceLgkmCounter)) {
433  ForceEmitWaitcnt[LGKM_CNT] = true;
434  } else {
435  ForceEmitWaitcnt[LGKM_CNT] = false;
436  }
437 
438  if (DebugCounter::isCounterSet(ForceVMCounter) &&
439  DebugCounter::shouldExecute(ForceVMCounter)) {
440  ForceEmitWaitcnt[VM_CNT] = true;
441  } else {
442  ForceEmitWaitcnt[VM_CNT] = false;
443  }
444 #endif // NDEBUG
445  }
446 
447  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
448  bool generateWaitcntInstBefore(MachineInstr &MI,
449  WaitcntBrackets &ScoreBrackets,
450  MachineInstr *OldWaitcntInstr);
451  void updateEventWaitcntAfter(MachineInstr &Inst,
452  WaitcntBrackets *ScoreBrackets);
453  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
454  WaitcntBrackets &ScoreBrackets);
455 };
456 
457 } // end anonymous namespace
458 
459 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
460  const SIInstrInfo *TII,
461  const MachineRegisterInfo *MRI,
462  const SIRegisterInfo *TRI,
463  unsigned OpNo, bool Def) const {
464  const MachineOperand &Op = MI->getOperand(OpNo);
465  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
466  (Def && !Op.isDef()))
467  return {-1, -1};
468 
469  // A use via a partial-write (PW) operand does not need a waitcnt.
470  // A partial write is not a WAW hazard.
471  assert(!Op.getSubReg() || !Op.isUndef());
472 
473  RegInterval Result;
474  const MachineRegisterInfo &MRIA = *MRI;
475 
476  unsigned Reg = TRI->getEncodingValue(Op.getReg());
477 
478  if (TRI->isVGPR(MRIA, Op.getReg())) {
479  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
480  Result.first = Reg - RegisterEncoding.VGPR0;
481  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
482  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
483  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
484  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
485  assert(Result.first >= NUM_ALL_VGPRS &&
486  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
487  }
488  // TODO: Handle TTMP
489  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
490  else
491  return {-1, -1};
492 
493  const MachineInstr &MIA = *MI;
494  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
495  unsigned Size = TRI->getRegSizeInBits(*RC);
496  Result.second = Result.first + (Size / 32);
497 
498  return Result;
499 }
500 
501 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
502  const SIInstrInfo *TII,
503  const SIRegisterInfo *TRI,
504  const MachineRegisterInfo *MRI, unsigned OpNo,
505  uint32_t Val) {
506  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
507  LLVM_DEBUG({
508  const MachineOperand &Opnd = MI->getOperand(OpNo);
509  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
510  });
511  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
512  setRegScore(RegNo, EXP_CNT, Val);
513  }
514 }
515 
516 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
517  const SIRegisterInfo *TRI,
518  const MachineRegisterInfo *MRI,
519  WaitEventType E, MachineInstr &Inst) {
520  const MachineRegisterInfo &MRIA = *MRI;
521  InstCounterType T = eventCounter(E);
522  uint32_t CurrScore = getScoreUB(T) + 1;
523  if (CurrScore == 0)
524  report_fatal_error("InsertWaitcnt score wraparound");
525  // PendingEvents and ScoreUB need to be updated regardless of whether this
526  // event changes the score of a register or not. Examples include vm_cnt for
527  // a buffer store or lgkm_cnt for a send-message.
528  if (!hasPendingEvent(E)) {
529  if (PendingEvents & WaitEventMaskForInst[T])
530  MixedPendingEvents[T] = true;
531  PendingEvents |= 1 << E;
532  }
533  setScoreUB(T, CurrScore);
534 
535  if (T == EXP_CNT) {
536  // Put score on the source vgprs. If this is a store, just use those
537  // specific register(s).
538  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
539  int AddrOpIdx =
540  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
541  // All GDS operations must protect their address register (same as
542  // export.)
543  if (AddrOpIdx != -1) {
544  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
545  } else {
546  assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
547  Inst.getOpcode() == AMDGPU::DS_CONSUME ||
548  Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
549  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
550  }
551 
552  if (Inst.mayStore()) {
553  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
554  AMDGPU::OpName::data0) != -1) {
555  setExpScore(
556  &Inst, TII, TRI, MRI,
557  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
558  CurrScore);
559  }
560  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
561  AMDGPU::OpName::data1) != -1) {
562  setExpScore(&Inst, TII, TRI, MRI,
563  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
564  AMDGPU::OpName::data1),
565  CurrScore);
566  }
567  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
568  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
569  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
570  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
571  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
572  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
573  Inst.getOpcode() != AMDGPU::DS_APPEND &&
574  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
575  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
576  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
577  const MachineOperand &Op = Inst.getOperand(I);
578  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
579  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
580  }
581  }
582  }
583  } else if (TII->isFLAT(Inst)) {
584  if (Inst.mayStore()) {
585  setExpScore(
586  &Inst, TII, TRI, MRI,
587  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
588  CurrScore);
589  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
590  setExpScore(
591  &Inst, TII, TRI, MRI,
592  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
593  CurrScore);
594  }
595  } else if (TII->isMIMG(Inst)) {
596  if (Inst.mayStore()) {
597  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
598  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
599  setExpScore(
600  &Inst, TII, TRI, MRI,
601  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
602  CurrScore);
603  }
604  } else if (TII->isMTBUF(Inst)) {
605  if (Inst.mayStore()) {
606  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
607  }
608  } else if (TII->isMUBUF(Inst)) {
609  if (Inst.mayStore()) {
610  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
611  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
612  setExpScore(
613  &Inst, TII, TRI, MRI,
614  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
615  CurrScore);
616  }
617  } else {
618  if (TII->isEXP(Inst)) {
619  // For export the destination registers are really temps that
620  // can be used as the actual source after export patching, so
621  // we need to treat them like sources and set the EXP_CNT
622  // score.
623  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
624  MachineOperand &DefMO = Inst.getOperand(I);
625  if (DefMO.isReg() && DefMO.isDef() &&
626  TRI->isVGPR(MRIA, DefMO.getReg())) {
627  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
628  CurrScore);
629  }
630  }
631  }
632  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
633  MachineOperand &MO = Inst.getOperand(I);
634  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
635  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
636  }
637  }
638  }
639 #if 0 // TODO: check if this is handled by MUBUF code above.
640  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
641  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
642  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
643  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
644  unsigned OpNo;//TODO: find the OpNo for this operand;
645  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
646  for (signed RegNo = Interval.first; RegNo < Interval.second;
647  ++RegNo) {
648  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
649  }
650 #endif
651  } else {
652  // Match the score to the destination registers.
653  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
654  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
655  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
656  continue;
657  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
658  setRegScore(RegNo, T, CurrScore);
659  }
660  }
661  if (TII->isDS(Inst) && Inst.mayStore()) {
662  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
663  }
664  }
665 }
666 
667 void WaitcntBrackets::print(raw_ostream &OS) {
668  OS << '\n';
669  for (auto T : inst_counter_types()) {
670  uint32_t LB = getScoreLB(T);
671  uint32_t UB = getScoreUB(T);
672 
673  switch (T) {
674  case VM_CNT:
675  OS << " VM_CNT(" << UB - LB << "): ";
676  break;
677  case LGKM_CNT:
678  OS << " LGKM_CNT(" << UB - LB << "): ";
679  break;
680  case EXP_CNT:
681  OS << " EXP_CNT(" << UB - LB << "): ";
682  break;
683  case VS_CNT:
684  OS << " VS_CNT(" << UB - LB << "): ";
685  break;
686  default:
687  OS << " UNKNOWN(" << UB - LB << "): ";
688  break;
689  }
690 
691  if (LB < UB) {
692  // Print vgpr scores.
693  for (int J = 0; J <= getMaxVGPR(); J++) {
694  uint32_t RegScore = getRegScore(J, T);
695  if (RegScore <= LB)
696  continue;
697  uint32_t RelScore = RegScore - LB - 1;
698  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
699  OS << RelScore << ":v" << J << " ";
700  } else {
701  OS << RelScore << ":ds ";
702  }
703  }
704  // Also need to print sgpr scores for lgkm_cnt.
705  if (T == LGKM_CNT) {
706  for (int J = 0; J <= getMaxSGPR(); J++) {
707  uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
708  if (RegScore <= LB)
709  continue;
710  uint32_t RelScore = RegScore - LB - 1;
711  OS << RelScore << ":s" << J << " ";
712  }
713  }
714  }
715  OS << '\n';
716  }
717  OS << '\n';
718 }
719 
720 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
721 /// whether a waitcnt instruction is needed at all.
722 bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
723  return simplifyWaitcnt(VM_CNT, Wait.VmCnt) |
724  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt) |
725  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt) |
726  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
727 }
728 
729 bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
730  unsigned &Count) const {
731  const uint32_t LB = getScoreLB(T);
732  const uint32_t UB = getScoreUB(T);
733  if (Count < UB && UB - Count > LB)
734  return true;
735 
736  Count = ~0u;
737  return false;
738 }
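// Worked example for simplifyWaitcnt: with LB = 3 and UB = 7, a requested
// count of 2 is kept (7 - 2 = 5 > 3, so the operations the caller cares about
// may still be outstanding), whereas a requested count of 5 is already
// guaranteed by the bracket (7 - 5 = 2 <= 3) and is reset to ~0u, i.e. "no
// wait needed".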
739 
740 void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
741  AMDGPU::Waitcnt &Wait) const {
742  // If the score of src_operand falls within the bracket, we need an
743  // s_waitcnt instruction.
744  const uint32_t LB = getScoreLB(T);
745  const uint32_t UB = getScoreUB(T);
746  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
747  if ((T == VM_CNT || T == LGKM_CNT) &&
748  hasPendingFlat() &&
749  !ST->hasFlatLgkmVMemCountInOrder()) {
750  // If there is a pending FLAT operation, and this is a VMem or LGKM
751  // waitcnt and the target can report early completion, then we need
752  // to force a waitcnt 0.
753  addWait(Wait, T, 0);
754  } else if (counterOutOfOrder(T)) {
755  // The counter can get decremented out of order when there are multiple
756  // event types in the bracket, so emit an s_waitcnt with a conservative
757  // value of 0 for this counter.
758  addWait(Wait, T, 0);
759  } else {
760  addWait(Wait, T, UB - ScoreToWait);
761  }
762  }
763 }
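// Worked example for determineWait: with LB = 4, UB = 10 and ScoreToWait = 7,
// the score lies inside the bracket, so (absent a pending FLAT operation and
// out-of-order events) the emitted wait is UB - ScoreToWait = 3, allowing at
// most three younger operations of this type to remain outstanding.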
764 
765 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
766  applyWaitcnt(VM_CNT, Wait.VmCnt);
767  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
768  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
769  applyWaitcnt(VS_CNT, Wait.VsCnt);
770 }
771 
772 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
773  const uint32_t UB = getScoreUB(T);
774  if (Count >= UB)
775  return;
776  if (Count != 0) {
777  if (counterOutOfOrder(T))
778  return;
779  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
780  } else {
781  setScoreLB(T, UB);
782  MixedPendingEvents[T] = false;
783  PendingEvents &= ~WaitEventMaskForInst[T];
784  }
785 }
786 
787 // Where multiple event types are in the bracket of a counter,
788 // the decrement may go out of order.
789 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
790  // Scalar memory read always can go out of order.
791  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
792  return true;
793  return MixedPendingEvents[T];
794 }
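// For example, if both an SMEM load and a DS read are pending, lgkmcnt may
// decrement in either order, so the only safe wait for a particular score is
// the conservative lgkmcnt(0).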
795 
796 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
797  false)
798 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
799  false)
800 
801 char SIInsertWaitcnts::ID = 0;
802 
803 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
804 
805 FunctionPass *llvm::createSIInsertWaitcntsPass() {
806  return new SIInsertWaitcnts();
807 }
808 
809 static bool readsVCCZ(const MachineInstr &MI) {
810  unsigned Opc = MI.getOpcode();
811  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
812  !MI.getOperand(1).isUndef();
813 }
814 
815 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
816 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
817  // Currently all conventions wait, but this may not always be the case.
818  //
819  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
820  // sense to omit the wait and do it in the caller.
821  return true;
822 }
823 
824 /// \returns true if the callee is expected to wait for any outstanding waits
825 /// before returning.
826 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
827  return true;
828 }
829 
830 /// Generate s_waitcnt instruction to be placed before cur_Inst.
831 /// Instructions of a given type are returned in order,
832 /// but instructions of different types can complete out of order.
833 /// We rely on this in-order completion
834 /// and simply assign a score to the memory access instructions.
835 /// We keep track of the active "score bracket" to determine
836 /// if an access of a memory read requires an s_waitcnt
837 /// and if so what the value of each counter is.
838 /// The "score bracket" is bound by the lower bound and upper bound
839 /// scores (*_score_LB and *_score_ub respectively).
840 bool SIInsertWaitcnts::generateWaitcntInstBefore(
841  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
842  MachineInstr *OldWaitcntInstr) {
843  setForceEmitWaitcnt();
844  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
845 
846  if (MI.isDebugInstr())
847  return false;
848 
849  AMDGPU::Waitcnt Wait;
850 
851  // See if this instruction has a forced S_WAITCNT VM.
852  // TODO: Handle other cases of NeedsWaitcntVmBefore()
853  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
854  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
855  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
856  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
857  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
858  Wait.VmCnt = 0;
859  }
860 
861  // All waits must be resolved at call return.
862  // NOTE: this could be improved with knowledge of all call sites or
863  // with knowledge of the called routines.
864  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
865  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
866  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
867  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
868  }
869  // Resolve vm waits before gs-done.
870  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
871  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
872  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
873  AMDGPU::SendMsg::ID_GS_DONE)) {
874  Wait.VmCnt = 0;
875  }
876 #if 0 // TODO: the following blocks of logic when we have fence.
877  else if (MI.getOpcode() == SC_FENCE) {
878  const unsigned int group_size =
879  context->shader_info->GetMaxThreadGroupSize();
880  // group_size == 0 means thread group size is unknown at compile time
881  const bool group_is_multi_wave =
882  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
883  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
884 
885  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
886  SCRegType src_type = Inst->GetSrcType(i);
887  switch (src_type) {
888  case SCMEM_LDS:
889  if (group_is_multi_wave ||
890  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
891  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
892  ScoreBrackets->getScoreUB(LGKM_CNT));
893  // LDS may have to wait for VM_CNT after buffer load to LDS
894  if (target_info->HasBufferLoadToLDS()) {
895  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
896  ScoreBrackets->getScoreUB(VM_CNT));
897  }
898  }
899  break;
900 
901  case SCMEM_GDS:
902  if (group_is_multi_wave || fence_is_global) {
903  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
904  ScoreBrackets->getScoreUB(EXP_CNT));
905  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
906  ScoreBrackets->getScoreUB(LGKM_CNT));
907  }
908  break;
909 
910  case SCMEM_UAV:
911  case SCMEM_TFBUF:
912  case SCMEM_RING:
913  case SCMEM_SCATTER:
914  if (group_is_multi_wave || fence_is_global) {
915  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
916  ScoreBrackets->getScoreUB(EXP_CNT));
917  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
918  ScoreBrackets->getScoreUB(VM_CNT));
919  }
920  break;
921 
922  case SCMEM_SCRATCH:
923  default:
924  break;
925  }
926  }
927  }
928 #endif
929 
930  // Export & GDS instructions do not read the EXEC mask until after the export
931  // is granted (which can occur well after the instruction is issued).
932  // The shader program must flush all EXP operations on the export-count
933  // before overwriting the EXEC mask.
934  else {
935  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
936  // Export and GDS are tracked individually, either may trigger a waitcnt
937  // for EXEC.
938  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
939  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
940  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
941  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
942  Wait.ExpCnt = 0;
943  }
944  }
945 
946  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
947  // Don't bother waiting on anything except the call address. The function
948  // is going to insert a wait on everything in its prolog. This still needs
949  // to be careful if the call target is a load (e.g. a GOT load).
950  Wait = AMDGPU::Waitcnt();
951 
952  int CallAddrOpIdx =
953  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
954  RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI,
955  CallAddrOpIdx, false);
956  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
957  ScoreBrackets.determineWait(
958  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
959  }
960  } else {
961  // FIXME: Should not be relying on memoperands.
962  // Look at the source operands of every instruction to see if
963  // any of them results from a previous memory operation that affects
964  // its current usage. If so, an s_waitcnt instruction needs to be
965  // emitted.
966  // If the source operand was defined by a load, add the s_waitcnt
967  // instruction.
968  for (const MachineMemOperand *Memop : MI.memoperands()) {
969  unsigned AS = Memop->getAddrSpace();
970  if (AS != AMDGPUAS::LOCAL_ADDRESS)
971  continue;
972  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
973  // VM_CNT is only relevant to vgpr or LDS.
974  ScoreBrackets.determineWait(
975  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
976  }
977 
978  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
979  const MachineOperand &Op = MI.getOperand(I);
980  const MachineRegisterInfo &MRIA = *MRI;
981  RegInterval Interval =
982  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
983  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
984  if (TRI->isVGPR(MRIA, Op.getReg())) {
985  // VM_CNT is only relevant to vgpr or LDS.
986  ScoreBrackets.determineWait(
987  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
988  }
989  ScoreBrackets.determineWait(
990  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
991  }
992  }
993  // End of for loop that looks at all source operands to decide vm_wait_cnt
994  // and lgk_wait_cnt.
995 
996  // Two cases are handled for destination operands:
997  // 1) If the destination operand was defined by a load, add the s_waitcnt
998  // instruction to guarantee the right WAW order.
999  // 2) If a destination operand was used by a recent export/store instruction,
1000  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1001  if (MI.mayStore()) {
1002  // FIXME: Should not be relying on memoperands.
1003  for (const MachineMemOperand *Memop : MI.memoperands()) {
1004  unsigned AS = Memop->getAddrSpace();
1005  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1006  continue;
1007  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1008  ScoreBrackets.determineWait(
1009  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1010  ScoreBrackets.determineWait(
1011  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1012  }
1013  }
1014  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1015  MachineOperand &Def = MI.getOperand(I);
1016  const MachineRegisterInfo &MRIA = *MRI;
1017  RegInterval Interval =
1018  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
1019  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1020  if (TRI->isVGPR(MRIA, Def.getReg())) {
1021  ScoreBrackets.determineWait(
1022  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1023  ScoreBrackets.determineWait(
1024  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1025  }
1026  ScoreBrackets.determineWait(
1027  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1028  }
1029  } // End of for loop that looks at all dest operands.
1030  }
1031  }
1032 
1033  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1034  // occurs before the instruction. Doing it here prevents any additional
1035  // S_WAITCNTs from being emitted if the instruction was marked as
1036  // requiring a WAITCNT beforehand.
1037  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1038  !ST->hasAutoWaitcntBeforeBarrier()) {
1039  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(IV));
1040  }
1041 
1042  // TODO: Remove this work-around, enable the assert for Bug 457939
1043  // after fixing the scheduler. Also, the Shader Compiler code is
1044  // independent of target.
1045  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1046  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1047  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1048  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1049  Wait.LgkmCnt = 0;
1050  }
1051  }
1052 
1053  // Early-out if no wait is indicated.
1054  if (!ScoreBrackets.simplifyWaitcnt(Wait) && !IsForceEmitWaitcnt) {
1055  bool Modified = false;
1056  if (OldWaitcntInstr) {
1057  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1058  &*II != &MI; II = NextI, ++NextI) {
1059  if (II->isDebugInstr())
1060  continue;
1061 
1062  if (TrackedWaitcntSet.count(&*II)) {
1063  TrackedWaitcntSet.erase(&*II);
1064  II->eraseFromParent();
1065  Modified = true;
1066  } else if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1067  int64_t Imm = II->getOperand(0).getImm();
1068  ScoreBrackets.applyWaitcnt(AMDGPU::decodeWaitcnt(IV, Imm));
1069  } else {
1070  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1071  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1072  ScoreBrackets.applyWaitcnt(
1073  AMDGPU::Waitcnt(0, 0, 0, II->getOperand(1).getImm()));
1074  }
1075  }
1076  }
1077  return Modified;
1078  }
1079 
1080  if (ForceEmitZeroWaitcnts)
1081  Wait = AMDGPU::Waitcnt::allZero(IV);
1082 
1083  if (ForceEmitWaitcnt[VM_CNT])
1084  Wait.VmCnt = 0;
1085  if (ForceEmitWaitcnt[EXP_CNT])
1086  Wait.ExpCnt = 0;
1087  if (ForceEmitWaitcnt[LGKM_CNT])
1088  Wait.LgkmCnt = 0;
1089  if (ForceEmitWaitcnt[VS_CNT])
1090  Wait.VsCnt = 0;
1091 
1092  ScoreBrackets.applyWaitcnt(Wait);
1093 
1094  AMDGPU::Waitcnt OldWait;
1095  bool Modified = false;
1096 
1097  if (OldWaitcntInstr) {
1098  for (auto II = OldWaitcntInstr->getIterator(), NextI = std::next(II);
1099  &*II != &MI; II = NextI, NextI++) {
1100  if (II->isDebugInstr())
1101  continue;
1102 
1103  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
1104  unsigned IEnc = II->getOperand(0).getImm();
1105  AMDGPU::Waitcnt IWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1106  OldWait = OldWait.combined(IWait);
1107  if (!TrackedWaitcntSet.count(&*II))
1108  Wait = Wait.combined(IWait);
1109  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
1110  if (IEnc != NewEnc) {
1111  II->getOperand(0).setImm(NewEnc);
1112  Modified = true;
1113  }
1114  Wait.VmCnt = ~0u;
1115  Wait.LgkmCnt = ~0u;
1116  Wait.ExpCnt = ~0u;
1117  } else {
1118  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
1119  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1120 
1121  unsigned ICnt = II->getOperand(1).getImm();
1122  OldWait.VsCnt = std::min(OldWait.VsCnt, ICnt);
1123  if (!TrackedWaitcntSet.count(&*II))
1124  Wait.VsCnt = std::min(Wait.VsCnt, ICnt);
1125  if (Wait.VsCnt != ICnt) {
1126  II->getOperand(1).setImm(Wait.VsCnt);
1127  Modified = true;
1128  }
1129  Wait.VsCnt = ~0u;
1130  }
1131 
1132  LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
1133  << "Old Instr: " << MI << '\n'
1134  << "New Instr: " << *II << '\n');
1135 
1136  if (!Wait.hasWait())
1137  return Modified;
1138  }
1139  }
1140 
1141  if (Wait.VmCnt != ~0u || Wait.LgkmCnt != ~0u || Wait.ExpCnt != ~0u) {
1142  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1143  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1144  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1145  .addImm(Enc);
1146  TrackedWaitcntSet.insert(SWaitInst);
1147  Modified = true;
1148 
1149  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1150  << "Old Instr: " << MI << '\n'
1151  << "New Instr: " << *SWaitInst << '\n');
1152  }
1153 
1154  if (Wait.VsCnt != ~0u) {
1155  assert(ST->hasVscnt());
1156 
1157  auto SWaitInst =
1158  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1159  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1160  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1161  .addImm(Wait.VsCnt);
1162  TrackedWaitcntSet.insert(SWaitInst);
1163  Modified = true;
1164 
1165  LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
1166  << "Old Instr: " << MI << '\n'
1167  << "New Instr: " << *SWaitInst << '\n');
1168  }
1169 
1170  return Modified;
1171 }
1172 
1173 // This is a flat memory operation. Check whether it has memory tokens
1174 // for LDS or the flat address space, and if so report that it may access LDS.
1175 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1176  if (MI.memoperands_empty())
1177  return true;
1178 
1179  for (const MachineMemOperand *Memop : MI.memoperands()) {
1180  unsigned AS = Memop->getAddrSpace();
1181  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1182  return true;
1183  }
1184 
1185  return false;
1186 }
1187 
1188 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1189  WaitcntBrackets *ScoreBrackets) {
1190  // Now look at the instruction opcode. If it is a memory access
1191  // instruction, update the upper-bound of the appropriate counter's
1192  // bracket and the destination operand scores.
1193  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1194  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1195  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1196  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1197  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1198  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1199  } else {
1200  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1201  }
1202  } else if (TII->isFLAT(Inst)) {
1203  assert(Inst.mayLoad() || Inst.mayStore());
1204 
1205  if (TII->usesVM_CNT(Inst)) {
1206  if (!ST->hasVscnt())
1207  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1208  else if (Inst.mayLoad() &&
1209  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1)
1210  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1211  else
1212  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1213  }
1214 
1215  if (TII->usesLGKM_CNT(Inst)) {
1216  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1217 
1218  // This is a flat memory operation, so note it - it will require
1219  // that both the VM and LGKM be flushed to zero if it is pending when
1220  // a VM or LGKM dependency occurs.
1221  if (mayAccessLDSThroughFlat(Inst))
1222  ScoreBrackets->setPendingFlat();
1223  }
1224  } else if (SIInstrInfo::isVMEM(Inst) &&
1225  // TODO: get a better carve out.
1226  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1227  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1228  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL &&
1229  Inst.getOpcode() != AMDGPU::BUFFER_GL0_INV &&
1230  Inst.getOpcode() != AMDGPU::BUFFER_GL1_INV) {
1231  if (!ST->hasVscnt())
1232  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1233  else if ((Inst.mayLoad() &&
1234  AMDGPU::getAtomicRetOp(Inst.getOpcode()) == -1) ||
1235  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1236  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1237  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1238  else if (Inst.mayStore())
1239  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1240 
1241  if (ST->vmemWriteNeedsExpWaitcnt() &&
1242  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1243  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1244  }
1245  } else if (TII->isSMRD(Inst)) {
1246  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1247  } else if (Inst.isCall()) {
1248  if (callWaitsOnFunctionReturn(Inst)) {
1249  // Act as a wait on everything
1250  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(IV));
1251  } else {
1252  // May need to wait for anything.
1253  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1254  }
1255  } else {
1256  switch (Inst.getOpcode()) {
1257  case AMDGPU::S_SENDMSG:
1258  case AMDGPU::S_SENDMSGHALT:
1259  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1260  break;
1261  case AMDGPU::EXP:
1262  case AMDGPU::EXP_DONE: {
1263  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1264  if (Imm >= 32 && Imm <= 63)
1265  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1266  else if (Imm >= 12 && Imm <= 15)
1267  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1268  else
1269  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1270  break;
1271  }
1272  case AMDGPU::S_MEMTIME:
1273  case AMDGPU::S_MEMREALTIME:
1274  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1275  break;
1276  default:
1277  break;
1278  }
1279  }
1280 }
1281 
1282 bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
1283  uint32_t OtherScore) {
1284  uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1285  uint32_t OtherShifted =
1286  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1287  Score = std::max(MyShifted, OtherShifted);
1288  return OtherShifted > MyShifted;
1289 }
1290 
1291 /// Merge the pending events and associated score brackets of \p Other into
1292 /// this bracket's status.
1293 ///
1294 /// Returns whether the merge resulted in a change that requires tighter waits
1295 /// (i.e. the merged brackets strictly dominate the original brackets).
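///
/// Worked example (hypothetical values): if this bracket has LB = 2, UB = 5
/// (3 pending ops) and \p Other has LB = 0, UB = 5 (5 pending ops), then
/// MyShift = 5 - 3 = 2 and OtherShift = 5 - 5 + 2 = 2, yielding a merged
/// bracket of LB = 2, UB = 7 that is wide enough to represent both timelines.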
1296 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1297  bool StrictDom = false;
1298 
1299  for (auto T : inst_counter_types()) {
1300  // Merge event flags for this counter
1301  const bool OldOutOfOrder = counterOutOfOrder(T);
1302  const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
1303  const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1304  if (OtherEvents & ~OldEvents)
1305  StrictDom = true;
1306  if (Other.MixedPendingEvents[T] ||
1307  (OldEvents && OtherEvents && OldEvents != OtherEvents))
1308  MixedPendingEvents[T] = true;
1309  PendingEvents |= OtherEvents;
1310 
1311  // Merge scores for this counter
1312  const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
1313  const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1314  MergeInfo M;
1315  M.OldLB = ScoreLBs[T];
1316  M.OtherLB = Other.ScoreLBs[T];
1317  M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
1318  M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
1319 
1320  const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
1321  if (NewUB < ScoreUBs[T])
1322  report_fatal_error("waitcnt score overflow");
1323  ScoreUBs[T] = NewUB;
1324  ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
1325 
1326  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1327 
1328  bool RegStrictDom = false;
1329  for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
1330  J++) {
1331  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1332  }
1333 
1334  if (T == LGKM_CNT) {
1335  for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
1336  J != E; J++) {
1337  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1338  }
1339  }
1340 
1341  if (RegStrictDom && !OldOutOfOrder)
1342  StrictDom = true;
1343  }
1344 
1345  VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
1346  SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
1347 
1348  return StrictDom;
1349 }
1350 
1351 // Generate s_waitcnt instructions where needed.
1352 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1353  MachineBasicBlock &Block,
1354  WaitcntBrackets &ScoreBrackets) {
1355  bool Modified = false;
1356 
1357  LLVM_DEBUG({
1358  dbgs() << "*** Block" << Block.getNumber() << " ***";
1359  ScoreBrackets.dump();
1360  });
1361 
1362  // Walk over the instructions.
1363  MachineInstr *OldWaitcntInstr = nullptr;
1364 
1365  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1366  Iter != E;) {
1367  MachineInstr &Inst = *Iter;
1368 
1369  // Track pre-existing waitcnts from earlier iterations.
1370  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1371  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1372  Inst.getOperand(0).isReg() &&
1373  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1374  if (!OldWaitcntInstr)
1375  OldWaitcntInstr = &Inst;
1376  ++Iter;
1377  continue;
1378  }
1379 
1380  bool VCCZBugWorkAround = false;
1381  if (readsVCCZ(Inst) &&
1382  (!VCCZBugHandledSet.count(&Inst))) {
1383  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1384  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1385  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1386  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1387  VCCZBugWorkAround = true;
1388  }
1389  }
1390 
1391  // Generate an s_waitcnt instruction to be placed before
1392  // cur_Inst, if needed.
1393  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1394  OldWaitcntInstr = nullptr;
1395 
1396  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1397 
1398 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1399  // If this instruction generates a S_SETVSKIP because it is an
1400  // indexed resource, and we are on Tahiti, then it will also force
1401  // an S_WAITCNT vmcnt(0)
1402  if (RequireCheckResourceType(Inst, context)) {
1403  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1404  ScoreBrackets->setScoreLB(VM_CNT,
1405  ScoreBrackets->getScoreUB(VM_CNT));
1406  }
1407 #endif
1408 
1409  LLVM_DEBUG({
1410  Inst.print(dbgs());
1411  ScoreBrackets.dump();
1412  });
1413 
1414  // TODO: Remove this work-around after fixing the scheduler and enable the
1415  // assert above.
1416  if (VCCZBugWorkAround) {
1417  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1418  // bit is updated, so we can restore the bit by reading the value of
1419  // vcc and then writing it back to the register.
1420  BuildMI(Block, Inst, Inst.getDebugLoc(),
1421  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1422  TRI->getVCC())
1423  .addReg(TRI->getVCC());
1424  VCCZBugHandledSet.insert(&Inst);
1425  Modified = true;
1426  }
1427 
1428  ++Iter;
1429  }
1430 
1431  return Modified;
1432 }
1433 
1434 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1435  ST = &MF.getSubtarget<GCNSubtarget>();
1436  TII = ST->getInstrInfo();
1437  TRI = &TII->getRegisterInfo();
1438  MRI = &MF.getRegInfo();
1439  IV = AMDGPU::getIsaVersion(ST->getCPU());
1440  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1441 
1442  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1443  for (auto T : inst_counter_types())
1444  ForceEmitWaitcnt[T] = false;
1445 
1446  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1447  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1448  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1449  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1450 
1451  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1452  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1453  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1454  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1455 
1456  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1457  RegisterEncoding.VGPRL =
1458  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1459  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1460  RegisterEncoding.SGPRL =
1461  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1462 
1463  TrackedWaitcntSet.clear();
1464  VCCZBugHandledSet.clear();
1465  RpotIdxMap.clear();
1466  BlockInfos.clear();
1467 
1468  // Keep iterating over the blocks in reverse post order, inserting and
1469  // updating s_waitcnt where needed, until a fix point is reached.
1470  for (MachineBasicBlock *MBB :
1471  ReversePostOrderTraversal<MachineFunction *>(&MF)) {
1472  RpotIdxMap[MBB] = BlockInfos.size();
1473  BlockInfos.emplace_back(MBB);
1474  }
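// Note: a block is re-marked Dirty whenever the brackets flowing into one of
// its successors change; if that successor appears earlier in the reverse
// post-order (a back-edge), Repeat forces another sweep until a fixed point
// is reached.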
1475 
1476  std::unique_ptr<WaitcntBrackets> Brackets;
1477  bool Modified = false;
1478  bool Repeat;
1479  do {
1480  Repeat = false;
1481 
1482  for (BlockInfo &BI : BlockInfos) {
1483  if (!BI.Dirty)
1484  continue;
1485 
1486  unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
1487 
1488  if (BI.Incoming) {
1489  if (!Brackets)
1490  Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
1491  else
1492  *Brackets = *BI.Incoming;
1493  } else {
1494  if (!Brackets)
1495  Brackets = llvm::make_unique<WaitcntBrackets>(ST);
1496  else
1497  Brackets->clear();
1498  }
1499 
1500  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1501  BI.Dirty = false;
1502 
1503  if (Brackets->hasPending()) {
1504  BlockInfo *MoveBracketsToSucc = nullptr;
1505  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1506  unsigned SuccIdx = RpotIdxMap[Succ];
1507  BlockInfo &SuccBI = BlockInfos[SuccIdx];
1508  if (!SuccBI.Incoming) {
1509  SuccBI.Dirty = true;
1510  if (SuccIdx <= Idx)
1511  Repeat = true;
1512  if (!MoveBracketsToSucc) {
1513  MoveBracketsToSucc = &SuccBI;
1514  } else {
1515  SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
1516  }
1517  } else if (SuccBI.Incoming->merge(*Brackets)) {
1518  SuccBI.Dirty = true;
1519  if (SuccIdx <= Idx)
1520  Repeat = true;
1521  }
1522  }
1523  if (MoveBracketsToSucc)
1524  MoveBracketsToSucc->Incoming = std::move(Brackets);
1525  }
1526  }
1527  } while (Repeat);
1528 
1529  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1530 
1531  bool HaveScalarStores = false;
1532 
1533  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1534  ++BI) {
1535  MachineBasicBlock &MBB = *BI;
1536 
1537  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1538  ++I) {
1539  if (!HaveScalarStores && TII->isScalarStore(*I))
1540  HaveScalarStores = true;
1541 
1542  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1543  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1544  EndPgmBlocks.push_back(&MBB);
1545  }
1546  }
1547 
1548  if (HaveScalarStores) {
1549  // If scalar writes are used, the cache must be flushed or else the next
1550  // wave to reuse the same scratch memory can be clobbered.
1551  //
1552  // Insert s_dcache_wb at wave termination points if there were any scalar
1553  // stores, and only if the cache hasn't already been flushed. This could be
1554  // improved by looking across blocks for flushes in postdominating blocks
1555  // from the stores but an explicitly requested flush is probably very rare.
1556  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1557  bool SeenDCacheWB = false;
1558 
1559  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1560  ++I) {
1561  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1562  SeenDCacheWB = true;
1563  else if (TII->isScalarStore(*I))
1564  SeenDCacheWB = false;
1565 
1566  // FIXME: It would be better to insert this before a waitcnt if any.
1567  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1568  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1569  !SeenDCacheWB) {
1570  Modified = true;
1571  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1572  }
1573  }
1574  }
1575  }
1576 
1577  if (!MFI->isEntryFunction()) {
1578  // Wait for any outstanding memory operations that the input registers may
1579  // depend on. We can't track them and it's better to do the wait after the
1580  // costly call sequence.
1581 
1582  // TODO: Could insert earlier and schedule more liberally with operations
1583  // that only use caller preserved registers.
1584  MachineBasicBlock &EntryBB = MF.front();
1585  if (ST->hasVscnt())
1586  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
1587  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1588  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1589  .addImm(0);
1590  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1591  .addImm(0);
1592 
1593  Modified = true;
1594  }
1595 
1596  return Modified;
1597 }
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Represent the analysis usage information of a pass.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:284
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
Definition: ilist_node.h:81
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:73
RegisterMapping
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
const MachineBasicBlock & front() const
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
bool isDebugInstr() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
IsaVersion getIsaVersion(StringRef GPU)
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
char & SIInsertWaitcntsID
Iterator for intrusive lists based on ilist_node.
MachineOperand class - Representation of each machine instruction operand.
This is a &#39;vector&#39; (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:837
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:301
int64_t getImm() const
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
A range adaptor for a pair of iterators.
static Waitcnt allZero(const IsaVersion &Version)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:211
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:255
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:63
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_END(SIInsertWaitcnts
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:444
#define DEBUG_TYPE
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:340
uint32_t Size
Definition: Profile.cpp:46
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:808
bool memoperands_empty() const
Return true if we don&#39;t have any memory operands which described the memory access done by this instr...
Definition: MachineInstr.h:547
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:45
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:48
unsigned getLgkmcntBitMask(const IsaVersion &Version)
ProcessInfo Wait(const ProcessInfo &PI, unsigned SecondsToWait, bool WaitUntilTerminates, std::string *ErrMsg=nullptr)
This function waits for the process specified by PI to finish.
bool operator==(uint64_t V1, const APInt &V2)
Definition: APInt.h:1966
#define LLVM_DEBUG(X)
Definition: Debug.h:122
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:415
WaitEventType
unsigned getVmcntBitMask(const IsaVersion &Version)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
const SIRegisterInfo * getRegisterInfo() const override