//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<unsigned> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(0), cl::Hidden);

namespace {

// Class of object that encapsulates the latest instruction counter score
// associated with the operand. Used for determining whether an
// s_waitcnt instruction needs to be emitted.

#define CNT_MASK(t) (1u << (t))

enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };

using RegInterval = std::pair<signed, signed>;

struct {
  int32_t VmcntMax;
  int32_t ExpcntMax;
  int32_t LgkmcntMax;
  int32_t NumVGPRsMax;
  int32_t NumSGPRsMax;
} HardwareLimits;

struct {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
} RegisterEncoding;

enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};

// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
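
// Illustrative note (not from the original source): with the mapping above,
// VGPR v17 scores at index 17, the reserved LDS slot at index
// SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 256, and SGPR s5 at index
// NUM_ALL_VGPRS + 5 = 262 in the combined scoreboard index space.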

#define ForAllWaitEventType(w)                                                 \
  for (enum WaitEventType w = (enum WaitEventType)0;                           \
       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
       (w) = (enum WaitEventType)((w) + 1))

// This is a per-basic-block object that maintains current score brackets
// of each wait counter, and a per-register scoreboard for each wait counter.
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of event happen in the bracket, the
// wait count may get decremented out of order, therefore we need to put in
// an "s_waitcnt 0" before use.
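//
// Worked example (illustrative, not part of the original source):
//   buffer_load_dword v0, ...  ; VM_CNT score(v0) = 1, bracket LB=0, UB=1
//   buffer_load_dword v1, ...  ; VM_CNT score(v1) = 2, bracket LB=0, UB=2
//   s_waitcnt vmcnt(1)         ; UB - score(v0) = 1 op may stay outstanding
//   v_add_f32 v2, v0, v0       ; safe: v0's load has completed
// A later use of v1 would then require s_waitcnt vmcnt(0).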
class BlockWaitcntBrackets {
public:
  BlockWaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
  }

  ~BlockWaitcntBrackets() = default;

  static int32_t getWaitCountMax(InstCounterType T) {
    switch (T) {
    case VM_CNT:
      return HardwareLimits.VmcntMax;
    case LGKM_CNT:
      return HardwareLimits.LgkmcntMax;
    case EXP_CNT:
      return HardwareLimits.ExpcntMax;
    default:
      break;
    }
    return 0;
  }

  void setScoreLB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, int32_t Val) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return;
    ScoreUBs[T] = Val;
    if (T == EXP_CNT) {
      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
      if (ScoreLBs[T] < UB)
        ScoreLBs[T] = UB;
    }
  }
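
  // Illustrative note (not in the original source): the EXP_CNT clamp above
  // reflects that the hardware counter saturates. E.g. with ExpcntMax = 7,
  // raising the upper bound to 10 forces the lower bound up to at least 3,
  // since no more than 7 export events can still be outstanding.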

  int32_t getScoreLB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreLBs[T];
  }

  int32_t getScoreUB(InstCounterType T) {
    assert(T < NUM_INST_CNTS);
    if (T >= NUM_INST_CNTS)
      return 0;
    return ScoreUBs[T];
  }

  // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) {
    switch (E) {
    case VMEM_ACCESS:
      return VM_CNT;
    case LDS_ACCESS:
    case GDS_ACCESS:
    case SQ_MESSAGE:
    case SMEM_ACCESS:
      return LGKM_CNT;
    case EXP_GPR_LOCK:
    case GDS_GPR_LOCK:
    case VMW_GPR_LOCK:
    case EXP_POS_ACCESS:
    case EXP_PARAM_ACCESS:
      return EXP_CNT;
    default:
      llvm_unreachable("unhandled event type");
    }
    return NUM_INST_CNTS;
  }

  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
    if (GprNo < NUM_ALL_VGPRS) {
      if (GprNo > VgprUB) {
        VgprUB = GprNo;
      }
      VgprScores[T][GprNo] = Val;
    } else {
      assert(T == LGKM_CNT);
      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
        SgprUB = GprNo - NUM_ALL_VGPRS;
      }
      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
    }
  }

  int32_t getRegScore(int GprNo, InstCounterType T) {
    if (GprNo < NUM_ALL_VGPRS) {
      return VgprScores[T][GprNo];
    }
    return SgprScores[GprNo - NUM_ALL_VGPRS];
  }

  void clear() {
    memset(ScoreLBs, 0, sizeof(ScoreLBs));
    memset(ScoreUBs, 0, sizeof(ScoreUBs));
    memset(EventUBs, 0, sizeof(EventUBs));
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      memset(VgprScores[T], 0, sizeof(VgprScores[T]));
    }
    memset(SgprScores, 0, sizeof(SgprScores));
  }

  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
                             const MachineRegisterInfo *MRI,
                             const SIRegisterInfo *TRI, unsigned OpNo,
                             bool Def) const;

  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
                   unsigned OpNo, int32_t Val);

  void setWaitAtBeginning() { WaitAtBeginning = true; }
  void clearWaitAtBeginning() { WaitAtBeginning = false; }
  bool getWaitAtBeginning() const { return WaitAtBeginning; }
  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
  int32_t getMaxVGPR() const { return VgprUB; }
  int32_t getMaxSGPR() const { return SgprUB; }

  int32_t getEventUB(enum WaitEventType W) const {
    assert(W < NUM_WAIT_EVENTS);
    return EventUBs[W];
  }

  bool counterOutOfOrder(InstCounterType T);
  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
                     const MachineRegisterInfo *MRI, WaitEventType E,
                     MachineInstr &MI);

  bool hasPendingSMEM() const {
    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
  }

  void setPendingFlat() {
    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
  }

  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }

  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }

  bool getRevisitLoop() const { return RevisitLoop; }
  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }

  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
  int32_t getPostOrder() const { return PostOrder; }

  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
  void clearWaitcnt() { Waitcnt = nullptr; }
  MachineInstr *getWaitcnt() const { return Waitcnt; }

  bool mixedExpTypes() const { return MixedExpTypes; }
  void setMixedExpTypes(bool MixedExpTypesIn) {
    MixedExpTypes = MixedExpTypesIn;
  }

  void print(raw_ostream &);
  void dump() { print(dbgs()); }

private:
  const GCNSubtarget *ST = nullptr;
  bool WaitAtBeginning = false;
  bool RevisitLoop = false;
  bool MixedExpTypes = false;
  int32_t PostOrder = 0;
  MachineInstr *Waitcnt = nullptr;
  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
  // Remember the last flat memory operation.
  int32_t LastFlat[NUM_INST_CNTS] = {0};
  // wait_cnt scores for every vgpr.
  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
  int32_t VgprUB = 0;
  int32_t SgprUB = 0;
  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
};
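
// Usage sketch (illustrative, not from the original source): the pass keeps
// one BlockWaitcntBrackets per MachineBasicBlock. Events recorded while
// walking a block raise the upper bounds via updateByEvent(); uses and defs
// query updateByWait(), which raises the lower bound and reports which
// counters an s_waitcnt must name.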

// This is a per-loop-region object that records waitcnt status at the end of
// the loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting an
// s_waitcnt 0 at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  unsigned getIterCnt() { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  void print() { LLVM_DEBUG(dbgs() << "  iteration " << IterCnt << '\n';); }

private:
  // The s_waitcnt added at the end of the loop footer to stabilize the wait
  // scores at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of times the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};

class SIInsertWaitcnts : public MachineFunctionPass {
private:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;
  const MachineLoopInfo *MLI = nullptr;
  AMDGPU::IsaInfo::IsaVersion IV;
  AMDGPUAS AMDGPUASI;

  DenseSet<MachineBasicBlock *> BlockVisitedSet;
  DenseSet<MachineInstr *> TrackedWaitcntSet;
  DenseSet<MachineInstr *> VCCZBugHandledSet;

  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
      BlockWaitcntBracketsMap;

  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;

  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;

  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;

  // ForceEmitZeroWaitcnts: force all waitcnt instrs to be emitted as
  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
  bool ForceEmitZeroWaitcnts;
  bool ForceEmitWaitcnt[NUM_INST_CNTS];

public:
  static char ID;

  SIInsertWaitcnts() : MachineFunctionPass(ID) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
    // The waitcnt information is copied because it changes as the block is
    // traversed.
    KillWaitBrackets.push_back(
        llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
  }

  bool isForceEmitWaitcnt() const {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1))
      if (ForceEmitWaitcnt[T])
        return true;
    return false;
  }

  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, get the debug counter info and adjust if need be.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[LGKM_CNT] = true;
    } else {
      ForceEmitWaitcnt[LGKM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[VM_CNT] = true;
    } else {
      ForceEmitWaitcnt[VM_CNT] = false;
    }
#endif // NDEBUG
  }

  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
  void generateWaitcntInstBefore(MachineInstr &MI,
                                 BlockWaitcntBrackets *ScoreBrackets);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               BlockWaitcntBrackets *ScoreBrackets);
  void mergeInputScoreBrackets(MachineBasicBlock &Block);
  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
  unsigned countNumBottomBlocks(const MachineLoop *Loop);
  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
  bool isWaitcntStronger(unsigned LHS, unsigned RHS);
  unsigned combineWaitcnt(unsigned LHS, unsigned RHS);
};

} // end anonymous namespace

RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                                 const SIInstrInfo *TII,
                                                 const MachineRegisterInfo *MRI,
                                                 const SIRegisterInfo *TRI,
                                                 unsigned OpNo,
                                                 bool Def) const {
  const MachineOperand &Op = MI->getOperand(OpNo);
  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
      (Def && !Op.isDef()))
    return {-1, -1};

  // A use via a PW operand does not need a waitcnt.
  // A partial write is not a WAW.
  assert(!Op.getSubReg() || !Op.isUndef());

  RegInterval Result;
  const MachineRegisterInfo &MRIA = *MRI;

  unsigned Reg = TRI->getEncodingValue(Op.getReg());

  if (TRI->isVGPR(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
    Result.first = Reg - RegisterEncoding.VGPR0;
    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
    assert(Result.first >= NUM_ALL_VGPRS &&
           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
  }
  // TODO: Handle TTMP
  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
  else
    return {-1, -1};

  const MachineInstr &MIA = *MI;
  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
  unsigned Size = TRI->getRegSizeInBits(*RC);
  Result.second = Result.first + (Size / 32);

  return Result;
}
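
// Illustrative example (not from the original source): for a 64-bit operand
// in v[2:3], Size is 64, so the half-open interval is [2, 4); a 128-bit
// SGPR tuple s[8:11] yields [NUM_ALL_VGPRS + 8, NUM_ALL_VGPRS + 12).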

void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
                                       const SIInstrInfo *TII,
                                       const SIRegisterInfo *TRI,
                                       const MachineRegisterInfo *MRI,
                                       unsigned OpNo, int32_t Val) {
  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
  LLVM_DEBUG({
    const MachineOperand &Opnd = MI->getOperand(OpNo);
    assert(TRI->isVGPR(*MRI, Opnd.getReg()));
  });
  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    setRegScore(RegNo, EXP_CNT, Val);
  }
}

void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                                         const SIRegisterInfo *TRI,
                                         const MachineRegisterInfo *MRI,
                                         WaitEventType E, MachineInstr &Inst) {
  const MachineRegisterInfo &MRIA = *MRI;
  InstCounterType T = eventCounter(E);
  int32_t CurrScore = getScoreUB(T) + 1;
  // EventUB and ScoreUB need to be updated regardless of whether this event
  // changes the score of a register or not.
  // Examples include vm_cnt for a buffer-store or lgkm_cnt for a send-message.
  EventUBs[E] = CurrScore;
  setScoreUB(T, CurrScore);

  if (T == EXP_CNT) {
    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
    // is required.
    if (!MixedExpTypes) {
      MixedExpTypes = counterOutOfOrder(EXP_CNT);
    }

    // Put score on the source vgprs. If this is a store, just use those
    // specific register(s).
    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
      // All GDS operations must protect their address register (same as
      // export.)
      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
            CurrScore);
      }
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
            CurrScore);
        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                       AMDGPU::OpName::data1) != -1) {
          setExpScore(&Inst, TII, TRI, MRI,
                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
                                                 AMDGPU::OpName::data1),
                      CurrScore);
        }
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          const MachineOperand &Op = Inst.getOperand(I);
          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
          }
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      }
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
        setExpScore(
            &Inst, TII, TRI, MRI,
            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
            CurrScore);
      }
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that
        // can be used as the actual source after export patching, so
        // we need to treat them like sources and set the EXP_CNT
        // score.
        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
          MachineOperand &DefMO = Inst.getOperand(I);
          if (DefMO.isReg() && DefMO.isDef() &&
              TRI->isVGPR(MRIA, DefMO.getReg())) {
            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
                        CurrScore);
          }
        }
      }
      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
        MachineOperand &MO = Inst.getOperand(I);
        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
        }
      }
    }
#if 0 // TODO: check if this is handled by MUBUF code above.
  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
             Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
    unsigned OpNo;//TODO: find the OpNo for this operand;
    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
    for (signed RegNo = Interval.first; RegNo < Interval.second;
         ++RegNo) {
      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
    }
#endif
  } else {
    // Match the score to the destination registers.
    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
        continue;
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        setRegScore(RegNo, T, CurrScore);
      }
    }
    if (TII->isDS(Inst) && Inst.mayStore()) {
      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
    }
  }
}

void BlockWaitcntBrackets::print(raw_ostream &OS) {
  OS << '\n';
  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
       T = (enum InstCounterType)(T + 1)) {
    int LB = getScoreLB(T);
    int UB = getScoreUB(T);

    switch (T) {
    case VM_CNT:
      OS << "    VM_CNT(" << UB - LB << "): ";
      break;
    case LGKM_CNT:
      OS << "    LGKM_CNT(" << UB - LB << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << UB - LB << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << UB - LB << "): ";
      break;
    }

    if (LB < UB) {
      // Print vgpr scores.
      for (int J = 0; J <= getMaxVGPR(); J++) {
        int RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        int RelScore = RegScore - LB - 1;
        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for lgkm_cnt.
      if (T == LGKM_CNT) {
        for (int J = 0; J <= getMaxSGPR(); J++) {
          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
          if (RegScore <= LB)
            continue;
          int RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
    }
    OS << '\n';
  }
  OS << '\n';
}

unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
                                                int ScoreToWait) {
  unsigned int NeedWait = 0;
  if (ScoreToWait == -1) {
    // The score to wait is unknown. This implies that it was not encountered
    // during the path of the CFG walk done during the current traversal but
    // may be seen on a different path. Emit an s_waitcnt with a conservative
    // value of 0 for the counter.
    NeedWait = CNT_MASK(T);
    setScoreLB(T, getScoreUB(T));
    return NeedWait;
  }

  // If the score of src_operand falls within the bracket, we need an
  // s_waitcnt instruction.
  const int32_t LB = getScoreLB(T);
  const int32_t UB = getScoreUB(T);
  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
    if ((T == VM_CNT || T == LGKM_CNT) &&
        hasPendingFlat() &&
        !ST->hasFlatLgkmVMemCountInOrder()) {
      // If there is a pending FLAT operation, and this is a VMem or LGKM
      // waitcnt and the target can report early completion, then we need
      // to force a waitcnt 0.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there are multiple
      // types of event in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
      NeedWait = CNT_MASK(T);
      setScoreLB(T, getScoreUB(T));
    } else {
      NeedWait = CNT_MASK(T);
      setScoreLB(T, ScoreToWait);
    }
  }

  return NeedWait;
}
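
// Worked example (illustrative, not from the original source): with LB = 0,
// UB = 3 and ScoreToWait = 2 for VM_CNT, the in-order path sets LB = 2 and
// the caller later encodes vmcnt(UB - LB) = vmcnt(1): the two older
// operations with scores 1 and 2 must retire, while the one with score 3
// may stay outstanding.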

// Where there are multiple types of event in the bracket of a counter,
// the decrement may go out of order.
bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
  switch (T) {
  case VM_CNT:
    return false;
  case LGKM_CNT: {
    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      // Scalar memory reads can always go out of order.
      return true;
    }
    int NumEventTypes = 0;
    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
      NumEventTypes++;
    }
    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  case EXP_CNT: {
    // If there has been a mixture of export types, then a waitcnt exp(0) is
    // required.
    if (MixedExpTypes)
      return true;
    int NumEventTypes = 0;
    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }
    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
      NumEventTypes++;
    }

    if (NumEventTypes <= 1) {
      return false;
    }
    break;
  }
  default:
    break;
  }
  return true;
}

INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                      false)
INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                    false)

char SIInsertWaitcnts::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcnts();
}

static bool readsVCCZ(const MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
         !MI.getOperand(1).isUndef();
}

/// Given two wait count encodings, check whether LHS is stronger than RHS.
bool SIInsertWaitcnts::isWaitcntStronger(unsigned LHS, unsigned RHS) {
  if (AMDGPU::decodeVmcnt(IV, LHS) > AMDGPU::decodeVmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeLgkmcnt(IV, LHS) > AMDGPU::decodeLgkmcnt(IV, RHS))
    return false;
  if (AMDGPU::decodeExpcnt(IV, LHS) > AMDGPU::decodeExpcnt(IV, RHS))
    return false;
  return true;
}

/// Given two wait count encodings, create a new encoding which is stronger
/// than or equal to both.
unsigned SIInsertWaitcnts::combineWaitcnt(unsigned LHS, unsigned RHS) {
  unsigned VmCnt = std::min(AMDGPU::decodeVmcnt(IV, LHS),
                            AMDGPU::decodeVmcnt(IV, RHS));
  unsigned LgkmCnt = std::min(AMDGPU::decodeLgkmcnt(IV, LHS),
                              AMDGPU::decodeLgkmcnt(IV, RHS));
  unsigned ExpCnt = std::min(AMDGPU::decodeExpcnt(IV, LHS),
                             AMDGPU::decodeExpcnt(IV, RHS));
  return AMDGPU::encodeWaitcnt(IV, VmCnt, ExpCnt, LgkmCnt);
}
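
// Illustrative note (not from the original source): a smaller field value is
// a stronger wait, so combining s_waitcnt vmcnt(1) lgkmcnt(4) with
// s_waitcnt vmcnt(3) lgkmcnt(0) takes the per-field minimum and yields
// s_waitcnt vmcnt(1) lgkmcnt(0).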

/// Generate s_waitcnt instruction to be placed before cur_Inst.
/// Instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
void SIInsertWaitcnts::generateWaitcntInstBefore(
    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
  // To emit, or not to emit - that's the question!
  // Start with an assumption that there is no need to emit.
  unsigned int EmitWaitcnt = 0;

  // No need to wait before a phi. If a phi-move exists, then the wait should
  // have been inserted before the move. If a phi-move does not exist, then
  // the wait should be inserted before the real use. The same is true for
  // sc-merge. It is not a coincidence that all these cases correspond to the
  // instructions that are skipped in the assembling loop.
  bool NeedLineMapping = false; // TODO: Check on this.

  // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
  bool ForceEmitZeroWaitcnt = false;

  setForceEmitWaitcnt();
  bool IsForceEmitWaitcnt = isForceEmitWaitcnt();

  if (MI.isDebugInstr() &&
      // TODO: any other opcode?
      !NeedLineMapping) {
    return;
  }

  // See if an s_waitcnt is forced at block entry, or is needed at
  // program end.
  if (ScoreBrackets->getWaitAtBeginning()) {
    // Note that we have already cleared the state, so we don't need to update
    // it.
    ScoreBrackets->clearWaitAtBeginning();
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      EmitWaitcnt |= CNT_MASK(T);
      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
    }
  }

  // See if this instruction has a forced S_WAITCNT VM.
  // TODO: Handle other cases of NeedsWaitcntVmBefore()
  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
  }

  // All waits must be resolved at call return.
  // NOTE: this could be improved with knowledge of all call sites or
  // with knowledge of the called routines.
  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
      MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
         T = (enum InstCounterType)(T + 1)) {
      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
        EmitWaitcnt |= CNT_MASK(T);
      }
    }
  }
  // Resolve vm waits before gs-done.
  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
            AMDGPU::SendMsg::ID_GS_DONE)) {
    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
      EmitWaitcnt |= CNT_MASK(VM_CNT);
    }
  }
#if 0 // TODO: the following blocks of logic when we have fence.
  else if (MI.getOpcode() == SC_FENCE) {
    const unsigned int group_size =
      context->shader_info->GetMaxThreadGroupSize();
    // group_size == 0 means thread group size is unknown at compile time
    const bool group_is_multi_wave =
      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();

    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
      SCRegType src_type = Inst->GetSrcType(i);
      switch (src_type) {
        case SCMEM_LDS:
          if (group_is_multi_wave ||
            context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
                               ScoreBrackets->getScoreUB(LGKM_CNT));
            // LDS may have to wait for VM_CNT after buffer load to LDS
            if (target_info->HasBufferLoadToLDS()) {
              EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
                                 ScoreBrackets->getScoreUB(VM_CNT));
            }
          }
          break;

        case SCMEM_GDS:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
              ScoreBrackets->getScoreUB(LGKM_CNT));
          }
          break;

        case SCMEM_UAV:
        case SCMEM_TFBUF:
        case SCMEM_RING:
        case SCMEM_SCATTER:
          if (group_is_multi_wave || fence_is_global) {
            EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
              ScoreBrackets->getScoreUB(EXP_CNT));
            EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
              ScoreBrackets->getScoreUB(VM_CNT));
          }
          break;

        case SCMEM_SCRATCH:
        default:
          break;
      }
    }
  }
#endif

  // Export & GDS instructions do not read the EXEC mask until after the export
  // is granted (which can occur well after the instruction is issued).
  // The shader program must flush all EXP operations on the export-count
  // before overwriting the EXEC mask.
  else {
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      // Export and GDS are tracked individually, either may trigger a waitcnt
      // for EXEC.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
    }

#if 0 // TODO: the following code to handle CALL.
    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
    // However, there is a problem with EXP_CNT, because the call cannot
    // easily tell if a register is used in the function, and if it did, then
    // the referring instruction would have to have an S_WAITCNT, which is
    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
    // before the call.
    if (MI.getOpcode() == SC_CALL) {
      if (ScoreBrackets->getScoreUB(EXP_CNT) >
          ScoreBrackets->getScoreLB(EXP_CNT)) {
        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
        EmitWaitcnt |= CNT_MASK(EXP_CNT);
      }
    }
#endif

    // FIXME: Should not be relying on memoperands.
    // Look at the source operands of every instruction to see if
    // any of them results from a previous memory operation that affects
    // its current usage. If so, an s_waitcnt instruction needs to be
    // emitted.
    // If the source operand was defined by a load, add the s_waitcnt
    // instruction.
    for (const MachineMemOperand *Memop : MI.memoperands()) {
      unsigned AS = Memop->getAddrSpace();
      if (AS != AMDGPUASI.LOCAL_ADDRESS)
        continue;
      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
      // VM_CNT is only relevant to vgpr or LDS.
      EmitWaitcnt |= ScoreBrackets->updateByWait(
          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
    }

    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      const MachineOperand &Op = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Op.getReg())) {
          // VM_CNT is only relevant to vgpr or LDS.
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    }
    // End of for loop that looks at all source operands to decide vm_wait_cnt
    // and lgk_wait_cnt.

    // Two cases are handled for destination operands:
    // 1) If the destination operand was defined by a load, add the s_waitcnt
    // instruction to guarantee the right WAW order.
    // 2) If the destination operand was used by a recent export/store
    // instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
    if (MI.mayStore()) {
      // FIXME: Should not be relying on memoperands.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUASI.LOCAL_ADDRESS)
          continue;
        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
      }
    }
    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
      MachineOperand &Def = MI.getOperand(I);
      const MachineRegisterInfo &MRIA = *MRI;
      RegInterval Interval =
          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
        if (TRI->isVGPR(MRIA, Def.getReg())) {
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
          EmitWaitcnt |= ScoreBrackets->updateByWait(
              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
        }
        EmitWaitcnt |= ScoreBrackets->updateByWait(
            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
      }
    } // End of for loop that looks at all dest operands.
  }

  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
  // occurs before the instruction. Doing it here prevents any additional
  // S_WAITCNTs from being emitted if the instruction was marked as
  // requiring a WAITCNT beforehand.
  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
      !ST->hasAutoWaitcntBeforeBarrier()) {
    EmitWaitcnt |=
        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
    EmitWaitcnt |= ScoreBrackets->updateByWait(
        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
  }

  // TODO: Remove this work-around, enable the assert for Bug 457939
  // after fixing the scheduler. Also, the Shader Compiler code is
  // independent of target.
  if (readsVCCZ(MI) && ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) {
    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
            ScoreBrackets->getScoreUB(LGKM_CNT) &&
        ScoreBrackets->hasPendingSMEM()) {
      // Wait on everything, not just LGKM. vccz reads usually come from
      // terminators, and we always wait on everything at the end of the
      // block, so if we only wait on LGKM here, we might end up with
      // another s_waitcnt inserted right after this if there are non-LGKM
      // instructions still outstanding.
      // FIXME: this is too conservative / the comment is wrong.
      // We don't wait on everything at the end of the block and we combine
      // waitcnts so we should never have back-to-back waitcnts.
      ForceEmitZeroWaitcnt = true;
      EmitWaitcnt = true;
    }
  }

  // Does this operand processing indicate s_wait counter update?
  if (EmitWaitcnt || IsForceEmitWaitcnt) {
    int CntVal[NUM_INST_CNTS];

    bool UseDefaultWaitcntStrategy = true;
    if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
      // Force all waitcnts to 0.
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
      }
      CntVal[VM_CNT] = 0;
      CntVal[EXP_CNT] = 0;
      CntVal[LGKM_CNT] = 0;
      UseDefaultWaitcntStrategy = false;
    }

    if (UseDefaultWaitcntStrategy) {
      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
           T = (enum InstCounterType)(T + 1)) {
        if (EmitWaitcnt & CNT_MASK(T)) {
          int Delta =
              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
          if (Delta >= MaxDelta) {
            Delta = -1;
            if (T != EXP_CNT) {
              ScoreBrackets->setScoreLB(
                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
            }
            EmitWaitcnt &= ~CNT_MASK(T);
          }
          CntVal[T] = Delta;
        } else {
          // If we are not waiting for a particular counter then encode
          // it as -1 which means "don't care."
          CntVal[T] = -1;
        }
      }
    }

    // If we are not waiting on any counter we can skip the wait altogether.
    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
      if (!OldWaitcnt ||
          (AMDGPU::decodeVmcnt(IV, Imm) !=
           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
          (AMDGPU::decodeExpcnt(IV, Imm) !=
           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
        if (ContainingLoop) {
          MachineBasicBlock *TBB = ContainingLoop->getHeader();
          BlockWaitcntBrackets *ScoreBracket =
              BlockWaitcntBracketsMap[TBB].get();
          if (!ScoreBracket) {
            assert(!BlockVisitedSet.count(TBB));
            BlockWaitcntBracketsMap[TBB] =
                llvm::make_unique<BlockWaitcntBrackets>(ST);
            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
          }
          ScoreBracket->setRevisitLoop(true);
          LLVM_DEBUG(dbgs()
                         << "set-revisit2: Block"
                         << ContainingLoop->getHeader()->getNumber() << '\n';);
        }
      }

      // Update an existing waitcnt, or make a new one.
      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
      // We don't remove waitcnts that existed prior to the waitcnt
      // pass. Check if the waitcnt to-be-inserted can be avoided
      // or if the prev waitcnt can be updated.
      bool insertSWaitInst = true;
      for (MachineBasicBlock::iterator I = MI.getIterator(),
                                       B = MI.getParent()->begin();
           insertSWaitInst && I != B; --I) {
        if (I == MI.getIterator())
          continue;

        switch (I->getOpcode()) {
        case AMDGPU::S_WAITCNT:
          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
            insertSWaitInst = false;
          else if (!OldWaitcnt) {
            OldWaitcnt = &*I;
            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
          }
          break;
        // TODO: skip over instructions which never require wait.
        }
        break;
      }
      if (insertSWaitInst) {
        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
          if (ForceEmitZeroWaitcnts)
            LLVM_DEBUG(
                dbgs()
                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
          if (IsForceEmitWaitcnt)
            LLVM_DEBUG(dbgs()
                       << "Force emit a s_waitcnt due to debug counter\n");

          OldWaitcnt->getOperand(0).setImm(Enc);
          if (!OldWaitcnt->getParent())
            MI.getParent()->insert(MI, OldWaitcnt);

          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *OldWaitcnt << '\n');
        } else {
          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                   MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                               .addImm(Enc);
          TrackedWaitcntSet.insert(SWaitInst);

          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                            << "Old Instr: " << MI << '\n'
                            << "New Instr: " << *SWaitInst << '\n');
        }
      }

      if (CntVal[EXP_CNT] == 0) {
        ScoreBrackets->setMixedExpTypes(false);
      }
    }
  }
}

void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
                                             MachineInstr *Waitcnt) {
  if (MBB.empty()) {
    MBB.push_back(Waitcnt);
    return;
  }

  MachineBasicBlock::iterator It = MBB.end();
  MachineInstr *MI = &*(--It);
  if (MI->isBranch()) {
    MBB.insert(It, Waitcnt);
  } else {
    MBB.push_back(Waitcnt);
  }
}

// This is a flat memory operation. Check to see if it has memory
// tokens for both LDS and Memory, and if so, mark it as a flat.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
  if (MI.memoperands_empty())
    return true;

  for (const MachineMemOperand *Memop : MI.memoperands()) {
    unsigned AS = Memop->getAddrSpace();
    if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
      return true;
  }

  return false;
}
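
// Illustrative note (not from the original source): a FLAT access with no
// memoperands is conservatively assumed to possibly hit LDS, which is why
// the function returns true in that case; a flat access known via its
// memoperands to touch only GLOBAL_ADDRESS would return false.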

void SIInsertWaitcnts::updateEventWaitcntAfter(
    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access
  // instruction, update the upper-bound of the appropriate counter's
  // bracket and the destination operand scores.
  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
    } else {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    assert(Inst.mayLoad() || Inst.mayStore());

    if (TII->usesVM_CNT(Inst))
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);

    if (TII->usesLGKM_CNT(Inst)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);

      // This is a flat memory operation, so note it - it will require
      // that both the VM and LGKM be flushed to zero if it is pending when
      // a VM or LGKM dependency occurs.
      if (mayAccessLDSThroughFlat(Inst))
        ScoreBrackets->setPendingFlat();
    }
  } else if (SIInstrInfo::isVMEM(Inst) &&
             // TODO: get a better carve out.
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
      break;
    case AMDGPU::EXP:
    case AMDGPU::EXP_DONE: {
      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
      if (Imm >= 32 && Imm <= 63)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
      else if (Imm >= 12 && Imm <= 15)
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
      else
        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
      break;
    }
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
      break;
    default:
      break;
    }
  }
}
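
// Illustrative note (not from the original source): the export target ranges
// checked above follow the EXP tgt encoding, where 12..15 are the position
// targets (pos0..pos3) and 32..63 are the parameter targets
// (param0..param31); anything else, e.g. the color/MRT targets, falls
// through to EXP_GPR_LOCK.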
1381 
1382 // Merge the score brackets of the Block's predecessors;
1383 // this merged score bracket is used when adding waitcnts to the Block
1384 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1385  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1386  int32_t MaxPending[NUM_INST_CNTS] = {0};
1387  int32_t MaxFlat[NUM_INST_CNTS] = {0};
1388  bool MixedExpTypes = false;
1389 
1390  // For single basic block loops, we need to retain the Block's
1391  // score bracket to have accurate Pred info. So, make a copy of Block's
1392  // score bracket, clear() it (which retains several important bits of info),
1393  // populate, and then replace en masse. For non-single basic block loops,
1394  // just clear Block's current score bracket and repopulate in-place.
1395  bool IsSelfPred;
1396  std::unique_ptr<BlockWaitcntBrackets> S;
1397 
1398  IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block))
1399  != Block.pred_end();
1400  if (IsSelfPred) {
1401  S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1402  ScoreBrackets = S.get();
1403  }
1404 
1405  ScoreBrackets->clear();
1406 
1407  // See if there are any uninitialized predecessors. If so, emit an
1408  // s_waitcnt 0 at the beginning of the block.
1409  for (MachineBasicBlock *Pred : Block.predecessors()) {
1410  BlockWaitcntBrackets *PredScoreBrackets =
1411  BlockWaitcntBracketsMap[Pred].get();
1412  bool Visited = BlockVisitedSet.count(Pred);
1413  if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1414  continue;
1415  }
1416  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1417  T = (enum InstCounterType)(T + 1)) {
1418  int span =
1419  PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1420  MaxPending[T] = std::max(MaxPending[T], span);
1421  span =
1422  PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1423  MaxFlat[T] = std::max(MaxFlat[T], span);
1424  }
1425 
1426  MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1427  }
1428 
1429  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1430  // Also handle kills for exit block.
1431  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1432  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1433  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1434  T = (enum InstCounterType)(T + 1)) {
1435  int Span = KillWaitBrackets[I]->getScoreUB(T) -
1436  KillWaitBrackets[I]->getScoreLB(T);
1437  MaxPending[T] = std::max(MaxPending[T], Span);
1438  Span = KillWaitBrackets[I]->pendingFlat(T) -
1439  KillWaitBrackets[I]->getScoreLB(T);
1440  MaxFlat[T] = std::max(MaxFlat[T], Span);
1441  }
1442 
1443  MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1444  }
1445  }
1446 
1447  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1448  for (MachineBasicBlock *Pred : Block.predecessors()) {
1449  BlockWaitcntBrackets *PredScoreBrackets =
1450  BlockWaitcntBracketsMap[Pred].get();
1451  bool Visited = BlockVisitedSet.count(Pred);
1452  if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1453  continue;
1454  }
1455 
1456  int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1457  PredScoreBrackets->getScoreLB(EXP_CNT);
1458  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1459  int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1460  PredScoreBrackets->getScoreLB(EXP_CNT);
1461  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1462  }
1463 
1464  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1465  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1466  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1467  int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1468  KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1469  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1470  int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1471  KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1472  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1473  }
1474  }
1475 
1476 #if 0
1477  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1478  // TODO: how does LC distinguish between function entry and main entry?
1479  // If this is the entry to a function, force a wait.
1480  MachineBasicBlock &Entry = Block.getParent()->front();
1481  if (Entry.getNumber() == Block.getNumber()) {
1482  ScoreBrackets->setWaitAtBeginning();
1483  return;
1484  }
1485 #endif
1486 
1487  // Now set the current Block's brackets to the largest ending bracket.
1488  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1489  T = (enum InstCounterType)(T + 1)) {
1490  ScoreBrackets->setScoreUB(T, MaxPending[T]);
1491  ScoreBrackets->setScoreLB(T, 0);
1492  ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1493  }
1494 
1495  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1496 
1497  // Set the register scoreboard.
1498  for (MachineBasicBlock *Pred : Block.predecessors()) {
1499  if (!BlockVisitedSet.count(Pred)) {
1500  continue;
1501  }
1502 
1503  BlockWaitcntBrackets *PredScoreBrackets =
1504  BlockWaitcntBracketsMap[Pred].get();
1505 
1506  // Now merge the gpr_reg_score information
1507  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1508  T = (enum InstCounterType)(T + 1)) {
1509  int PredLB = PredScoreBrackets->getScoreLB(T);
1510  int PredUB = PredScoreBrackets->getScoreUB(T);
1511  if (PredLB < PredUB) {
1512  int PredScale = MaxPending[T] - PredUB;
1513  // Merge vgpr scores.
1514  for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1515  int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1516  if (PredRegScore <= PredLB)
1517  continue;
1518  int NewRegScore = PredScale + PredRegScore;
1519  ScoreBrackets->setRegScore(
1520  J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1521  }
1522  // Also need to merge sgpr scores for lgkm_cnt.
1523  if (T == LGKM_CNT) {
1524  for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1525  int PredRegScore =
1526  PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1527  if (PredRegScore <= PredLB)
1528  continue;
1529  int NewRegScore = PredScale + PredRegScore;
1530  ScoreBrackets->setRegScore(
1531  J + NUM_ALL_VGPRS, LGKM_CNT,
1532  std::max(
1533  ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1534  NewRegScore));
1535  }
1536  }
1537  }
1538  }
1539 
1540  // Also merge the WaitEvent information.
1542  enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1543  int PredEventUB = PredScoreBrackets->getEventUB(W);
1544  if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1545  int NewEventUB =
1546  MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1547  if (NewEventUB > 0) {
1548  ScoreBrackets->setEventUB(
1549  W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1550  }
1551  }
1552  }
1553  }
1554 
1555  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1556  // Set the register scoreboard.
1557  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1558  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1559  // Now merge the gpr_reg_score information.
1560  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1561  T = (enum InstCounterType)(T + 1)) {
1562  int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1563  int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1564  if (PredLB < PredUB) {
1565  int PredScale = MaxPending[T] - PredUB;
1566  // Merge vgpr scores.
1567  for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1568  int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1569  if (PredRegScore <= PredLB)
1570  continue;
1571  int NewRegScore = PredScale + PredRegScore;
1572  ScoreBrackets->setRegScore(
1573  J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1574  }
1575  // Also need to merge sgpr scores for lgkm_cnt.
1576  if (T == LGKM_CNT) {
1577  for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1578  int PredRegScore =
1579  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1580  if (PredRegScore <= PredLB)
1581  continue;
1582  int NewRegScore = PredScale + PredRegScore;
1583  ScoreBrackets->setRegScore(
1584  J + NUM_ALL_VGPRS, LGKM_CNT,
1585  std::max(
1586  ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1587  NewRegScore));
1588  }
1589  }
1590  }
1591  }
1592 
1593  // Also merge the WaitEvent information.
1594  ForAllWaitEventType(W) {
1595  enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1596  int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1597  if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1598  int NewEventUB =
1599  MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1600  if (NewEventUB > 0) {
1601  ScoreBrackets->setEventUB(
1602  W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1603  }
1604  }
1605  }
1606  }
1607  }
1608 
1609  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1610  // sequencing predecessors, because changes to EXEC require waitcnts due to
1611  // the delayed nature of these operations.
1612  for (MachineBasicBlock *Pred : Block.predecessors()) {
1613  if (!BlockVisitedSet.count(Pred)) {
1614  continue;
1615  }
1616 
1617  BlockWaitcntBrackets *PredScoreBrackets =
1618  BlockWaitcntBracketsMap[Pred].get();
1619 
1620  int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1621  if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1622  int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1623  PredScoreBrackets->getScoreUB(EXP_CNT);
1624  if (new_gds_ub > 0) {
1625  ScoreBrackets->setEventUB(
1626  GDS_GPR_LOCK,
1627  std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1628  }
1629  }
1630  int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1631  if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1632  int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1633  PredScoreBrackets->getScoreUB(EXP_CNT);
1634  if (new_exp_ub > 0) {
1635  ScoreBrackets->setEventUB(
1636  EXP_GPR_LOCK,
1637  std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1638  }
1639  }
1640  }
1641 
1642  // If this is a single-block loop, update the score brackets. This is not
1643  // needed for other blocks, as we updated them in place.
1644  if (IsSelfPred) {
1645  BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets);
1646  }
1647 }
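// The merge above rebases each predecessor's scores into the merged
// bracket's coordinate space before taking the conservative maximum:
// NewScore = (MaxPending[T] - PredUB) + PredScore. A minimal sketch of
// that arithmetic on plain ints (illustrative only, not part of the pass):
#if 0
#include <algorithm>

static int mergeRegScore(int MaxPending, int PredUB, int PredScore,
                         int CurScore) {
  // Shift the predecessor score so its upper bound lines up with the
  // merged pending count, then keep the larger (more conservative) value.
  int Rebased = (MaxPending - PredUB) + PredScore;
  return std::max(CurScore, Rebased);
}

// E.g. MaxPending = 7 and PredUB = 5 shift a predecessor score of 4 to
// 7 - 5 + 4 = 6; merged with a current score of 3, the result is 6.
#endif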
1648 
1649 /// Return true if the given basic block is a "bottom" block of a loop.
1650 /// This works even if the loop is discontiguous. This also handles
1651 /// multiple back-edges for the same "header" block of a loop.
1652 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1653  const MachineBasicBlock *Block) {
1654  for (MachineBasicBlock *MBB : Loop->blocks()) {
1655  if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1656  return true;
1657  }
1658  }
1659  return false;
1660 }
1661 
1662 /// Count the number of "bottom" basic blocks of a loop.
1663 unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1664  unsigned Count = 0;
1665  for (MachineBasicBlock *MBB : Loop->blocks()) {
1666  if (MBB->isSuccessor(Loop->getHeader())) {
1667  Count++;
1668  }
1669  }
1670  return Count;
1671 }
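// Worked example (illustrative): in a loop with header H and two latch
// blocks B1 and B2 that both branch back to H, isLoopBottom() is true
// for B1 and B2, countNumBottomBlocks() returns 2, and the convergence
// threshold used below becomes 2 + 1 = 3 iterations.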
1672 
1673 // Generate s_waitcnt instructions where needed.
1674 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1675  MachineBasicBlock &Block) {
1676  // Initialize the state information.
1677  mergeInputScoreBrackets(Block);
1678 
1679  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1680 
1681  LLVM_DEBUG({
1682  dbgs() << "*** Block" << Block.getNumber() << " ***";
1683  ScoreBrackets->dump();
1684  });
1685 
1686  // Walk over the instructions.
1687  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1688  Iter != E;) {
1689  MachineInstr &Inst = *Iter;
1690  // Process any previously existing waitcnts.
1691  if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1692  // Leave pre-existing waitcnts, but note their existence via setWaitcnt.
1693  // Remove the waitcnt-pass-generated waitcnts; the pass will add them back
1694  // as needed.
1695  if (!TrackedWaitcntSet.count(&Inst))
1696  ++Iter;
1697  else {
1698  ++Iter;
1699  Inst.removeFromParent();
1700  }
1701  ScoreBrackets->setWaitcnt(&Inst);
1702  continue;
1703  }
1704 
1705  // Kill instructions generate a conditional branch to the endmain block.
1706  // Merge the current waitcnt state into the endmain block information.
1707  // TODO: Are there other flavors of KILL instruction?
1708  if (Inst.getOpcode() == AMDGPU::KILL) {
1709  addKillWaitBracket(ScoreBrackets);
1710  }
1711 
1712  bool VCCZBugWorkAround = false;
1713  if (readsVCCZ(Inst) &&
1714  (!VCCZBugHandledSet.count(&Inst))) {
1715  if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1716  ScoreBrackets->getScoreUB(LGKM_CNT) &&
1717  ScoreBrackets->hasPendingSMEM()) {
1718  if (ST->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
1719  VCCZBugWorkAround = true;
1720  }
1721  }
1722 
1723  // Generate an s_waitcnt instruction to be placed before
1724  // Inst, if needed.
1725  generateWaitcntInstBefore(Inst, ScoreBrackets);
1726 
1727  updateEventWaitcntAfter(Inst, ScoreBrackets);
1728 
1729 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1730  // If this instruction generates an S_SETVSKIP because it is an
1731  // indexed resource, and we are on Tahiti, then it will also force
1732  // an S_WAITCNT vmcnt(0)
1733  if (RequireCheckResourceType(Inst, context)) {
1734  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1735  ScoreBrackets->setScoreLB(VM_CNT,
1736  ScoreBrackets->getScoreUB(VM_CNT));
1737  }
1738 #endif
1739 
1740  ScoreBrackets->clearWaitcnt();
1741 
1742  LLVM_DEBUG({
1743  Inst.print(dbgs());
1744  ScoreBrackets->dump();
1745  });
1746 
1747  // Check to see if this is a GWS instruction. If so, and if this is CI or
1748  // VI, then the generated code sequence will include an S_WAITCNT 0.
1749  // TODO: Are these the only GWS instructions?
1750  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1751  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1752  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1753  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1754  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1755  // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1756  ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1757  ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1758  ScoreBrackets->updateByWait(LGKM_CNT,
1759  ScoreBrackets->getScoreUB(LGKM_CNT));
1760  }
1761 
1762  // TODO: Remove this work-around after fixing the scheduler and enabling
1763  // the assert above.
1764  if (VCCZBugWorkAround) {
1765  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1766  // bit is updated, so we can restore the bit by reading the value of
1767  // vcc and then writing it back to the register.
1768  BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1769  AMDGPU::VCC)
1770  .addReg(AMDGPU::VCC);
1771  VCCZBugHandledSet.insert(&Inst);
1772  }
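// Illustrative note: the workaround above materializes as a self-move,
//   s_mov_b64 vcc, vcc
// in the final assembly. Writing vcc refreshes the hardware's cached
// vccz bit, which the affected (SI/CI) parts can leave stale while a
// scalar-memory operation that writes vcc is still outstanding.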
1773 
1774  ++Iter;
1775  }
1776 
1777  // Check if we need to force convergence at loop footer.
1778  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1779  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
1780  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1781  WaitcntData->print();
1782  LLVM_DEBUG(dbgs() << '\n';);
1783 
1784  // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1785  // placement, but doesn't guarantee convergence for a loop. Each
1786  // loop should take at most (n+1) iterations to converge naturally,
1787  // where n is the number of bottom blocks. If this threshold is reached and
1788  // the result hasn't converged, then we force convergence by inserting
1789  // an s_waitcnt at the end of the loop footer.
1790  if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
1791  // To ensure convergence, the wait events at the loop footer must be no
1792  // more than those from the previous iteration.
1793  // As a simplification, instead of tracking individual scores and
1794  // generating the precise wait count, just wait on 0.
1795  bool HasPending = false;
1796  MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1797  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1798  T = (enum InstCounterType)(T + 1)) {
1799  if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1800  ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1801  HasPending = true;
1802  break;
1803  }
1804  }
1805 
1806  if (HasPending) {
1807  if (!SWaitInst) {
1808  SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
1809  DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1810  .addImm(0);
1811  TrackedWaitcntSet.insert(SWaitInst);
1812 #if 0 // TODO: Format the debug output
1813  OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1814  OutputTransformAdd(SWaitInst, context);
1815 #endif
1816  }
1817 #if 0 // TODO: ??
1818  _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1819 #endif
1820  }
1821 
1822  if (SWaitInst) {
1823  LLVM_DEBUG({
1824  SWaitInst->print(dbgs());
1825  dbgs() << "\nAdjusted score board:";
1826  ScoreBrackets->dump();
1827  });
1828 
1829  // Add this waitcnt to the block. It was either newly created or created
1830  // in a previous iteration and is added back, since block traversal always
1831  // removes waitcnts.
1832  insertWaitcntBeforeCF(Block, SWaitInst);
1833  WaitcntData->setWaitcnt(SWaitInst);
1834  }
1835  }
1836  }
1837 }
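// The forced-convergence waitcnt above is emitted with immediate 0. A
// short sketch of what that immediate encodes (assuming the helpers from
// Utils/AMDGPUBaseInfo.h):
#if 0
// "s_waitcnt 0" is the same instruction as
//   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
// i.e. all outstanding vector-memory, export, and LDS/GDS/scalar-memory
// operations must retire before execution continues.
unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0, /*Expcnt=*/0,
                                     /*Lgkmcnt=*/0); // encodes to 0
#endif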
1838 
1839 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1840  ST = &MF.getSubtarget<GCNSubtarget>();
1841  TII = ST->getInstrInfo();
1842  TRI = &TII->getRegisterInfo();
1843  MRI = &MF.getRegInfo();
1844  MLI = &getAnalysis<MachineLoopInfo>();
1845  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
1846  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1847  AMDGPUASI = ST->getAMDGPUAS();
1848 
1849  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1850  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1851  T = (enum InstCounterType)(T + 1))
1852  ForceEmitWaitcnt[T] = false;
1853 
1854  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1855  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1856  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1857 
1858  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1859  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1860  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1861  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1862 
1863  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1864  RegisterEncoding.VGPRL =
1865  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1866  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1867  RegisterEncoding.SGPRL =
1868  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
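// Example (illustrative): with 256 addressable VGPRs, VGPRL is VGPR0's
// encoding + 255, so any hardware encoding in [VGPR0, VGPRL] falls in
// the VGPR portion of the scoring tables; SGPR0/SGPRL bound the SGPR
// portion the same way.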
1869 
1870  TrackedWaitcntSet.clear();
1871  BlockVisitedSet.clear();
1872  VCCZBugHandledSet.clear();
1873  LoopWaitcntDataMap.clear();
1874  BlockWaitcntProcessedSet.clear();
1875 
1876  // Walk over the blocks in reverse post-order, inserting s_waitcnt where
1877  // needed.
1878  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
1879  bool Modified = false;
1880  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1881  I = RPOT.begin(),
1882  E = RPOT.end(), J = RPOT.begin();
1883  I != E;) {
1884  MachineBasicBlock &MBB = **I;
1885 
1886  BlockVisitedSet.insert(&MBB);
1887 
1888  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1889  if (!ScoreBrackets) {
1890  BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>(ST);
1891  ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1892  }
1893  ScoreBrackets->setPostOrder(MBB.getNumber());
1894  MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1895  if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1896  LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1897 
1898  // If we are walking into the block from before the loop, then guarantee
1899  // at least 1 re-walk over the loop to propagate the information, even if
1900  // no S_WAITCNT instructions were generated.
1901  if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1902  unsigned Count = countNumBottomBlocks(ContainingLoop);
1903 
1904  // If the loop has multiple back-edges, and so more than one "bottom"
1905  // basic block, we have to guarantee a re-walk over every block.
1906  if ((std::count(BlockWaitcntProcessedSet.begin(),
1907  BlockWaitcntProcessedSet.end(), &MBB) < (int)Count)) {
1908  BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1909  LLVM_DEBUG(dbgs() << "set-revisit1: Block"
1910  << ContainingLoop->getHeader()->getNumber() << '\n';);
1911  }
1912  }
1913 
1914  // Walk over the instructions.
1915  insertWaitcntInBlock(MF, MBB);
1916 
1917  // Record that waitcnts have been processed at least once for this block.
1918  BlockWaitcntProcessedSet.push_back(&MBB);
1919 
1920  // See if we want to revisit the loop. If a loop has multiple back-edges,
1921  // we shouldn't revisit the same "bottom" basic block.
1922  if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1923  std::count(BlockWaitcntProcessedSet.begin(),
1924  BlockWaitcntProcessedSet.end(), &MBB) == 1) {
1925  MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1926  BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1927  if (EntrySB && EntrySB->getRevisitLoop()) {
1928  EntrySB->setRevisitLoop(false);
1929  J = I;
1930  int32_t PostOrder = EntrySB->getPostOrder();
1931  // TODO: Avoid this loop. Find another way to set I.
1932  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
1933  X = RPOT.begin(),
1934  Y = RPOT.end();
1935  X != Y; ++X) {
1936  MachineBasicBlock &MBBX = **X;
1937  if (MBBX.getNumber() == PostOrder) {
1938  I = X;
1939  break;
1940  }
1941  }
1942  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1943  WaitcntData->incIterCnt();
1944  LLVM_DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';);
1945  continue;
1946  } else {
1947  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1948  // Loop converged; reset the iteration count. If this loop gets revisited,
1949  // it must be from an outer loop; the counter will restart, ensuring we
1950  // don't force convergence on such revisits.
1951  WaitcntData->resetIterCnt();
1952  }
1953  }
1954 
1955  J = I;
1956  ++I;
1957  }
1958 
1960 
1961  bool HaveScalarStores = false;
1962 
1963  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1964  ++BI) {
1965  MachineBasicBlock &MBB = *BI;
1966 
1967  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1968  ++I) {
1969  if (!HaveScalarStores && TII->isScalarStore(*I))
1970  HaveScalarStores = true;
1971 
1972  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1973  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1974  EndPgmBlocks.push_back(&MBB);
1975  }
1976  }
1977 
1978  if (HaveScalarStores) {
1979  // If scalar writes are used, the cache must be flushed or else the next
1980  // wave to reuse the same scratch memory can be clobbered.
1981  //
1982  // Insert s_dcache_wb at wave termination points if there were any scalar
1983  // stores, and only if the cache hasn't already been flushed. This could be
1984  // improved by looking across blocks for flushes in postdominating blocks
1985  // from the stores but an explicitly requested flush is probably very rare.
1986  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1987  bool SeenDCacheWB = false;
1988 
1989  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1990  ++I) {
1991  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1992  SeenDCacheWB = true;
1993  else if (TII->isScalarStore(*I))
1994  SeenDCacheWB = false;
1995 
1996  // FIXME: It would be better to insert this before a waitcnt if any.
1997  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1998  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1999  !SeenDCacheWB) {
2000  Modified = true;
2001  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2002  }
2003  }
2004  }
2005  }
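// Illustrative result: a program that performed any scalar store now
// terminates with
//   s_dcache_wb
//   s_endpgm
// unless a flush already appeared after the last scalar store on that
// path.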
2006 
2007  if (!MFI->isEntryFunction()) {
2008  // Wait for any outstanding memory operations that the input registers may
2009  // depend on. We can't track them and it's better to do the wait after the
2010  // costly call sequence.
2011 
2012  // TODO: Could insert earlier and schedule more liberally with operations
2013  // that only use caller preserved registers.
2014  MachineBasicBlock &EntryBB = MF.front();
2015  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
2016  .addImm(0);
2017 
2018  Modified = true;
2019  }
2020 
2021  return Modified;
2022 }
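// A minimal usage sketch (assumption: a hand-rolled legacy pass pipeline;
// in-tree, the AMDGPU target itself schedules this pass):
#if 0
#include "AMDGPU.h"
#include "llvm/IR/LegacyPassManager.h"

void addWaitcnts(llvm::legacy::PassManagerBase &PM) {
  // createSIInsertWaitcntsPass() is the factory declared in AMDGPU.h;
  // SIInsertWaitcntsID identifies the pass for lookup.
  PM.add(llvm::createSIInsertWaitcntsPass());
}
#endif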