LLVM  6.0.0svn
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Insert wait instructions for memory reads and writes.
12 ///
13 /// Memory reads and writes are issued asynchronously, so we need to insert
14 /// S_WAITCNT instructions when we want to access any of their results or
15 /// overwrite any register that's used asynchronously.
16 //
17 //===----------------------------------------------------------------------===//
18 
19 #include "AMDGPU.h"
20 #include "AMDGPUSubtarget.h"
21 #include "SIDefines.h"
22 #include "SIInstrInfo.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/DenseMap.h"
27 #include "llvm/ADT/DenseSet.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/SmallVector.h"
40 #include "llvm/IR/DebugLoc.h"
41 #include "llvm/Pass.h"
42 #include "llvm/Support/Debug.h"
45 #include <algorithm>
46 #include <cassert>
47 #include <cstdint>
48 #include <cstring>
49 #include <memory>
50 #include <utility>
51 #include <vector>
52 
53 #define DEBUG_TYPE "si-insert-waitcnts"
54 
55 using namespace llvm;
56 
57 namespace {
58 
59 // Class of object that encapsulates the latest instruction counter score
60 // associated with the operand. Used for determining whether an
61 // s_waitcnt instruction needs to be emitted.
62 
// Bit that stands for counter type `t` in a mask of counters that need a wait.
#define CNT_MASK(t) (1u << (t))

// Wait counters tracked by this pass. NUM_INST_CNTS is not a counter; it is
// the number of counters and sizes the per-counter arrays.
enum InstCounterType {
  VM_CNT = 0,
  LGKM_CNT,
  EXP_CNT,
  NUM_INST_CNTS
};

// Half-open range [first, second) of scoreboard slots. getRegInterval()
// returns {-1, -1} for operands that are not tracked.
using RegInterval = std::pair<int, int>;
68 
// Limits consulted by the pass. VmcntMax, ExpcntMax and LgkmcntMax are the
// values BlockWaitcntBrackets::getWaitCountMax() returns for VM_CNT, EXP_CNT
// and LGKM_CNT respectively.
// NOTE(review): nothing in this part of the file assigns these fields or reads
// NumVGPRsMax/NumSGPRsMax -- presumably they are filled in per subtarget when
// the pass runs; confirm.
69 struct {
70  int32_t VmcntMax;
71  int32_t ExpcntMax;
72  int32_t LgkmcntMax;
73  int32_t NumVGPRsMax;
74  int32_t NumSGPRsMax;
75 } HardwareLimits;
76 
// Register encoding values used by BlockWaitcntBrackets::getRegInterval() to
// turn an operand's encoding value into a scoreboard slot: VGPR encodings are
// asserted to lie in [VGPR0, VGPRL] and are rebased on VGPR0; SGPR encodings
// are rebased on SGPR0.
// NOTE(review): SGPRL is not read in this part of the file, and the code that
// fills in these fields is not visible here -- confirm both.
77 struct {
78  unsigned VGPR0;
79  unsigned VGPRL;
80  unsigned SGPR0;
81  unsigned SGPRL;
82 } RegisterEncoding;
83 
// Kinds of events that advance a wait counter. eventCounter() maps each of
// them to the counter it affects; NUM_WAIT_EVENTS is the number of event kinds
// and sizes the per-event arrays.
enum WaitEventType {
  VMEM_ACCESS,      // vector-memory read & write
  LDS_ACCESS,       // lds read & write
  GDS_ACCESS,       // gds read & write
  SQ_MESSAGE,       // send message
  SMEM_ACCESS,      // scalar-memory read & write
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  NUM_WAIT_EVENTS,
};
97 
// Layout of the scoreboard slot numbers used by the register score tables.
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
//
// NOTE(review): the line opening this enum was missing from this copy of the
// file; the enumerators are only ever used unqualified here, so the enum name
// below is an assumption -- confirm against the upstream source.
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // This is a placeholder the Shader algorithm uses.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
111 
// Loop header that declares `w` and steps it through every WaitEventType
// value from 0 up to, but not including, NUM_WAIT_EVENTS.
#define ForAllWaitEventType(w)                                                 \
  for (WaitEventType w = static_cast<WaitEventType>(0);                        \
       (w) < static_cast<WaitEventType>(NUM_WAIT_EVENTS);                      \
       (w) = static_cast<WaitEventType>((w) + 1))
116 
117 // This is a per-basic-block object that maintains current score brackets
118 // of each wait-counter, and a per-register scoreboard for each wait-couner.
119 // We also maintain the latest score for every event type that can change the
120 // waitcnt in order to know if there are multiple types of events within
121 // the brackets. When multiple types of event happen in the bracket,
122 // wait-count may get decreased out of order, therefore we need to put in
123 // "s_waitcnt 0" before use.
124 class BlockWaitcntBrackets {
125 public:
126  BlockWaitcntBrackets() {
127  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
128  T = (enum InstCounterType)(T + 1)) {
129  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
130  }
131  }
132 
133  ~BlockWaitcntBrackets() = default;
134 
135  static int32_t getWaitCountMax(InstCounterType T) {
136  switch (T) {
137  case VM_CNT:
138  return HardwareLimits.VmcntMax;
139  case LGKM_CNT:
140  return HardwareLimits.LgkmcntMax;
141  case EXP_CNT:
142  return HardwareLimits.ExpcntMax;
143  default:
144  break;
145  }
146  return 0;
147  }
148 
149  void setScoreLB(InstCounterType T, int32_t Val) {
150  assert(T < NUM_INST_CNTS);
151  if (T >= NUM_INST_CNTS)
152  return;
153  ScoreLBs[T] = Val;
154  }
155 
156  void setScoreUB(InstCounterType T, int32_t Val) {
157  assert(T < NUM_INST_CNTS);
158  if (T >= NUM_INST_CNTS)
159  return;
160  ScoreUBs[T] = Val;
161  if (T == EXP_CNT) {
162  int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
163  if (ScoreLBs[T] < UB)
164  ScoreLBs[T] = UB;
165  }
166  }
167 
168  int32_t getScoreLB(InstCounterType T) {
169  assert(T < NUM_INST_CNTS);
170  if (T >= NUM_INST_CNTS)
171  return 0;
172  return ScoreLBs[T];
173  }
174 
175  int32_t getScoreUB(InstCounterType T) {
176  assert(T < NUM_INST_CNTS);
177  if (T >= NUM_INST_CNTS)
178  return 0;
179  return ScoreUBs[T];
180  }
181 
182  // Mapping from event to counter.
183  InstCounterType eventCounter(WaitEventType E) {
184  switch (E) {
185  case VMEM_ACCESS:
186  return VM_CNT;
187  case LDS_ACCESS:
188  case GDS_ACCESS:
189  case SQ_MESSAGE:
190  case SMEM_ACCESS:
191  return LGKM_CNT;
192  case EXP_GPR_LOCK:
193  case GDS_GPR_LOCK:
194  case VMW_GPR_LOCK:
195  case EXP_POS_ACCESS:
196  case EXP_PARAM_ACCESS:
197  return EXP_CNT;
198  default:
199  llvm_unreachable("unhandled event type");
200  }
201  return NUM_INST_CNTS;
202  }
203 
204  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
205  if (GprNo < NUM_ALL_VGPRS) {
206  if (GprNo > VgprUB) {
207  VgprUB = GprNo;
208  }
209  VgprScores[T][GprNo] = Val;
210  } else {
211  assert(T == LGKM_CNT);
212  if (GprNo - NUM_ALL_VGPRS > SgprUB) {
213  SgprUB = GprNo - NUM_ALL_VGPRS;
214  }
215  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
216  }
217  }
218 
219  int32_t getRegScore(int GprNo, InstCounterType T) {
220  if (GprNo < NUM_ALL_VGPRS) {
221  return VgprScores[T][GprNo];
222  }
223  return SgprScores[GprNo - NUM_ALL_VGPRS];
224  }
225 
226  void clear() {
227  memset(ScoreLBs, 0, sizeof(ScoreLBs));
228  memset(ScoreUBs, 0, sizeof(ScoreUBs));
229  memset(EventUBs, 0, sizeof(EventUBs));
230  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
231  T = (enum InstCounterType)(T + 1)) {
232  memset(VgprScores[T], 0, sizeof(VgprScores[T]));
233  }
234  memset(SgprScores, 0, sizeof(SgprScores));
235  }
236 
237  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
238  const MachineRegisterInfo *MRI,
239  const SIRegisterInfo *TRI, unsigned OpNo,
240  bool Def) const;
241 
242  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
243  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
244  unsigned OpNo, int32_t Val);
245 
246  void setWaitAtBeginning() { WaitAtBeginning = true; }
247  void clearWaitAtBeginning() { WaitAtBeginning = false; }
248  bool getWaitAtBeginning() const { return WaitAtBeginning; }
249  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
250  int32_t getMaxVGPR() const { return VgprUB; }
251  int32_t getMaxSGPR() const { return SgprUB; }
252 
253  int32_t getEventUB(enum WaitEventType W) const {
254  assert(W < NUM_WAIT_EVENTS);
255  return EventUBs[W];
256  }
257 
258  bool counterOutOfOrder(InstCounterType T);
259  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
260  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
261  const MachineRegisterInfo *MRI, WaitEventType E,
262  MachineInstr &MI);
263 
264  bool hasPendingSMEM() const {
265  return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
266  EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
267  }
268 
269  bool hasPendingFlat() const {
270  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
271  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
272  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
273  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
274  }
275 
276  void setPendingFlat() {
277  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
278  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
279  }
280 
281  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
282 
283  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
284 
285  bool getRevisitLoop() const { return RevisitLoop; }
286  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
287 
288  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
289  int32_t getPostOrder() const { return PostOrder; }
290 
291  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
292  void clearWaitcnt() { Waitcnt = nullptr; }
293  MachineInstr *getWaitcnt() const { return Waitcnt; }
294 
295  bool mixedExpTypes() const { return MixedExpTypes; }
296  void setMixedExpTypes(bool MixedExpTypesIn) {
297  MixedExpTypes = MixedExpTypesIn;
298  }
299 
300  void print(raw_ostream &);
301  void dump() { print(dbgs()); }
302 
303 private:
304  bool WaitAtBeginning = false;
305  bool RevisitLoop = false;
306  bool MixedExpTypes = false;
307  int32_t PostOrder = 0;
308  MachineInstr *Waitcnt = nullptr;
309  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
310  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
311  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
312  // Remember the last flat memory operation.
313  int32_t LastFlat[NUM_INST_CNTS] = {0};
314  // wait_cnt scores for every vgpr.
315  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
316  int32_t VgprUB = 0;
317  int32_t SgprUB = 0;
318  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
319  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
320  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
321 };
322 
// This is a per-loop-region object that records waitcnt status at the end of
// loop footer from the previous iteration. We also maintain an iteration
// count to track the number of times the loop has been visited. When it
// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
// at the end of the loop footer.
class LoopWaitcntData {
public:
  LoopWaitcntData() = default;
  ~LoopWaitcntData() = default;

  void incIterCnt() { IterCnt++; }
  void resetIterCnt() { IterCnt = 0; }
  int32_t getIterCnt() const { return IterCnt; }

  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
  MachineInstr *getWaitcnt() const { return LfWaitcnt; }

  // Emits the iteration count through the DEBUG macro.
  void print() const {
    DEBUG(dbgs() << " iteration " << IterCnt << '\n';);
  }

private:
  // s_waitcnt added at the end of loop footer to stabilize wait scores
  // at the end of the loop footer.
  MachineInstr *LfWaitcnt = nullptr;
  // Number of iterations the loop has been visited, not including the initial
  // walk over.
  int32_t IterCnt = 0;
};
352 
// Machine function pass whose pass name is "SI insert wait instructions".
// Only runOnMachineFunction(), getPassName() and getAnalysisUsage() override
// the base class; the remaining member functions are declared here and
// defined out of line.
// NOTE(review): this copy of the class is missing several lines (see the
// notes below) -- restore them from the upstream file before building.
353 class SIInsertWaitcnts : public MachineFunctionPass {
354 private:
355  const SISubtarget *ST = nullptr;
356  const SIInstrInfo *TII = nullptr;
357  const SIRegisterInfo *TRI = nullptr;
358  const MachineRegisterInfo *MRI = nullptr;
359  const MachineLoopInfo *MLI = nullptr;
// NOTE(review): one member declaration appears to be missing here.
361  AMDGPUAS AMDGPUASI;
362 
363  DenseSet<MachineBasicBlock *> BlockVisitedSet;
364  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
365  DenseSet<MachineInstr *> VCCZBugHandledSet;
366 
// NOTE(review): the first line of this declaration (the type of
// BlockWaitcntBracketsMap) is missing -- presumably a map from basic block to
// its BlockWaitcntBrackets; confirm.
368  BlockWaitcntBracketsMap;
369 
370  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
371 
// NOTE(review): a member declaration appears to be missing here.
373 
// Copies of bracket state saved by addKillWaitBracket().
374  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
375 
376 public:
377  static char ID;
378 
379  SIInsertWaitcnts() : MachineFunctionPass(ID) {}
380 
381  bool runOnMachineFunction(MachineFunction &MF) override;
382 
383  StringRef getPassName() const override {
384  return "SI insert wait instructions";
385  }
386 
387  void getAnalysisUsage(AnalysisUsage &AU) const override {
388  AU.setPreservesCFG();
// NOTE(review): two lines are missing here -- presumably the analyses this
// pass requires; confirm.
391  }
392 
// Appends a copy of *Bracket to KillWaitBrackets.
393  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
394  // The waitcnt information is copied because it changes as the block is
395  // traversed.
396  KillWaitBrackets.push_back(
397  llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
398  }
399 
400  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
401  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
402  BlockWaitcntBrackets *ScoreBrackets);
403  void updateEventWaitCntAfter(MachineInstr &Inst,
404  BlockWaitcntBrackets *ScoreBrackets);
405  void mergeInputScoreBrackets(MachineBasicBlock &Block);
406  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
407  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
408  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
409 };
410 
411 } // end anonymous namespace
412 
413 RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
414  const SIInstrInfo *TII,
415  const MachineRegisterInfo *MRI,
416  const SIRegisterInfo *TRI,
417  unsigned OpNo,
418  bool Def) const {
419  const MachineOperand &Op = MI->getOperand(OpNo);
420  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
421  (Def && !Op.isDef()))
422  return {-1, -1};
423 
424  // A use via a PW operand does not need a waitcnt.
425  // A partial write is not a WAW.
426  assert(!Op.getSubReg() || !Op.isUndef());
427 
428  RegInterval Result;
429  const MachineRegisterInfo &MRIA = *MRI;
430 
431  unsigned Reg = TRI->getEncodingValue(Op.getReg());
432 
433  if (TRI->isVGPR(MRIA, Op.getReg())) {
434  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
435  Result.first = Reg - RegisterEncoding.VGPR0;
436  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
437  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
438  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
439  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
440  assert(Result.first >= NUM_ALL_VGPRS &&
441  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
442  }
443  // TODO: Handle TTMP
444  // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
445  else
446  return {-1, -1};
447 
448  const MachineInstr &MIA = *MI;
449  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
450  unsigned Size = TRI->getRegSizeInBits(*RC);
451  Result.second = Result.first + (Size / 32);
452 
453  return Result;
454 }
455 
456 void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
457  const SIInstrInfo *TII,
458  const SIRegisterInfo *TRI,
459  const MachineRegisterInfo *MRI,
460  unsigned OpNo, int32_t Val) {
461  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
462  DEBUG({
463  const MachineOperand &Opnd = MI->getOperand(OpNo);
464  assert(TRI->isVGPR(*MRI, Opnd.getReg()));
465  });
466  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
467  setRegScore(RegNo, EXP_CNT, Val);
468  }
469 }
470 
471 void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
472  const SIRegisterInfo *TRI,
473  const MachineRegisterInfo *MRI,
474  WaitEventType E, MachineInstr &Inst) {
475  const MachineRegisterInfo &MRIA = *MRI;
476  InstCounterType T = eventCounter(E);
477  int32_t CurrScore = getScoreUB(T) + 1;
478  // EventUB and ScoreUB need to be update regardless if this event changes
479  // the score of a register or not.
480  // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
481  EventUBs[E] = CurrScore;
482  setScoreUB(T, CurrScore);
483 
484  if (T == EXP_CNT) {
485  // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
486  // is required.
487  if (!MixedExpTypes) {
488  MixedExpTypes = counterOutOfOrder(EXP_CNT);
489  }
490 
491  // Put score on the source vgprs. If this is a store, just use those
492  // specific register(s).
493  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
494  // All GDS operations must protect their address register (same as
495  // export.)
496  if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
497  Inst.getOpcode() != AMDGPU::DS_CONSUME) {
498  setExpScore(
499  &Inst, TII, TRI, MRI,
500  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
501  CurrScore);
502  }
503  if (Inst.mayStore()) {
504  setExpScore(
505  &Inst, TII, TRI, MRI,
506  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
507  CurrScore);
509  AMDGPU::OpName::data1) != -1) {
510  setExpScore(&Inst, TII, TRI, MRI,
512  AMDGPU::OpName::data1),
513  CurrScore);
514  }
515  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
516  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
517  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
518  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
519  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
520  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
521  Inst.getOpcode() != AMDGPU::DS_APPEND &&
522  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
523  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
524  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
525  const MachineOperand &Op = Inst.getOperand(I);
526  if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
527  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
528  }
529  }
530  }
531  } else if (TII->isFLAT(Inst)) {
532  if (Inst.mayStore()) {
533  setExpScore(
534  &Inst, TII, TRI, MRI,
535  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
536  CurrScore);
537  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
538  setExpScore(
539  &Inst, TII, TRI, MRI,
540  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
541  CurrScore);
542  }
543  } else if (TII->isMIMG(Inst)) {
544  if (Inst.mayStore()) {
545  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
546  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
547  setExpScore(
548  &Inst, TII, TRI, MRI,
549  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
550  CurrScore);
551  }
552  } else if (TII->isMTBUF(Inst)) {
553  if (Inst.mayStore()) {
554  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
555  }
556  } else if (TII->isMUBUF(Inst)) {
557  if (Inst.mayStore()) {
558  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
559  } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
560  setExpScore(
561  &Inst, TII, TRI, MRI,
562  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
563  CurrScore);
564  }
565  } else {
566  if (TII->isEXP(Inst)) {
567  // For export the destination registers are really temps that
568  // can be used as the actual source after export patching, so
569  // we need to treat them like sources and set the EXP_CNT
570  // score.
571  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
572  MachineOperand &DefMO = Inst.getOperand(I);
573  if (DefMO.isReg() && DefMO.isDef() &&
574  TRI->isVGPR(MRIA, DefMO.getReg())) {
575  setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
576  CurrScore);
577  }
578  }
579  }
580  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
581  MachineOperand &MO = Inst.getOperand(I);
582  if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
583  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
584  }
585  }
586  }
587 #if 0 // TODO: check if this is handled by MUBUF code above.
588  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
589  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
590  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
591  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
592  unsigned OpNo;//TODO: find the OpNo for this operand;
593  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
594  for (signed RegNo = Interval.first; RegNo < Interval.second;
595  ++RegNo) {
596  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
597  }
598 #endif
599  } else {
600  // Match the score to the destination registers.
601  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
602  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
603  if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
604  continue;
605  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
606  setRegScore(RegNo, T, CurrScore);
607  }
608  }
609  if (TII->isDS(Inst) && Inst.mayStore()) {
610  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
611  }
612  }
613 }
614 
616  OS << '\n';
617  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
618  T = (enum InstCounterType)(T + 1)) {
619  int LB = getScoreLB(T);
620  int UB = getScoreUB(T);
621 
622  switch (T) {
623  case VM_CNT:
624  OS << " VM_CNT(" << UB - LB << "): ";
625  break;
626  case LGKM_CNT:
627  OS << " LGKM_CNT(" << UB - LB << "): ";
628  break;
629  case EXP_CNT:
630  OS << " EXP_CNT(" << UB - LB << "): ";
631  break;
632  default:
633  OS << " UNKNOWN(" << UB - LB << "): ";
634  break;
635  }
636 
637  if (LB < UB) {
638  // Print vgpr scores.
639  for (int J = 0; J <= getMaxVGPR(); J++) {
640  int RegScore = getRegScore(J, T);
641  if (RegScore <= LB)
642  continue;
643  int RelScore = RegScore - LB - 1;
644  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
645  OS << RelScore << ":v" << J << " ";
646  } else {
647  OS << RelScore << ":ds ";
648  }
649  }
650  // Also need to print sgpr scores for lgkm_cnt.
651  if (T == LGKM_CNT) {
652  for (int J = 0; J <= getMaxSGPR(); J++) {
653  int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
654  if (RegScore <= LB)
655  continue;
656  int RelScore = RegScore - LB - 1;
657  OS << RelScore << ":s" << J << " ";
658  }
659  }
660  }
661  OS << '\n';
662  }
663  OS << '\n';
664 }
665 
666 unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
667  int ScoreToWait) {
668  unsigned int NeedWait = 0;
669  if (ScoreToWait == -1) {
670  // The score to wait is unknown. This implies that it was not encountered
671  // during the path of the CFG walk done during the current traversal but
672  // may be seen on a different path. Emit an s_wait counter with a
673  // conservative value of 0 for the counter.
674  NeedWait = CNT_MASK(T);
675  setScoreLB(T, getScoreUB(T));
676  return NeedWait;
677  }
678 
679  // If the score of src_operand falls within the bracket, we need an
680  // s_waitcnt instruction.
681  const int32_t LB = getScoreLB(T);
682  const int32_t UB = getScoreUB(T);
683  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
684  if (T == VM_CNT && hasPendingFlat()) {
685  // If there is a pending FLAT operation, and this is a VM waitcnt,
686  // then we need to force a waitcnt 0 for VM.
687  NeedWait = CNT_MASK(T);
688  setScoreLB(T, getScoreUB(T));
689  } else if (counterOutOfOrder(T)) {
690  // Counter can get decremented out-of-order when there
691  // are multiple types event in the brack. Also emit an s_wait counter
692  // with a conservative value of 0 for the counter.
693  NeedWait = CNT_MASK(T);
694  setScoreLB(T, getScoreUB(T));
695  } else {
696  NeedWait = CNT_MASK(T);
697  setScoreLB(T, ScoreToWait);
698  }
699  }
700 
701  return NeedWait;
702 }
703 
704 // Where there are multiple types of event in the bracket of a counter,
705 // the decrement may go out of order.
706 bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
707  switch (T) {
708  case VM_CNT:
709  return false;
710  case LGKM_CNT: {
711  if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
712  EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
713  // Scalar memory read always can go out of order.
714  return true;
715  }
716  int NumEventTypes = 0;
717  if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
718  EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
719  NumEventTypes++;
720  }
721  if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
722  EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
723  NumEventTypes++;
724  }
725  if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
726  EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
727  NumEventTypes++;
728  }
729  if (NumEventTypes <= 1) {
730  return false;
731  }
732  break;
733  }
734  case EXP_CNT: {
735  // If there has been a mixture of export types, then a waitcnt exp(0) is
736  // required.
737  if (MixedExpTypes)
738  return true;
739  int NumEventTypes = 0;
740  if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
741  EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
742  NumEventTypes++;
743  }
744  if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
745  EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
746  NumEventTypes++;
747  }
748  if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
749  EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
750  NumEventTypes++;
751  }
752  if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
753  EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
754  NumEventTypes++;
755  }
756 
757  if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
758  EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
759  NumEventTypes++;
760  }
761 
762  if (NumEventTypes <= 1) {
763  return false;
764  }
765  break;
766  }
767  default:
768  break;
769  }
770  return true;
771 }
772 
// Pass registration: the pass is registered under DEBUG_TYPE with the
// description "SI Insert Waitcnts". No dependencies are listed between the
// BEGIN and END macros.
773 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
774  false)
775 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
776  false)
777 
778 char SIInsertWaitcnts::ID = 0;
779 
// Exposes the pass ID through the llvm namespace.
780 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
781 
// NOTE(review): the signature line of this function is missing from this copy
// of the file. The body returns a newly allocated SIInsertWaitcnts, so this is
// presumably the pass factory declared in AMDGPU.h -- restore the header from
// the upstream source.
783  return new SIInsertWaitcnts();
784 }
785 
// Returns true when MI is an S_CBRANCH_VCCNZ or S_CBRANCH_VCCZ whose operand 1
// is not marked undef.
static bool readsVCCZ(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_CBRANCH_VCCNZ:
  case AMDGPU::S_CBRANCH_VCCZ:
    return !MI.getOperand(1).isUndef();
  default:
    return false;
  }
}
791 
792 /// \brief Generate s_waitcnt instruction to be placed before cur_Inst.
793 /// Instructions of a given type are returned in order,
794 /// but instructions of different types can complete out of order.
795 /// We rely on this in-order completion
796 /// and simply assign a score to the memory access instructions.
797 /// We keep track of the active "score bracket" to determine
798 /// if an access of a memory read requires an s_waitcnt
799 /// and if so what the value of each counter is.
800 /// The "score bracket" is bound by the lower bound and upper bound
801 /// scores (*_score_LB and *_score_ub respectively).
802 MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
803  MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
804  // To emit, or not to emit - that's the question!
805  // Start with an assumption that there is no need to emit.
806  unsigned int EmitSwaitcnt = 0;
807  // s_waitcnt instruction to return; default is NULL.
808  MachineInstr *SWaitInst = nullptr;
809  // No need to wait before phi. If a phi-move exists, then the wait should
810  // has been inserted before the move. If a phi-move does not exist, then
811  // wait should be inserted before the real use. The same is true for
812  // sc-merge. It is not a coincident that all these cases correspond to the
813  // instructions that are skipped in the assembling loop.
814  bool NeedLineMapping = false; // TODO: Check on this.
815  if (MI.isDebugValue() &&
816  // TODO: any other opcode?
817  !NeedLineMapping) {
818  return SWaitInst;
819  }
820 
821  // See if an s_waitcnt is forced at block entry, or is needed at
822  // program end.
823  if (ScoreBrackets->getWaitAtBeginning()) {
824  // Note that we have already cleared the state, so we don't need to update
825  // it.
826  ScoreBrackets->clearWaitAtBeginning();
827  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
828  T = (enum InstCounterType)(T + 1)) {
829  EmitSwaitcnt |= CNT_MASK(T);
830  ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
831  }
832  }
833 
834  // See if this instruction has a forced S_WAITCNT VM.
835  // TODO: Handle other cases of NeedsWaitcntVmBefore()
836  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
837  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
838  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
839  EmitSwaitcnt |=
840  ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
841  }
842 
843  // All waits must be resolved at call return.
844  // NOTE: this could be improved with knowledge of all call sites or
845  // with knowledge of the called routines.
846  if (MI.getOpcode() == AMDGPU::RETURN ||
847  MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
848  MI.getOpcode() == AMDGPU::S_SETPC_B64_return) {
849  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
850  T = (enum InstCounterType)(T + 1)) {
851  if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
852  ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
853  EmitSwaitcnt |= CNT_MASK(T);
854  }
855  }
856  }
857  // Resolve vm waits before gs-done.
858  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
859  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
862  if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
863  ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
864  EmitSwaitcnt |= CNT_MASK(VM_CNT);
865  }
866  }
867 #if 0 // TODO: the following blocks of logic when we have fence.
868  else if (MI.getOpcode() == SC_FENCE) {
869  const unsigned int group_size =
870  context->shader_info->GetMaxThreadGroupSize();
871  // group_size == 0 means thread group size is unknown at compile time
872  const bool group_is_multi_wave =
873  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
874  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
875 
876  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
877  SCRegType src_type = Inst->GetSrcType(i);
878  switch (src_type) {
879  case SCMEM_LDS:
880  if (group_is_multi_wave ||
881  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
882  EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
883  ScoreBrackets->getScoreUB(LGKM_CNT));
884  // LDS may have to wait for VM_CNT after buffer load to LDS
885  if (target_info->HasBufferLoadToLDS()) {
886  EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
887  ScoreBrackets->getScoreUB(VM_CNT));
888  }
889  }
890  break;
891 
892  case SCMEM_GDS:
893  if (group_is_multi_wave || fence_is_global) {
894  EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
895  ScoreBrackets->getScoreUB(EXP_CNT));
896  EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
897  ScoreBrackets->getScoreUB(LGKM_CNT));
898  }
899  break;
900 
901  case SCMEM_UAV:
902  case SCMEM_TFBUF:
903  case SCMEM_RING:
904  case SCMEM_SCATTER:
905  if (group_is_multi_wave || fence_is_global) {
906  EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
907  ScoreBrackets->getScoreUB(EXP_CNT));
908  EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
909  ScoreBrackets->getScoreUB(VM_CNT));
910  }
911  break;
912 
913  case SCMEM_SCRATCH:
914  default:
915  break;
916  }
917  }
918  }
919 #endif
920 
921  // Export & GDS instructions do not read the EXEC mask until after the export
922  // is granted (which can occur well after the instruction is issued).
923  // The shader program must flush all EXP operations on the export-count
924  // before overwriting the EXEC mask.
925  else {
926  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
927  // Export and GDS are tracked individually, either may trigger a waitcnt
928  // for EXEC.
929  EmitSwaitcnt |= ScoreBrackets->updateByWait(
930  EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
931  EmitSwaitcnt |= ScoreBrackets->updateByWait(
932  EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
933  EmitSwaitcnt |= ScoreBrackets->updateByWait(
934  EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
935  EmitSwaitcnt |= ScoreBrackets->updateByWait(
936  EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
937  }
938 
939 #if 0 // TODO: the following code to handle CALL.
940  // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
941  // However, there is a problem with EXP_CNT, because the call cannot
942  // easily tell if a register is used in the function, and if it did, then
943  // the referring instruction would have to have an S_WAITCNT, which is
944  // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
945  // before the call.
946  if (MI.getOpcode() == SC_CALL) {
947  if (ScoreBrackets->getScoreUB(EXP_CNT) >
948  ScoreBrackets->getScoreLB(EXP_CNT)) {
949  ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
950  EmitSwaitcnt |= CNT_MASK(EXP_CNT);
951  }
952  }
953 #endif
954 
955  // FIXME: Should not be relying on memoperands.
956  // Look at the source operands of every instruction to see if
957  // any of them results from a previous memory operation that affects
958  // its current usage. If so, an s_waitcnt instruction needs to be
959  // emitted.
960  // If the source operand was defined by a load, add the s_waitcnt
961  // instruction.
962  for (const MachineMemOperand *Memop : MI.memoperands()) {
963  unsigned AS = Memop->getAddrSpace();
964  if (AS != AMDGPUASI.LOCAL_ADDRESS)
965  continue;
966  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
967  // VM_CNT is only relevant to vgpr or LDS.
968  EmitSwaitcnt |= ScoreBrackets->updateByWait(
969  VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
970  }
971 
972  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
973  const MachineOperand &Op = MI.getOperand(I);
974  const MachineRegisterInfo &MRIA = *MRI;
975  RegInterval Interval =
976  ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
977  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
978  if (TRI->isVGPR(MRIA, Op.getReg())) {
979  // VM_CNT is only relevant to vgpr or LDS.
980  EmitSwaitcnt |= ScoreBrackets->updateByWait(
981  VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
982  }
983  EmitSwaitcnt |= ScoreBrackets->updateByWait(
984  LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
985  }
986  }
987  // End of for loop that looks at all source operands to decide vm_wait_cnt
988  // and lgk_wait_cnt.
989 
990  // Two cases are handled for destination operands:
991  // 1) If the destination operand was defined by a load, add the s_waitcnt
992  // instruction to guarantee the right WAW order.
993  // 2) If a destination operand that was used by a recent export/store ins,
994  // add s_waitcnt on exp_cnt to guarantee the WAR order.
995  if (MI.mayStore()) {
996  // FIXME: Should not be relying on memoperands.
997  for (const MachineMemOperand *Memop : MI.memoperands()) {
998  unsigned AS = Memop->getAddrSpace();
999  if (AS != AMDGPUASI.LOCAL_ADDRESS)
1000  continue;
1001  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1002  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1003  VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1004  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1005  EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1006  }
1007  }
1008  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1009  MachineOperand &Def = MI.getOperand(I);
1010  const MachineRegisterInfo &MRIA = *MRI;
1011  RegInterval Interval =
1012  ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
1013  for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1014  if (TRI->isVGPR(MRIA, Def.getReg())) {
1015  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1016  VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
1017  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1018  EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
1019  }
1020  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1021  LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
1022  }
1023  } // End of for loop that looks at all dest operands.
1024  }
1025 
1026  // TODO: Tie force zero to a compiler triage option.
1027  bool ForceZero = false;
1028 
1029  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1030  // occurs before the instruction. Doing it here prevents any additional
1031  // S_WAITCNTs from being emitted if the instruction was marked as
1032  // requiring a WAITCNT beforehand.
1033  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1034  !ST->hasAutoWaitcntBeforeBarrier()) {
1035  EmitSwaitcnt |=
1036  ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1037  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1038  EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1039  EmitSwaitcnt |= ScoreBrackets->updateByWait(
1040  LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
1041  }
1042 
1043  // TODO: Remove this work-around, enable the assert for Bug 457939
1044  // after fixing the scheduler. Also, the Shader Compiler code is
1045  // independent of target.
1046  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
1047  if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1048  ScoreBrackets->getScoreUB(LGKM_CNT) &&
1049  ScoreBrackets->hasPendingSMEM()) {
1050  // Wait on everything, not just LGKM. vccz reads usually come from
1051  // terminators, and we always wait on everything at the end of the
1052  // block, so if we only wait on LGKM here, we might end up with
1053  // another s_waitcnt inserted right after this if there are non-LGKM
1054  // instructions still outstanding.
1055  ForceZero = true;
1056  EmitSwaitcnt = true;
1057  }
1058  }
1059 
1060  // Does this operand processing indicate s_wait counter update?
1061  if (EmitSwaitcnt) {
1062  int CntVal[NUM_INST_CNTS];
1063 
1064  bool UseDefaultWaitcntStrategy = true;
1065  if (ForceZero) {
1066  // Force all waitcnts to 0.
1067  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1068  T = (enum InstCounterType)(T + 1)) {
1069  ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1070  }
1071  CntVal[VM_CNT] = 0;
1072  CntVal[EXP_CNT] = 0;
1073  CntVal[LGKM_CNT] = 0;
1074  UseDefaultWaitcntStrategy = false;
1075  }
1076 
1077  if (UseDefaultWaitcntStrategy) {
1078  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1079  T = (enum InstCounterType)(T + 1)) {
1080  if (EmitSwaitcnt & CNT_MASK(T)) {
1081  int Delta =
1082  ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
1083  int MaxDelta = ScoreBrackets->getWaitCountMax(T);
1084  if (Delta >= MaxDelta) {
1085  Delta = -1;
1086  if (T != EXP_CNT) {
1087  ScoreBrackets->setScoreLB(
1088  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
1089  }
1090  EmitSwaitcnt &= ~CNT_MASK(T);
1091  }
1092  CntVal[T] = Delta;
1093  } else {
1094  // If we are not waiting for a particular counter then encode
1095  // it as -1 which means "don't care."
1096  CntVal[T] = -1;
1097  }
1098  }
1099  }
1100 
1101  // If we are not waiting on any counter we can skip the wait altogether.
1102  if (EmitSwaitcnt != 0) {
1103  MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
1104  int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
1105  if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
1106  (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
1107  (AMDGPU::decodeExpcnt(IV, Imm) !=
1108  (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
1109  (AMDGPU::decodeLgkmcnt(IV, Imm) !=
1110  (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
1111  MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
1112  if (ContainingLoop) {
1113  MachineBasicBlock *TBB = ContainingLoop->getHeader();
1114  BlockWaitcntBrackets *ScoreBracket =
1115  BlockWaitcntBracketsMap[TBB].get();
1116  if (!ScoreBracket) {
1117  assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
1118  BlockWaitcntBracketsMap[TBB] =
1119  llvm::make_unique<BlockWaitcntBrackets>();
1120  ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
1121  }
1122  ScoreBracket->setRevisitLoop(true);
1123  DEBUG(dbgs() << "set-revisit: block"
1124  << ContainingLoop->getHeader()->getNumber() << '\n';);
1125  }
1126  }
1127 
1128  // Update an existing waitcount, or make a new one.
1129  MachineFunction &MF = *MI.getParent()->getParent();
1130  if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
1131  SWaitInst = OldWaitcnt;
1132  } else {
1133  SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
1134  MI.getDebugLoc());
1135  CompilerGeneratedWaitcntSet.insert(SWaitInst);
1136  }
1137 
1138  const MachineOperand &Op =
1140  IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
1141  SWaitInst->addOperand(MF, Op);
1142 
1143  if (CntVal[EXP_CNT] == 0) {
1144  ScoreBrackets->setMixedExpTypes(false);
1145  }
1146  }
1147  }
1148 
1149  return SWaitInst;
1150 }
1151 
1152 void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
1153  MachineInstr *Waitcnt) {
1154  if (MBB.empty()) {
1155  MBB.push_back(Waitcnt);
1156  return;
1157  }
1158 
1159  MachineBasicBlock::iterator It = MBB.end();
1160  MachineInstr *MI = &*(--It);
1161  if (MI->isBranch()) {
1162  MBB.insert(It, Waitcnt);
1163  } else {
1164  MBB.push_back(Waitcnt);
1165  }
1166 }
1167 
1168 // This is a flat memory operation. Check to see if it has memory
1169 // tokens for both LDS and Memory, and if so mark it as a flat.
1170 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1171  if (MI.memoperands_empty())
1172  return true;
1173 
1174  for (const MachineMemOperand *Memop : MI.memoperands()) {
1175  unsigned AS = Memop->getAddrSpace();
1176  if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS)
1177  return true;
1178  }
1179 
1180  return false;
1181 }
1182 
1183 void SIInsertWaitcnts::updateEventWaitCntAfter(
1184  MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
1185  // Now look at the instruction opcode. If it is a memory access
1186  // instruction, update the upper-bound of the appropriate counter's
1187  // bracket and the destination operand scores.
1188  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1189  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1190  if (TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1191  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1192  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1193  } else {
1194  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1195  }
1196  } else if (TII->isFLAT(Inst)) {
1197  assert(Inst.mayLoad() || Inst.mayStore());
1198 
1199  if (TII->usesVM_CNT(Inst))
1200  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1201 
1202  if (TII->usesLGKM_CNT(Inst)) {
1203  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1204 
1205  // This is a flat memory operation, so note it - it will require
1206  // that both the VM and LGKM be flushed to zero if it is pending when
1207  // a VM or LGKM dependency occurs.
1208  if (mayAccessLDSThroughFlat(Inst))
1209  ScoreBrackets->setPendingFlat();
1210  }
1211  } else if (SIInstrInfo::isVMEM(Inst) &&
1212  // TODO: get a better carve out.
1213  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
1214  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
1215  Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
1216  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1217  if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
1218  (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1)) {
1219  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1220  }
1221  } else if (TII->isSMRD(Inst)) {
1222  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1223  } else {
1224  switch (Inst.getOpcode()) {
1225  case AMDGPU::S_SENDMSG:
1226  case AMDGPU::S_SENDMSGHALT:
1227  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1228  break;
1229  case AMDGPU::EXP:
1230  case AMDGPU::EXP_DONE: {
1231  int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1232  if (Imm >= 32 && Imm <= 63)
1233  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1234  else if (Imm >= 12 && Imm <= 15)
1235  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1236  else
1237  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1238  break;
1239  }
1240  case AMDGPU::S_MEMTIME:
1241  case AMDGPU::S_MEMREALTIME:
1242  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1243  break;
1244  default:
1245  break;
1246  }
1247  }
1248 }
1249 
1250 void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
1251  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1252  int32_t MaxPending[NUM_INST_CNTS] = {0};
1253  int32_t MaxFlat[NUM_INST_CNTS] = {0};
1254  bool MixedExpTypes = false;
1255 
1256  // Clear the score bracket state.
1257  ScoreBrackets->clear();
1258 
1259  // Compute the number of pending elements on block entry.
1260 
1261  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
1262  // need to handle single BBs with backedges to themselves. This means that
1263  // they will need to retain and not clear their initial state.
1264 
1265  // See if there are any uninitialized predecessors. If so, emit an
1266  // s_waitcnt 0 at the beginning of the block.
1267  for (MachineBasicBlock *pred : Block.predecessors()) {
1268  BlockWaitcntBrackets *PredScoreBrackets =
1269  BlockWaitcntBracketsMap[pred].get();
1270  bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
1271  if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1272  break;
1273  }
1274  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1275  T = (enum InstCounterType)(T + 1)) {
1276  int span =
1277  PredScoreBrackets->getScoreUB(T) - PredScoreBrackets->getScoreLB(T);
1278  MaxPending[T] = std::max(MaxPending[T], span);
1279  span =
1280  PredScoreBrackets->pendingFlat(T) - PredScoreBrackets->getScoreLB(T);
1281  MaxFlat[T] = std::max(MaxFlat[T], span);
1282  }
1283 
1284  MixedExpTypes |= PredScoreBrackets->mixedExpTypes();
1285  }
1286 
1287  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1288  // Also handle kills for exit block.
1289  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1290  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1291  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1292  T = (enum InstCounterType)(T + 1)) {
1293  int Span = KillWaitBrackets[I]->getScoreUB(T) -
1294  KillWaitBrackets[I]->getScoreLB(T);
1295  MaxPending[T] = std::max(MaxPending[T], Span);
1296  Span = KillWaitBrackets[I]->pendingFlat(T) -
1297  KillWaitBrackets[I]->getScoreLB(T);
1298  MaxFlat[T] = std::max(MaxFlat[T], Span);
1299  }
1300 
1301  MixedExpTypes |= KillWaitBrackets[I]->mixedExpTypes();
1302  }
1303  }
1304 
1305  // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
1306  for (MachineBasicBlock *Pred : Block.predecessors()) {
1307  BlockWaitcntBrackets *PredScoreBrackets =
1308  BlockWaitcntBracketsMap[Pred].get();
1309  bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
1310  if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
1311  break;
1312  }
1313 
1314  int GDSSpan = PredScoreBrackets->getEventUB(GDS_GPR_LOCK) -
1315  PredScoreBrackets->getScoreLB(EXP_CNT);
1316  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1317  int EXPSpan = PredScoreBrackets->getEventUB(EXP_GPR_LOCK) -
1318  PredScoreBrackets->getScoreLB(EXP_CNT);
1319  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1320  }
1321 
1322  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1323  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1324  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1325  int GDSSpan = KillWaitBrackets[I]->getEventUB(GDS_GPR_LOCK) -
1326  KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1327  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
1328  int EXPSpan = KillWaitBrackets[I]->getEventUB(EXP_GPR_LOCK) -
1329  KillWaitBrackets[I]->getScoreLB(EXP_CNT);
1330  MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
1331  }
1332  }
1333 
1334 #if 0
1335  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
1336  // TODO: how does LC distinguish between function entry and main entry?
1337  // If this is the entry to a function, force a wait.
1338  MachineBasicBlock &Entry = Block.getParent()->front();
1339  if (Entry.getNumber() == Block.getNumber()) {
1340  ScoreBrackets->setWaitAtBeginning();
1341  return;
1342  }
1343 #endif
1344 
1345  // Now set the current Block's brackets to the largest ending bracket.
1346  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1347  T = (enum InstCounterType)(T + 1)) {
1348  ScoreBrackets->setScoreUB(T, MaxPending[T]);
1349  ScoreBrackets->setScoreLB(T, 0);
1350  ScoreBrackets->setLastFlat(T, MaxFlat[T]);
1351  }
1352 
1353  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
1354 
1355  // Set the register scoreboard.
1356  for (MachineBasicBlock *Pred : Block.predecessors()) {
1357  if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
1358  break;
1359  }
1360 
1361  BlockWaitcntBrackets *PredScoreBrackets =
1362  BlockWaitcntBracketsMap[Pred].get();
1363 
1364  // Now merge the gpr_reg_score information
1365  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1366  T = (enum InstCounterType)(T + 1)) {
1367  int PredLB = PredScoreBrackets->getScoreLB(T);
1368  int PredUB = PredScoreBrackets->getScoreUB(T);
1369  if (PredLB < PredUB) {
1370  int PredScale = MaxPending[T] - PredUB;
1371  // Merge vgpr scores.
1372  for (int J = 0; J <= PredScoreBrackets->getMaxVGPR(); J++) {
1373  int PredRegScore = PredScoreBrackets->getRegScore(J, T);
1374  if (PredRegScore <= PredLB)
1375  continue;
1376  int NewRegScore = PredScale + PredRegScore;
1377  ScoreBrackets->setRegScore(
1378  J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1379  }
1380  // Also need to merge sgpr scores for lgkm_cnt.
1381  if (T == LGKM_CNT) {
1382  for (int J = 0; J <= PredScoreBrackets->getMaxSGPR(); J++) {
1383  int PredRegScore =
1384  PredScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1385  if (PredRegScore <= PredLB)
1386  continue;
1387  int NewRegScore = PredScale + PredRegScore;
1388  ScoreBrackets->setRegScore(
1389  J + NUM_ALL_VGPRS, LGKM_CNT,
1390  std::max(
1391  ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1392  NewRegScore));
1393  }
1394  }
1395  }
1396  }
1397 
1398  // Also merge the WaitEvent information.
1400  enum InstCounterType T = PredScoreBrackets->eventCounter(W);
1401  int PredEventUB = PredScoreBrackets->getEventUB(W);
1402  if (PredEventUB > PredScoreBrackets->getScoreLB(T)) {
1403  int NewEventUB =
1404  MaxPending[T] + PredEventUB - PredScoreBrackets->getScoreUB(T);
1405  if (NewEventUB > 0) {
1406  ScoreBrackets->setEventUB(
1407  W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1408  }
1409  }
1410  }
1411  }
1412 
1413  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
1414  // Set the register scoreboard.
1415  if (Block.succ_empty() && !KillWaitBrackets.empty()) {
1416  for (unsigned int I = 0; I < KillWaitBrackets.size(); I++) {
1417  // Now merge the gpr_reg_score information.
1418  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1419  T = (enum InstCounterType)(T + 1)) {
1420  int PredLB = KillWaitBrackets[I]->getScoreLB(T);
1421  int PredUB = KillWaitBrackets[I]->getScoreUB(T);
1422  if (PredLB < PredUB) {
1423  int PredScale = MaxPending[T] - PredUB;
1424  // Merge vgpr scores.
1425  for (int J = 0; J <= KillWaitBrackets[I]->getMaxVGPR(); J++) {
1426  int PredRegScore = KillWaitBrackets[I]->getRegScore(J, T);
1427  if (PredRegScore <= PredLB)
1428  continue;
1429  int NewRegScore = PredScale + PredRegScore;
1430  ScoreBrackets->setRegScore(
1431  J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
1432  }
1433  // Also need to merge sgpr scores for lgkm_cnt.
1434  if (T == LGKM_CNT) {
1435  for (int J = 0; J <= KillWaitBrackets[I]->getMaxSGPR(); J++) {
1436  int PredRegScore =
1437  KillWaitBrackets[I]->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
1438  if (PredRegScore <= PredLB)
1439  continue;
1440  int NewRegScore = PredScale + PredRegScore;
1441  ScoreBrackets->setRegScore(
1442  J + NUM_ALL_VGPRS, LGKM_CNT,
1443  std::max(
1444  ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
1445  NewRegScore));
1446  }
1447  }
1448  }
1449  }
1450 
1451  // Also merge the WaitEvent information.
1453  enum InstCounterType T = KillWaitBrackets[I]->eventCounter(W);
1454  int PredEventUB = KillWaitBrackets[I]->getEventUB(W);
1455  if (PredEventUB > KillWaitBrackets[I]->getScoreLB(T)) {
1456  int NewEventUB =
1457  MaxPending[T] + PredEventUB - KillWaitBrackets[I]->getScoreUB(T);
1458  if (NewEventUB > 0) {
1459  ScoreBrackets->setEventUB(
1460  W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
1461  }
1462  }
1463  }
1464  }
1465  }
1466 
1467  // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for the
1468  // sequencing predecessors, because changes to EXEC require waitcnts due to
1469  // the delayed nature of these operations.
1470  for (MachineBasicBlock *Pred : Block.predecessors()) {
1471  if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
1472  break;
1473  }
1474 
1475  BlockWaitcntBrackets *PredScoreBrackets =
1476  BlockWaitcntBracketsMap[Pred].get();
1477 
1478  int pred_gds_ub = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
1479  if (pred_gds_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1480  int new_gds_ub = MaxPending[EXP_CNT] + pred_gds_ub -
1481  PredScoreBrackets->getScoreUB(EXP_CNT);
1482  if (new_gds_ub > 0) {
1483  ScoreBrackets->setEventUB(
1484  GDS_GPR_LOCK,
1485  std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), new_gds_ub));
1486  }
1487  }
1488  int pred_exp_ub = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
1489  if (pred_exp_ub > PredScoreBrackets->getScoreLB(EXP_CNT)) {
1490  int new_exp_ub = MaxPending[EXP_CNT] + pred_exp_ub -
1491  PredScoreBrackets->getScoreUB(EXP_CNT);
1492  if (new_exp_ub > 0) {
1493  ScoreBrackets->setEventUB(
1494  EXP_GPR_LOCK,
1495  std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), new_exp_ub));
1496  }
1497  }
1498  }
1499 }
1500 
1501 /// Return the "bottom" block of a loop. This differs from
1502 /// MachineLoop::getBottomBlock in that it works even if the loop is
1503 /// discontiguous.
1504 MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
1505  MachineBasicBlock *Bottom = Loop->getHeader();
1506  for (MachineBasicBlock *MBB : Loop->blocks())
1507  if (MBB->getNumber() > Bottom->getNumber())
1508  Bottom = MBB;
1509  return Bottom;
1510 }
1511 
1512 // Generate s_waitcnt instructions where needed.
1513 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1514  MachineBasicBlock &Block) {
1515  // Initialize the state information.
1516  mergeInputScoreBrackets(Block);
1517 
1518  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
1519 
1520  DEBUG({
1521  dbgs() << "Block" << Block.getNumber();
1522  ScoreBrackets->dump();
1523  });
1524 
1525  // Walk over the instructions.
1526  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
1527  Iter != E;) {
1528  MachineInstr &Inst = *Iter;
1529  // Remove any previously existing waitcnts.
1530  if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
1531  // TODO: Register the old waitcnt and optimize the following waitcnts.
1532  // Leaving the previously existing waitcnts is conservatively correct.
1533  if (CompilerGeneratedWaitcntSet.find(&Inst) ==
1534  CompilerGeneratedWaitcntSet.end())
1535  ++Iter;
1536  else {
1537  ScoreBrackets->setWaitcnt(&Inst);
1538  ++Iter;
1539  Inst.removeFromParent();
1540  }
1541  continue;
1542  }
1543 
1544  // Kill instructions generate a conditional branch to the endmain block.
1545  // Merge the current waitcnt state into the endmain block information.
1546  // TODO: Are there other flavors of KILL instruction?
1547  if (Inst.getOpcode() == AMDGPU::KILL) {
1548  addKillWaitBracket(ScoreBrackets);
1549  }
1550 
1551  bool VCCZBugWorkAround = false;
1552  if (readsVCCZ(Inst) &&
1553  (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
1554  if (ScoreBrackets->getScoreLB(LGKM_CNT) <
1555  ScoreBrackets->getScoreUB(LGKM_CNT) &&
1556  ScoreBrackets->hasPendingSMEM()) {
1557  if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
1558  VCCZBugWorkAround = true;
1559  }
1560  }
1561 
1562  // Generate an s_waitcnt instruction to be placed before
1563  // cur_Inst, if needed.
1564  MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
1565 
1566  if (SWaitInst) {
1567  Block.insert(Inst, SWaitInst);
1568  if (ScoreBrackets->getWaitcnt() != SWaitInst) {
1569  DEBUG(dbgs() << "insertWaitcntInBlock\n"
1570  << "Old Instr: " << Inst << '\n'
1571  << "New Instr: " << *SWaitInst << '\n';);
1572  }
1573  }
1574 
1575  updateEventWaitCntAfter(Inst, ScoreBrackets);
1576 
1577 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1578  // If this instruction generates a S_SETVSKIP because it is an
1579  // indexed resource, and we are on Tahiti, then it will also force
1580  // an S_WAITCNT vmcnt(0)
1581  if (RequireCheckResourceType(Inst, context)) {
1582  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1583  ScoreBrackets->setScoreLB(VM_CNT,
1584  ScoreBrackets->getScoreUB(VM_CNT));
1585  }
1586 #endif
1587 
1588  ScoreBrackets->clearWaitcnt();
1589 
1590  if (SWaitInst) {
1591  DEBUG({ SWaitInst->print(dbgs() << '\n'); });
1592  }
1593  DEBUG({
1594  Inst.print(dbgs());
1595  ScoreBrackets->dump();
1596  });
1597 
1598  // Check to see if this is a GWS instruction. If so, and if this is CI or
1599  // VI, then the generated code sequence will include an S_WAITCNT 0.
1600  // TODO: Are these the only GWS instructions?
1601  if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
1602  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
1603  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
1604  Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
1605  Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
1606  // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
1607  ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
1608  ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
1609  ScoreBrackets->updateByWait(LGKM_CNT,
1610  ScoreBrackets->getScoreUB(LGKM_CNT));
1611  }
1612 
1613  // TODO: Remove this work-around after fixing the scheduler and enable the
1614  // assert above.
1615  if (VCCZBugWorkAround) {
1616  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1617  // bit is updated, so we can restore the bit by reading the value of
1618  // vcc and then writing it back to the register.
1619  BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
1620  AMDGPU::VCC)
1621  .addReg(AMDGPU::VCC);
1622  VCCZBugHandledSet.insert(&Inst);
1623  }
1624 
1625  ++Iter;
1626  }
1627 
1628  // Check if we need to force convergence at loop footer.
1629  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1630  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
1631  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1632  WaitcntData->print();
1633  DEBUG(dbgs() << '\n';);
1634 
1635  // The iterative waitcnt insertion algorithm aims for optimal waitcnt
1636  // placement and doesn't always guarantee convergence for a loop. Each
1637  // loop should take at most 2 iterations for it to converge naturally.
1638  // When this max is reached and result doesn't converge, we force
1639  // convergence by inserting a s_waitcnt at the end of loop footer.
1640  if (WaitcntData->getIterCnt() > 2) {
1641  // To ensure convergence, need to make wait events at loop footer be no
1642  // more than those from the previous iteration.
1643  // As a simplification, Instead of tracking individual scores and
1644  // generate the precise wait count, just wait on 0.
1645  bool HasPending = false;
1646  MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
1647  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
1648  T = (enum InstCounterType)(T + 1)) {
1649  if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
1650  ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
1651  HasPending = true;
1652  }
1653  }
1654 
1655  if (HasPending) {
1656  if (!SWaitInst) {
1657  SWaitInst = Block.getParent()->CreateMachineInstr(
1658  TII->get(AMDGPU::S_WAITCNT), DebugLoc());
1659  CompilerGeneratedWaitcntSet.insert(SWaitInst);
1661  SWaitInst->addOperand(MF, Op);
1662 #if 0 // TODO: Format the debug output
1663  OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
1664  OutputTransformAdd(SWaitInst, context);
1665 #endif
1666  }
1667 #if 0 // TODO: ??
1668  _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
1669 #endif
1670  }
1671 
1672  if (SWaitInst) {
1673  DEBUG({
1674  SWaitInst->print(dbgs());
1675  dbgs() << "\nAdjusted score board:";
1676  ScoreBrackets->dump();
1677  });
1678 
1679  // Add this waitcnt to the block. It is either newly created or
1680  // created in previous iterations and added back since block traversal
1681  // always remove waitcnt.
1682  insertWaitcntBeforeCF(Block, SWaitInst);
1683  WaitcntData->setWaitcnt(SWaitInst);
1684  }
1685  }
1686  }
1687 }
1688 
1689 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1690  ST = &MF.getSubtarget<SISubtarget>();
1691  TII = ST->getInstrInfo();
1692  TRI = &TII->getRegisterInfo();
1693  MRI = &MF.getRegInfo();
1694  MLI = &getAnalysis<MachineLoopInfo>();
1695  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
1697  AMDGPUASI = ST->getAMDGPUAS();
1698 
1699  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1700  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1701  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1702 
1703  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
1704  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
1705  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1706  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1707 
1708  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1709  RegisterEncoding.VGPRL =
1710  RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
1711  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1712  RegisterEncoding.SGPRL =
1713  RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
1714 
1715  // Walk over the blocks in reverse post-dominator order, inserting
1716  // s_waitcnt where needed.
1718  bool Modified = false;
1720  I = RPOT.begin(),
1721  E = RPOT.end(), J = RPOT.begin();
1722  I != E;) {
1723  MachineBasicBlock &MBB = **I;
1724 
1725  BlockVisitedSet.insert(&MBB);
1726 
1727  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1728  if (!ScoreBrackets) {
1729  BlockWaitcntBracketsMap[&MBB] = llvm::make_unique<BlockWaitcntBrackets>();
1730  ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
1731  }
1732  ScoreBrackets->setPostOrder(MBB.getNumber());
1733  MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
1734  if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
1735  LoopWaitcntDataMap[ContainingLoop] = llvm::make_unique<LoopWaitcntData>();
1736 
1737  // If we are walking into the block from before the loop, then guarantee
1738  // at least 1 re-walk over the loop to propagate the information, even if
1739  // no S_WAITCNT instructions were generated.
1740  if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
1741  (BlockWaitcntProcessedSet.find(&MBB) ==
1742  BlockWaitcntProcessedSet.end())) {
1743  BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1744  DEBUG(dbgs() << "set-revisit: block"
1745  << ContainingLoop->getHeader()->getNumber() << '\n';);
1746  }
1747 
1748  // Walk over the instructions.
1749  insertWaitcntInBlock(MF, MBB);
1750 
1751  // Flag that waitcnts have been processed at least once.
1752  BlockWaitcntProcessedSet.insert(&MBB);
1753 
1754  // See if we want to revisit the loop.
1755  if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
1756  MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
1757  BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
1758  if (EntrySB && EntrySB->getRevisitLoop()) {
1759  EntrySB->setRevisitLoop(false);
1760  J = I;
1761  int32_t PostOrder = EntrySB->getPostOrder();
1762  // TODO: Avoid this loop. Find another way to set I.
1764  X = RPOT.begin(),
1765  Y = RPOT.end();
1766  X != Y; ++X) {
1767  MachineBasicBlock &MBBX = **X;
1768  if (MBBX.getNumber() == PostOrder) {
1769  I = X;
1770  break;
1771  }
1772  }
1773  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1774  WaitcntData->incIterCnt();
1775  DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
1776  continue;
1777  } else {
1778  LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
1779  // Loop converged, reset iteration count. If this loop gets revisited,
1780  // it must be from an outer loop, the counter will restart, this will
1781  // ensure we don't force convergence on such revisits.
1782  WaitcntData->resetIterCnt();
1783  }
1784  }
1785 
1786  J = I;
1787  ++I;
1788  }
1789 
1791 
1792  bool HaveScalarStores = false;
1793 
1794  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
1795  ++BI) {
1796  MachineBasicBlock &MBB = *BI;
1797 
1798  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
1799  ++I) {
1800  if (!HaveScalarStores && TII->isScalarStore(*I))
1801  HaveScalarStores = true;
1802 
1803  if (I->getOpcode() == AMDGPU::S_ENDPGM ||
1804  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1805  EndPgmBlocks.push_back(&MBB);
1806  }
1807  }
1808 
1809  if (HaveScalarStores) {
1810  // If scalar writes are used, the cache must be flushed or else the next
1811  // wave to reuse the same scratch memory can be clobbered.
1812  //
1813  // Insert s_dcache_wb at wave termination points if there were any scalar
1814  // stores, and only if the cache hasn't already been flushed. This could be
1815  // improved by looking across blocks for flushes in postdominating blocks
1816  // from the stores but an explicitly requested flush is probably very rare.
1817  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1818  bool SeenDCacheWB = false;
1819 
1820  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1821  ++I) {
1822  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1823  SeenDCacheWB = true;
1824  else if (TII->isScalarStore(*I))
1825  SeenDCacheWB = false;
1826 
1827  // FIXME: It would be better to insert this before a waitcnt if any.
1828  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1829  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1830  !SeenDCacheWB) {
1831  Modified = true;
1832  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1833  }
1834  }
1835  }
1836  }
1837 
1838  if (!MFI->isEntryFunction()) {
1839  // Wait for any outstanding memory operations that the input registers may
1840  // depend on. We can't track them and it's better to to the wait after the
1841  // costly call sequence.
1842 
1843  // TODO: Could insert earlier and schedule more liberally with operations
1844  // that only use caller preserved registers.
1845  MachineBasicBlock &EntryBB = MF.front();
1846  BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1847  .addImm(0);
1848 
1849  Modified = true;
1850  }
1851 
1852  return Modified;
1853 }
Interface definition for SIRegisterInfo.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
bool modifiesRegister(unsigned Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr modifies (fully define or partially define) the specified register...
Definition: MachineInstr.h:965
GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2)
AMDGPU specific subclass of TargetSubtarget.
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
SI Insert Waitcnts
Interval Class - An Interval is a set of nodes defined such that every node in the interval has all o...
Definition: Interval.h:37
Implements a dense probed hash-table based set.
Definition: DenseSet.h:221
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:268
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
unsigned getReg() const
getReg - Returns the register number.
unsigned getSubReg() const
A debug info location.
Definition: DebugLoc.h:34
iterator_range< mmo_iterator > memoperands()
Definition: MachineInstr.h:396
LLVM_READONLY int getAtomicNoRetOp(uint16_t Opcode)
MachineInstr * CreateMachineInstr(const MCInstrDesc &MCID, const DebugLoc &DL, bool NoImp=false)
CreateMachineInstr - Allocate a new MachineInstr.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:411
FunctionPass * createSIInsertWaitcntsPass()
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:432
AnalysisUsage & addRequired()
A description of a memory reference used in the backend.
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
OpenCL uses address spaces to differentiate between various memory regions on the hardware...
Definition: AMDGPU.h:214
const HexagonInstrInfo * TII
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:293
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:440
static bool readsVCCZ(const MachineInstr &MI)
Reg
All possible values of the reg field in the ModR/M byte.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:290
unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
BlockT * getHeader() const
Definition: LoopInfo.h:100
InstCounterType
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:395
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
#define T
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
bool isBranch(QueryType Type=AnyInBundle) const
Returns true if this is a conditional, unconditional, or indirect branch.
Definition: MachineInstr.h:482
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
Definition: MachineInstr.h:639
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
static bool usesVM_CNT(const MachineInstr &MI)
Definition: SIInstrInfo.h:540
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Represent the analysis usage information of a pass.
Instruction set architecture version.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
iterator_range< pred_iterator > predecessors()
#define CNT_MASK(t)
RegisterMapping
const AMDGPUAS & AS
const MachineBasicBlock & front() const
INITIALIZE_PASS_END(RegBankSelect, DEBUG_TYPE, "Assign register bank of generic virtual registers", false, false) RegBankSelect
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
hexagon gen pred
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:472
unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
char & SIInsertWaitcntsID
unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version)
Iterator for intrusive lists based on ilist_node.
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
const SIRegisterInfo * getRegisterInfo() const override
bool isDebugValue() const
Definition: MachineInstr.h:816
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:864
unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version)
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:285
int64_t getImm() const
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
static void clear(coro::Shape &Shape)
Definition: Coroutines.cpp:210
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:139
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Representation of each machine instruction.
Definition: MachineInstr.h:59
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, false) INITIALIZE_PASS_END(SIInsertWaitcnts
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Interface definition for SIInstrInfo.
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:403
#define DEBUG_TYPE
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt)
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:439
unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
static MachineOperand CreateImm(int64_t Val)
void push_back(MachineInstr *MI)
#define I(x, y, z)
Definition: MD5.cpp:58
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:307
typename std::vector< NodeRef >::reverse_iterator rpo_iterator
unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version)
#define ForAllWaitEventType(w)
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
void print(raw_ostream &OS, bool SkipOpers=false, bool SkipDebugLoc=false, const TargetInstrInfo *TII=nullptr) const
Debugging support. Print this MI to OS.
MachineInstr * removeFromParent()
Unlink 'this' from the containing basic block, and return it without deleting it. ...
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
Definition: MachineInstr.h:626
bool memoperands_empty() const
Return true if we don't have any memory operands which describe the memory access done by this i...
Definition: MachineInstr.h:394
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
constexpr char Size[]
Key for Kernel::Arg::Metadata::mSize.
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:44
#define DEBUG(X)
Definition: Debug.h:118
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
static bool usesLGKM_CNT(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:295
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:156
WaitEventType
IsaVersion getIsaVersion(const FeatureBitset &Features)