1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
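//
// Editor's note - illustrative sketch (not taken from the original file or a
// real lit test): after a vector memory load, any use of the loaded VGPR must
// be preceded by an s_waitcnt that drains enough of the vmcnt counter, e.g.
//
//   buffer_load_dword v0, v1, s[0:3], 0 offen   ; one more outstanding vmcnt op
//   ...
//   s_waitcnt vmcnt(0)                          ; inserted by this pass
//   v_add_f32 v2, v0, v0                        ; first use of the result
//
// A counter argument of N means "wait until at most N operations of that kind
// are still outstanding", so vmcnt(0) waits for all of them.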
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/CodeGen/MachineLoopInfo.h"
35 #include "llvm/CodeGen/MachinePostDominators.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Support/DebugCounter.h"
38 #include "llvm/Support/TargetParser.h"
39 using namespace llvm;
40 
41 #define DEBUG_TYPE "si-insert-waitcnts"
42 
43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44  "Force emit s_waitcnt expcnt(0) instrs");
45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46  "Force emit s_waitcnt lgkmcnt(0) instrs");
47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48  "Force emit s_waitcnt vmcnt(0) instrs");
49 
50 static cl::opt<bool> ForceEmitZeroFlag(
51  "amdgpu-waitcnt-forcezero",
52  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53  cl::init(false), cl::Hidden);
54 
55 namespace {
56 // Class of object that encapsulates the latest instruction counter score
57 // associated with an operand. Used for determining whether an
58 // s_waitcnt instruction needs to be emitted.
59 
60 #define CNT_MASK(t) (1u << (t))
61 
62 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
63 } // namespace
64 
65 namespace llvm {
66 template <> struct enum_iteration_traits<InstCounterType> {
67  static constexpr bool is_iterable = true;
68 };
69 } // namespace llvm
70 
71 namespace {
72 auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
73 
74 using RegInterval = std::pair<int, int>;
75 
76 struct HardwareLimits {
77  unsigned VmcntMax;
78  unsigned ExpcntMax;
79  unsigned LgkmcntMax;
80  unsigned VscntMax;
81 };
82 
83 struct RegisterEncoding {
84  unsigned VGPR0;
85  unsigned VGPRL;
86  unsigned SGPR0;
87  unsigned SGPRL;
88 };
89 
90 enum WaitEventType {
91  VMEM_ACCESS, // vector-memory read & write
92  VMEM_READ_ACCESS, // vector-memory read
93  VMEM_WRITE_ACCESS, // vector-memory write
94  LDS_ACCESS, // lds read & write
95  GDS_ACCESS, // gds read & write
96  SQ_MESSAGE, // send message
97  SMEM_ACCESS, // scalar-memory read & write
98  EXP_GPR_LOCK, // export holding on its data src
99  GDS_GPR_LOCK, // GDS holding on its data and addr src
100  EXP_POS_ACCESS, // write to export position
101  EXP_PARAM_ACCESS, // write to export parameter
102  VMW_GPR_LOCK, // vector-memory write holding on its data src
103  EXP_LDS_ACCESS, // read by ldsdir counting as export
104  NUM_WAIT_EVENTS,
105 };
106 
107 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
108  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
109  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
110  (1 << SQ_MESSAGE),
111  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
112  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
113  (1 << VMEM_WRITE_ACCESS)};
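// Note: WaitEventMaskForInst is indexed by InstCounterType, so the four
// initializers above correspond, in order, to VM_CNT, LGKM_CNT, EXP_CNT and
// VS_CNT.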
114 
115 // The mapping is:
116 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
117 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
118 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
119 // We reserve a fixed number of VGPR slots in the scoring tables for
120 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
121 enum RegisterMapping {
122  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
123  AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
124  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
125  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
126  EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
127  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
128 };
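// Illustrative examples of the resulting score-table slots (derived from the
// constants above and getRegInterval() below, assuming AGPRs share the VGPR
// encoding range): ArchVGPR v5 -> slot 5, AGPR a3 -> slot AGPR_OFFSET + 3 = 259,
// the artificial LDS slot -> SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512, and
// SGPR s7 -> NUM_ALL_VGPRS + 7 = 520.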
129 
130 // Enumerate different types of result-returning VMEM operations. Although
131 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
132 // s_waitcnt only instructions of the same VmemType are guaranteed to write
133 // their results in order -- so there is no need to insert an s_waitcnt between
134 // two instructions of the same type that write the same vgpr.
135 enum VmemType {
136  // BUF instructions and MIMG instructions without a sampler.
137  VMEM_NOSAMPLER,
138  // MIMG instructions with a sampler.
139  VMEM_SAMPLER,
140  // BVH instructions
141  VMEM_BVH
142 };
143 
144 VmemType getVmemType(const MachineInstr &Inst) {
145  assert(SIInstrInfo::isVMEM(Inst));
146  if (!SIInstrInfo::isMIMG(Inst))
147  return VMEM_NOSAMPLER;
148  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
149  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
150  AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
151  return BaseInfo->BVH ? VMEM_BVH
152  : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
153 }
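// Illustrative classification (assumed opcodes): BUFFER_LOAD_DWORD and
// IMAGE_LOAD (MIMG without a sampler) are VMEM_NOSAMPLER, IMAGE_SAMPLE is
// VMEM_SAMPLER, and IMAGE_BVH_INTERSECT_RAY is VMEM_BVH.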
154 
155 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
156  switch (T) {
157  case VM_CNT:
158  Wait.VmCnt = std::min(Wait.VmCnt, Count);
159  break;
160  case EXP_CNT:
161  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
162  break;
163  case LGKM_CNT:
164  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
165  break;
166  case VS_CNT:
167  Wait.VsCnt = std::min(Wait.VsCnt, Count);
168  break;
169  default:
170  llvm_unreachable("bad InstCounterType");
171  }
172 }
173 
174 // This object maintains the current score brackets of each wait counter, and
175 // a per-register scoreboard for each wait counter.
176 //
177 // We also maintain the latest score for every event type that can change the
178 // waitcnt in order to know whether there are multiple types of events within
179 // the brackets. When multiple event types are pending in a bracket, the wait
180 // count may be decremented out of order, so we need to emit an "s_waitcnt 0"
181 // before the use.
182 class WaitcntBrackets {
183 public:
184  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
185  RegisterEncoding Encoding)
186  : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
187 
188  unsigned getWaitCountMax(InstCounterType T) const {
189  switch (T) {
190  case VM_CNT:
191  return Limits.VmcntMax;
192  case LGKM_CNT:
193  return Limits.LgkmcntMax;
194  case EXP_CNT:
195  return Limits.ExpcntMax;
196  case VS_CNT:
197  return Limits.VscntMax;
198  default:
199  break;
200  }
201  return 0;
202  }
203 
204  unsigned getScoreLB(InstCounterType T) const {
205  assert(T < NUM_INST_CNTS);
206  return ScoreLBs[T];
207  }
208 
209  unsigned getScoreUB(InstCounterType T) const {
210  assert(T < NUM_INST_CNTS);
211  return ScoreUBs[T];
212  }
213 
214  // Mapping from event to counter.
215  InstCounterType eventCounter(WaitEventType E) {
216  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
217  return VM_CNT;
218  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
219  return LGKM_CNT;
220  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
221  return VS_CNT;
222  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
223  return EXP_CNT;
224  }
225 
226  unsigned getRegScore(int GprNo, InstCounterType T) {
227  if (GprNo < NUM_ALL_VGPRS) {
228  return VgprScores[T][GprNo];
229  }
230  assert(T == LGKM_CNT);
231  return SgprScores[GprNo - NUM_ALL_VGPRS];
232  }
233 
234  bool merge(const WaitcntBrackets &Other);
235 
236  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
237  const MachineRegisterInfo *MRI,
238  const SIRegisterInfo *TRI, unsigned OpNo) const;
239 
240  bool counterOutOfOrder(InstCounterType T) const;
241  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
242  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
243  void determineWait(InstCounterType T, unsigned ScoreToWait,
244  AMDGPU::Waitcnt &Wait) const;
245  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
246  void applyWaitcnt(InstCounterType T, unsigned Count);
247  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
248  const MachineRegisterInfo *MRI, WaitEventType E,
249  MachineInstr &MI);
250 
251  bool hasPending() const { return PendingEvents != 0; }
252  bool hasPendingEvent(WaitEventType E) const {
253  return PendingEvents & (1 << E);
254  }
255 
256  bool hasMixedPendingEvents(InstCounterType T) const {
257  unsigned Events = PendingEvents & WaitEventMaskForInst[T];
258  // Return true if more than one bit is set in Events.
259  return Events & (Events - 1);
260  }
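  // Example of the bit trick above: Events == 0b0110 gives
  // Events & (Events - 1) == 0b0100 (non-zero, i.e. mixed event types), while a
  // single pending event, Events == 0b0100, yields 0.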
261 
262  bool hasPendingFlat() const {
263  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
264  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
265  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
266  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
267  }
268 
269  void setPendingFlat() {
270  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
271  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
272  }
273 
274  // Return true if there might be pending writes to the specified vgpr by VMEM
275  // instructions with types different from V.
276  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
277  assert(GprNo < NUM_ALL_VGPRS);
278  return VgprVmemTypes[GprNo] & ~(1 << V);
279  }
280 
281  void clearVgprVmemTypes(int GprNo) {
282  assert(GprNo < NUM_ALL_VGPRS);
283  VgprVmemTypes[GprNo] = 0;
284  }
285 
286  void print(raw_ostream &);
287  void dump() { print(dbgs()); }
288 
289 private:
290  struct MergeInfo {
291  unsigned OldLB;
292  unsigned OtherLB;
293  unsigned MyShift;
294  unsigned OtherShift;
295  };
296  static bool mergeScore(const MergeInfo &M, unsigned &Score,
297  unsigned OtherScore);
298 
299  void setScoreLB(InstCounterType T, unsigned Val) {
300  assert(T < NUM_INST_CNTS);
301  ScoreLBs[T] = Val;
302  }
303 
304  void setScoreUB(InstCounterType T, unsigned Val) {
305  assert(T < NUM_INST_CNTS);
306  ScoreUBs[T] = Val;
307  if (T == EXP_CNT) {
308  unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
309  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
310  ScoreLBs[T] = UB;
311  }
312  }
313 
314  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
315  if (GprNo < NUM_ALL_VGPRS) {
316  VgprUB = std::max(VgprUB, GprNo);
317  VgprScores[T][GprNo] = Val;
318  } else {
319  assert(T == LGKM_CNT);
320  SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
321  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
322  }
323  }
324 
325  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
326  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
327  unsigned OpNo, unsigned Val);
328 
329  const GCNSubtarget *ST = nullptr;
330  HardwareLimits Limits = {};
331  RegisterEncoding Encoding = {};
332  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
333  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
334  unsigned PendingEvents = 0;
335  // Remember the last flat memory operation.
336  unsigned LastFlat[NUM_INST_CNTS] = {0};
337  // wait_cnt scores for every vgpr.
338  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
339  int VgprUB = -1;
340  int SgprUB = -1;
341  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
342  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
343  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
344  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
345  // write to each vgpr.
346  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
347 };
348 
349 class SIInsertWaitcnts : public MachineFunctionPass {
350 private:
351  const GCNSubtarget *ST = nullptr;
352  const SIInstrInfo *TII = nullptr;
353  const SIRegisterInfo *TRI = nullptr;
354  const MachineRegisterInfo *MRI = nullptr;
355  AMDGPU::IsaVersion IV;
356 
357  DenseSet<MachineInstr *> TrackedWaitcntSet;
358  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
359  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
360  MachineLoopInfo *MLI;
361  MachinePostDominatorTree *PDT;
362 
363  struct BlockInfo {
364  MachineBasicBlock *MBB;
365  std::unique_ptr<WaitcntBrackets> Incoming;
366  bool Dirty = true;
367 
368  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
369  };
370 
371  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
372 
373  // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
374  // because of amdgpu-waitcnt-forcezero flag
375  bool ForceEmitZeroWaitcnts;
376  bool ForceEmitWaitcnt[NUM_INST_CNTS];
377 
378 public:
379  static char ID;
380 
381  SIInsertWaitcnts() : MachineFunctionPass(ID) {
382  (void)ForceExpCounter;
383  (void)ForceLgkmCounter;
384  (void)ForceVMCounter;
385  }
386 
387  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
388  bool isPreheaderToFlush(MachineBasicBlock &MBB,
389  WaitcntBrackets &ScoreBrackets);
390  bool runOnMachineFunction(MachineFunction &MF) override;
391 
392  StringRef getPassName() const override {
393  return "SI insert wait instructions";
394  }
395 
396  void getAnalysisUsage(AnalysisUsage &AU) const override {
397  AU.setPreservesCFG();
398  AU.addRequired<MachineLoopInfo>();
399  AU.addRequired<MachinePostDominatorTree>();
400  MachineFunctionPass::getAnalysisUsage(AU);
401  }
402 
403  bool isForceEmitWaitcnt() const {
404  for (auto T : inst_counter_types())
405  if (ForceEmitWaitcnt[T])
406  return true;
407  return false;
408  }
409 
410  void setForceEmitWaitcnt() {
411 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
412 // For debug builds, get the debug counter info and adjust if need be
413 #ifndef NDEBUG
414  if (DebugCounter::isCounterSet(ForceExpCounter) &&
415  DebugCounter::shouldExecute(ForceExpCounter)) {
416  ForceEmitWaitcnt[EXP_CNT] = true;
417  } else {
418  ForceEmitWaitcnt[EXP_CNT] = false;
419  }
420 
421  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
422  DebugCounter::shouldExecute(ForceLgkmCounter)) {
423  ForceEmitWaitcnt[LGKM_CNT] = true;
424  } else {
425  ForceEmitWaitcnt[LGKM_CNT] = false;
426  }
427 
428  if (DebugCounter::isCounterSet(ForceVMCounter) &&
429  DebugCounter::shouldExecute(ForceVMCounter)) {
430  ForceEmitWaitcnt[VM_CNT] = true;
431  } else {
432  ForceEmitWaitcnt[VM_CNT] = false;
433  }
434 #endif // NDEBUG
435  }
436 
437  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
438  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
439  bool generateWaitcntInstBefore(MachineInstr &MI,
440  WaitcntBrackets &ScoreBrackets,
441  MachineInstr *OldWaitcntInstr,
442  bool FlushVmCnt);
443  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
444  WaitcntBrackets &ScoreBrackets,
445  MachineInstr *OldWaitcntInstr);
446  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
447  MachineBasicBlock::instr_iterator It,
448  MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
449  MachineInstr *OldWaitcntInstr);
450  void updateEventWaitcntAfter(MachineInstr &Inst,
451  WaitcntBrackets *ScoreBrackets);
452  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
453  WaitcntBrackets &ScoreBrackets);
454  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
455  MachineInstr &OldWaitcntInstr,
456  AMDGPU::Waitcnt &Wait,
457  MachineBasicBlock::instr_iterator It);
458 };
459 
460 } // end anonymous namespace
461 
462 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
463  const SIInstrInfo *TII,
464  const MachineRegisterInfo *MRI,
465  const SIRegisterInfo *TRI,
466  unsigned OpNo) const {
467  const MachineOperand &Op = MI->getOperand(OpNo);
468  if (!TRI->isInAllocatableClass(Op.getReg()))
469  return {-1, -1};
470 
471  // A use via a PW (partial-write) operand does not need a waitcnt.
472  // A partial write is not a WAW.
473  assert(!Op.getSubReg() || !Op.isUndef());
474 
475  RegInterval Result;
476 
477  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
478 
479  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
480  assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
481  Result.first = Reg - Encoding.VGPR0;
482  if (TRI->isAGPR(*MRI, Op.getReg()))
483  Result.first += AGPR_OFFSET;
484  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
485  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
486  assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
487  Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
488  assert(Result.first >= NUM_ALL_VGPRS &&
489  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
490  }
491  // TODO: Handle TTMP
492  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
493  else
494  return {-1, -1};
495 
496  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
497  unsigned Size = TRI->getRegSizeInBits(*RC);
498  Result.second = Result.first + ((Size + 16) / 32);
499 
500  return Result;
501 }
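// Worked example: for a 64-bit VGPR operand whose low register maps to slot 4,
// Size is 64, so the interval is [4, 4 + (64 + 16) / 32) == [4, 6), covering
// two adjacent slots; 16-bit and 32-bit operands yield a single-slot interval.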
502 
503 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
504  const SIInstrInfo *TII,
505  const SIRegisterInfo *TRI,
506  const MachineRegisterInfo *MRI, unsigned OpNo,
507  unsigned Val) {
508  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
509  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
510  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
511  setRegScore(RegNo, EXP_CNT, Val);
512  }
513 }
514 
515 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS they
516 // write can be accessed. A load from LDS to VMEM does not need a wait.
517 static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
518  return SIInstrInfo::isVALU(MI) &&
519  (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
520  MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
521 }
522 
523 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
524  const SIRegisterInfo *TRI,
525  const MachineRegisterInfo *MRI,
526  WaitEventType E, MachineInstr &Inst) {
527  InstCounterType T = eventCounter(E);
528  unsigned CurrScore = getScoreUB(T) + 1;
529  if (CurrScore == 0)
530  report_fatal_error("InsertWaitcnt score wraparound");
531  // PendingEvents and ScoreUB need to be updated regardless of whether this
532  // event changes the score of a register or not, for example vm_cnt for a
533  // buffer store or lgkm_cnt for a send-message.
534  PendingEvents |= 1 << E;
535  setScoreUB(T, CurrScore);
536 
537  if (T == EXP_CNT) {
538  // Put score on the source vgprs. If this is a store, just use those
539  // specific register(s).
540  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
541  int AddrOpIdx =
542  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
543  // All GDS operations must protect their address register (same as
544  // export.)
545  if (AddrOpIdx != -1) {
546  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
547  }
548 
549  if (Inst.mayStore()) {
550  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
551  AMDGPU::OpName::data0) != -1) {
552  setExpScore(
553  &Inst, TII, TRI, MRI,
554  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
555  CurrScore);
556  }
557  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
558  AMDGPU::OpName::data1) != -1) {
559  setExpScore(&Inst, TII, TRI, MRI,
560  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
561  AMDGPU::OpName::data1),
562  CurrScore);
563  }
564  } else if (SIInstrInfo::isAtomicRet(Inst) &&
565  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
566  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
567  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
568  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
569  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
570  Inst.getOpcode() != AMDGPU::DS_APPEND &&
571  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
572  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
573  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
574  const MachineOperand &Op = Inst.getOperand(I);
575  if (Op.isReg() && !Op.isDef() &&
576  TRI->isVectorRegister(*MRI, Op.getReg())) {
577  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
578  }
579  }
580  }
581  } else if (TII->isFLAT(Inst)) {
582  if (Inst.mayStore()) {
583  setExpScore(
584  &Inst, TII, TRI, MRI,
585  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
586  CurrScore);
587  } else if (SIInstrInfo::isAtomicRet(Inst)) {
588  setExpScore(
589  &Inst, TII, TRI, MRI,
590  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
591  CurrScore);
592  }
593  } else if (TII->isMIMG(Inst)) {
594  if (Inst.mayStore()) {
595  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
596  } else if (SIInstrInfo::isAtomicRet(Inst)) {
597  setExpScore(
598  &Inst, TII, TRI, MRI,
599  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
600  CurrScore);
601  }
602  } else if (TII->isMTBUF(Inst)) {
603  if (Inst.mayStore()) {
604  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
605  }
606  } else if (TII->isMUBUF(Inst)) {
607  if (Inst.mayStore()) {
608  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
609  } else if (SIInstrInfo::isAtomicRet(Inst)) {
610  setExpScore(
611  &Inst, TII, TRI, MRI,
612  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
613  CurrScore);
614  }
615  } else if (TII->isLDSDIR(Inst)) {
616  // LDSDIR instructions attach the score to the destination.
617  setExpScore(
618  &Inst, TII, TRI, MRI,
619  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
620  CurrScore);
621  } else {
622  if (TII->isEXP(Inst)) {
623  // For export the destination registers are really temps that
624  // can be used as the actual source after export patching, so
625  // we need to treat them like sources and set the EXP_CNT
626  // score.
627  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
628  MachineOperand &DefMO = Inst.getOperand(I);
629  if (DefMO.isReg() && DefMO.isDef() &&
630  TRI->isVGPR(*MRI, DefMO.getReg())) {
631  setRegScore(
633  EXP_CNT, CurrScore);
634  }
635  }
636  }
637  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
638  MachineOperand &MO = Inst.getOperand(I);
639  if (MO.isReg() && !MO.isDef() &&
640  TRI->isVectorRegister(*MRI, MO.getReg())) {
641  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
642  }
643  }
644  }
645 #if 0 // TODO: check if this is handled by MUBUF code above.
646  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
647  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
648  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
649  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
650  unsigned OpNo;//TODO: find the OpNo for this operand;
651  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
652  for (int RegNo = Interval.first; RegNo < Interval.second;
653  ++RegNo) {
654  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
655  }
656 #endif
657  } else {
658  // Match the score to the destination registers.
659  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
660  auto &Op = Inst.getOperand(I);
661  if (!Op.isReg() || !Op.isDef())
662  continue;
663  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
664  if (T == VM_CNT) {
665  if (Interval.first >= NUM_ALL_VGPRS)
666  continue;
667  if (SIInstrInfo::isVMEM(Inst)) {
668  VmemType V = getVmemType(Inst);
669  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
670  VgprVmemTypes[RegNo] |= 1 << V;
671  }
672  }
673  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
674  setRegScore(RegNo, T, CurrScore);
675  }
676  }
677  if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
678  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
679  }
680  }
681 }
682 
683 void WaitcntBrackets::print(raw_ostream &OS) {
684  OS << '\n';
685  for (auto T : inst_counter_types()) {
686  unsigned LB = getScoreLB(T);
687  unsigned UB = getScoreUB(T);
688 
689  switch (T) {
690  case VM_CNT:
691  OS << " VM_CNT(" << UB - LB << "): ";
692  break;
693  case LGKM_CNT:
694  OS << " LGKM_CNT(" << UB - LB << "): ";
695  break;
696  case EXP_CNT:
697  OS << " EXP_CNT(" << UB - LB << "): ";
698  break;
699  case VS_CNT:
700  OS << " VS_CNT(" << UB - LB << "): ";
701  break;
702  default:
703  OS << " UNKNOWN(" << UB - LB << "): ";
704  break;
705  }
706 
707  if (LB < UB) {
708  // Print vgpr scores.
709  for (int J = 0; J <= VgprUB; J++) {
710  unsigned RegScore = getRegScore(J, T);
711  if (RegScore <= LB)
712  continue;
713  unsigned RelScore = RegScore - LB - 1;
714  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
715  OS << RelScore << ":v" << J << " ";
716  } else {
717  OS << RelScore << ":ds ";
718  }
719  }
720  // Also need to print sgpr scores for lgkm_cnt.
721  if (T == LGKM_CNT) {
722  for (int J = 0; J <= SgprUB; J++) {
723  unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
724  if (RegScore <= LB)
725  continue;
726  unsigned RelScore = RegScore - LB - 1;
727  OS << RelScore << ":s" << J << " ";
728  }
729  }
730  }
731  OS << '\n';
732  }
733  OS << '\n';
734 }
735 
736 /// Simplify the waitcnt, in the sense of removing counts that are already
737 /// guaranteed to be satisfied by the number of outstanding events.
738 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
739  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
740  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
741  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
742  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
743 }
744 
745 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
746  unsigned &Count) const {
747  const unsigned LB = getScoreLB(T);
748  const unsigned UB = getScoreUB(T);
749 
750  // The number of outstanding events for this type, T, can be calculated
751  // as (UB - LB). If the current Count is greater than or equal to the number
752  // of outstanding events, then the wait for this counter is redundant.
753  if (Count >= UB - LB)
754  Count = ~0u;
755 }
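// Worked example: with three outstanding VMEM operations (UB - LB == 3), a
// requested vmcnt(3) or looser is already satisfied and is dropped by setting
// the count to ~0u, while vmcnt(2) is kept because the oldest operation would
// still have to retire.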
756 
757 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
758  AMDGPU::Waitcnt &Wait) const {
759  // If the score of src_operand falls within the bracket, we need an
760  // s_waitcnt instruction.
761  const unsigned LB = getScoreLB(T);
762  const unsigned UB = getScoreUB(T);
763  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
764  if ((T == VM_CNT || T == LGKM_CNT) &&
765  hasPendingFlat() &&
766  !ST->hasFlatLgkmVMemCountInOrder()) {
767  // If there is a pending FLAT operation, and this is a VMem or LGKM
768  // waitcnt and the target can report early completion, then we need
769  // to force a waitcnt 0.
770  addWait(Wait, T, 0);
771  } else if (counterOutOfOrder(T)) {
772  // The counter can get decremented out of order when there are multiple
773  // event types in the bracket, so emit an s_waitcnt with a conservative
774  // value of 0 for this counter.
775  addWait(Wait, T, 0);
776  } else {
777  // If a counter has been maxed out avoid overflow by waiting for
778  // MAX(CounterType) - 1 instead.
779  unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
780  addWait(Wait, T, NeededWait);
781  }
782  }
783 }
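// Worked example: with LB == 4 and UB == 10 for VM_CNT, a score of 7 lies in
// the bracket; absent pending FLAT ops or mixed event types, the needed wait
// is UB - ScoreToWait == 3, i.e. "at most 3 newer operations may remain
// outstanding", clamped to getWaitCountMax(VM_CNT) - 1.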
784 
785 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
786  applyWaitcnt(VM_CNT, Wait.VmCnt);
787  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
788  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
789  applyWaitcnt(VS_CNT, Wait.VsCnt);
790 }
791 
792 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
793  const unsigned UB = getScoreUB(T);
794  if (Count >= UB)
795  return;
796  if (Count != 0) {
797  if (counterOutOfOrder(T))
798  return;
799  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
800  } else {
801  setScoreLB(T, UB);
802  PendingEvents &= ~WaitEventMaskForInst[T];
803  }
804 }
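// Worked example: applying vmcnt(2) when UB == 10 raises the lower bound to
// UB - Count == 8 (at most two operations may remain outstanding), provided
// the counter is not out of order; applying a count of 0 empties the bracket
// and clears the pending events for that counter.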
805 
806 // Where there are multiple types of event in the bracket of a counter,
807 // the decrement may go out of order.
808 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
809  // Scalar memory read always can go out of order.
810  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
811  return true;
812  return hasMixedPendingEvents(T);
813 }
814 
815 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
816  false)
817 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
818 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
819 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
820  false)
821 
822 char SIInsertWaitcnts::ID = 0;
823 
824 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
825 
826 FunctionPass *llvm::createSIInsertWaitcntsPass() {
827  return new SIInsertWaitcnts();
828 }
829 
830 /// Combine consecutive waitcnt instructions that precede \p It and follow
831 /// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
832 /// by previous passes. Currently this pass conservatively assumes that these
833 /// preexisting waitcnts are required for correctness.
834 bool SIInsertWaitcnts::applyPreexistingWaitcnt(
835  WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
836  AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
837  bool Modified = false;
838  MachineInstr *WaitcntInstr = nullptr;
839  MachineInstr *WaitcntVsCntInstr = nullptr;
840 
841  for (auto &II :
842  make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
843  if (II.isMetaInstruction())
844  continue;
845 
846  if (II.getOpcode() == AMDGPU::S_WAITCNT) {
847  // Conservatively update required wait if this waitcnt was added in an
848  // earlier pass. In this case it will not exist in the tracked waitcnt
849  // set.
850  if (!TrackedWaitcntSet.count(&II)) {
851  unsigned IEnc = II.getOperand(0).getImm();
852  AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
853  Wait = Wait.combined(OldWait);
854  }
855 
856  // Merge consecutive waitcnt of the same type by erasing multiples.
857  if (!WaitcntInstr) {
858  WaitcntInstr = &II;
859  } else {
860  II.eraseFromParent();
861  Modified = true;
862  }
863 
864  } else {
865  assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
866  assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
867  if (!TrackedWaitcntSet.count(&II)) {
868  unsigned OldVSCnt =
869  TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
870  Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
871  }
872 
873  if (!WaitcntVsCntInstr) {
874  WaitcntVsCntInstr = &II;
875  } else {
876  II.eraseFromParent();
877  Modified = true;
878  }
879  }
880  }
881 
882  // Update the encoding of the merged waitcnt with the required wait.
883  if (WaitcntInstr) {
884  if (Wait.hasWaitExceptVsCnt()) {
885  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
886  unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
887  if (OldEnc != NewEnc) {
888  WaitcntInstr->getOperand(0).setImm(NewEnc);
889  Modified = true;
890  }
891  ScoreBrackets.applyWaitcnt(Wait);
892  Wait.VmCnt = ~0u;
893  Wait.LgkmCnt = ~0u;
894  Wait.ExpCnt = ~0u;
895 
896  LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
897  ? dbgs() << "applyPreexistingWaitcnt\n"
898  << "New Instr at block end: " << *WaitcntInstr
899  << '\n'
900  : dbgs() << "applyPreexistingWaitcnt\n"
901  << "Old Instr: " << *It
902  << "New Instr: " << *WaitcntInstr << '\n');
903 
904  } else {
905  WaitcntInstr->eraseFromParent();
906  Modified = true;
907  }
908  }
909 
910  if (WaitcntVsCntInstr) {
911  if (Wait.hasWaitVsCnt()) {
912  assert(ST->hasVscnt());
913  unsigned OldVSCnt =
914  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
915  ->getImm();
916  if (Wait.VsCnt != OldVSCnt) {
917  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
918  ->setImm(Wait.VsCnt);
919  Modified = true;
920  }
921  ScoreBrackets.applyWaitcnt(Wait);
922  Wait.VsCnt = ~0u;
923 
924  LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
925  ? dbgs() << "applyPreexistingWaitcnt\n"
926  << "New Instr at block end: "
927  << *WaitcntVsCntInstr << '\n'
928  : dbgs() << "applyPreexistingWaitcnt\n"
929  << "Old Instr: " << *It
930  << "New Instr: " << *WaitcntVsCntInstr << '\n');
931  } else {
932  WaitcntVsCntInstr->eraseFromParent();
933  Modified = true;
934  }
935  }
936 
937  return Modified;
938 }
939 
940 static bool readsVCCZ(const MachineInstr &MI) {
941  unsigned Opc = MI.getOpcode();
942  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
943  !MI.getOperand(1).isUndef();
944 }
945 
946 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
947 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
948  // Currently all conventions wait, but this may not always be the case.
949  //
950  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
951  // sense to omit the wait and do it in the caller.
952  return true;
953 }
954 
955 /// \returns true if the callee is expected to wait for any outstanding
956 /// operations before returning.
957 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
958  return true;
959 }
960 
961 /// Generate s_waitcnt instruction to be placed before cur_Inst.
962 /// Instructions of a given type are returned in order,
963 /// but instructions of different types can complete out of order.
964 /// We rely on this in-order completion
965 /// and simply assign a score to the memory access instructions.
966 /// We keep track of the active "score bracket" to determine
967 /// whether an access to a memory-read result requires an s_waitcnt,
968 /// and if so, what the value of each counter should be.
969 /// The "score bracket" is bounded by the lower-bound and upper-bound
970 /// scores (*_score_LB and *_score_ub respectively).
971 /// If FlushVmCnt is true, we also want to generate an s_waitcnt to
972 /// flush the vmcnt counter here.
973 bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
974  WaitcntBrackets &ScoreBrackets,
975  MachineInstr *OldWaitcntInstr,
976  bool FlushVmCnt) {
977  setForceEmitWaitcnt();
978 
979  if (MI.isMetaInstruction())
980  return false;
981 
982  AMDGPU::Waitcnt Wait;
983 
984  // FIXME: This should have already been handled by the memory legalizer.
985  // Removing this currently doesn't affect any lit tests, but we need to
986  // verify that nothing was relying on this. The number of buffer invalidates
987  // being handled here should not be expanded.
988  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
989  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
990  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
991  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
992  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
993  Wait.VmCnt = 0;
994  }
995 
996  // All waits must be resolved at call return.
997  // NOTE: this could be improved with knowledge of all call sites or
998  // with knowledge of the called routines.
999  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1000  MI.getOpcode() == AMDGPU::SI_RETURN ||
1001  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1002  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1003  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1004  }
1005  // Resolve vm waits before gs-done.
1006  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1007  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1008  ST->hasLegacyGeometry() &&
1009  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1010  AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1011  Wait.VmCnt = 0;
1012  }
1013 #if 0 // TODO: the following blocks of logic when we have fence.
1014  else if (MI.getOpcode() == SC_FENCE) {
1015  const unsigned int group_size =
1016  context->shader_info->GetMaxThreadGroupSize();
1017  // group_size == 0 means thread group size is unknown at compile time
1018  const bool group_is_multi_wave =
1019  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1020  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1021 
1022  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1023  SCRegType src_type = Inst->GetSrcType(i);
1024  switch (src_type) {
1025  case SCMEM_LDS:
1026  if (group_is_multi_wave ||
1027  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1028  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1029  ScoreBrackets->getScoreUB(LGKM_CNT));
1030  // LDS may have to wait for VM_CNT after buffer load to LDS
1031  if (target_info->HasBufferLoadToLDS()) {
1032  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1033  ScoreBrackets->getScoreUB(VM_CNT));
1034  }
1035  }
1036  break;
1037 
1038  case SCMEM_GDS:
1039  if (group_is_multi_wave || fence_is_global) {
1040  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1041  ScoreBrackets->getScoreUB(EXP_CNT));
1042  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1043  ScoreBrackets->getScoreUB(LGKM_CNT));
1044  }
1045  break;
1046 
1047  case SCMEM_UAV:
1048  case SCMEM_TFBUF:
1049  case SCMEM_RING:
1050  case SCMEM_SCATTER:
1051  if (group_is_multi_wave || fence_is_global) {
1052  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1053  ScoreBrackets->getScoreUB(EXP_CNT));
1054  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1055  ScoreBrackets->getScoreUB(VM_CNT));
1056  }
1057  break;
1058 
1059  case SCMEM_SCRATCH:
1060  default:
1061  break;
1062  }
1063  }
1064  }
1065 #endif
1066 
1067  // Export & GDS instructions do not read the EXEC mask until after the export
1068  // is granted (which can occur well after the instruction is issued).
1069  // The shader program must flush all EXP operations on the export-count
1070  // before overwriting the EXEC mask.
1071  else {
1072  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1073  // Export and GDS are tracked individually, either may trigger a waitcnt
1074  // for EXEC.
1075  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1076  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1077  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1078  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1079  Wait.ExpCnt = 0;
1080  }
1081  }
1082 
1083  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1084  // The function is going to insert a wait on everything in its prolog.
1085  // We still need to be careful if the call target is a load (e.g. a GOT
1086  // load). We also need to check WAW dependency with saved PC.
1087  Wait = AMDGPU::Waitcnt();
1088 
1089  int CallAddrOpIdx =
1090  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1091 
1092  if (MI.getOperand(CallAddrOpIdx).isReg()) {
1093  RegInterval CallAddrOpInterval =
1094  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1095 
1096  for (int RegNo = CallAddrOpInterval.first;
1097  RegNo < CallAddrOpInterval.second; ++RegNo)
1098  ScoreBrackets.determineWait(
1099  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1100 
1101  int RtnAddrOpIdx =
1102  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1103  if (RtnAddrOpIdx != -1) {
1104  RegInterval RtnAddrOpInterval =
1105  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1106 
1107  for (int RegNo = RtnAddrOpInterval.first;
1108  RegNo < RtnAddrOpInterval.second; ++RegNo)
1109  ScoreBrackets.determineWait(
1110  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1111  }
1112  }
1113  } else {
1114  // FIXME: Should not be relying on memoperands.
1115  // Look at the source operands of every instruction to see if
1116  // any of them results from a previous memory operation that affects
1117  // its current usage. If so, an s_waitcnt instruction needs to be
1118  // emitted.
1119  // If the source operand was defined by a load, add the s_waitcnt
1120  // instruction.
1121  //
1122  // Two cases are handled for destination operands:
1123  // 1) If the destination operand was defined by a load, add the s_waitcnt
1124  // instruction to guarantee the right WAW order.
1125  // 2) If a destination operand was used by a recent export/store
1126  //    instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
1127  for (const MachineMemOperand *Memop : MI.memoperands()) {
1128  const Value *Ptr = Memop->getValue();
1129  if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1130  addWait(Wait, LGKM_CNT, 0);
1131  if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1132  SLoadAddresses.erase(Ptr);
1133  }
1134  unsigned AS = Memop->getAddrSpace();
1135  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1136  continue;
1137  // No need to wait before load from VMEM to LDS.
1138  if (mayWriteLDSThroughDMA(MI))
1139  continue;
1140  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1141  // VM_CNT is only relevant to vgpr or LDS.
1142  ScoreBrackets.determineWait(
1143  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1144  if (Memop->isStore()) {
1145  ScoreBrackets.determineWait(
1146  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1147  }
1148  }
1149 
1150  // Loop over use and def operands.
1151  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1152  MachineOperand &Op = MI.getOperand(I);
1153  if (!Op.isReg())
1154  continue;
1155  RegInterval Interval =
1156  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1157 
1158  const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1159  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1160  if (IsVGPR) {
1161  // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1162  // previous write and this write are the same type of VMEM
1163  // instruction, in which case they're guaranteed to write their
1164  // results in order anyway.
1165  if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1166  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1167  getVmemType(MI))) {
1168  ScoreBrackets.determineWait(
1169  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1170  ScoreBrackets.clearVgprVmemTypes(RegNo);
1171  }
1172  if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1173  ScoreBrackets.determineWait(
1174  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1175  }
1176  }
1177  ScoreBrackets.determineWait(
1178  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1179  }
1180  }
1181  }
1182  }
1183 
1184  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1185  // occurs before the instruction. Doing it here prevents any additional
1186  // S_WAITCNTs from being emitted if the instruction was marked as
1187  // requiring a WAITCNT beforehand.
1188  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1189  !ST->hasAutoWaitcntBeforeBarrier()) {
1190  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1191  }
1192 
1193  // TODO: Remove this work-around, enable the assert for Bug 457939
1194  // after fixing the scheduler. Also, the Shader Compiler code is
1195  // independent of target.
1196  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1197  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1198  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1199  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1200  Wait.LgkmCnt = 0;
1201  }
1202  }
1203 
1204  // Verify that the wait is actually needed.
1205  ScoreBrackets.simplifyWaitcnt(Wait);
1206 
1207  if (ForceEmitZeroWaitcnts)
1208  Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1209 
1210  if (ForceEmitWaitcnt[VM_CNT])
1211  Wait.VmCnt = 0;
1212  if (ForceEmitWaitcnt[EXP_CNT])
1213  Wait.ExpCnt = 0;
1214  if (ForceEmitWaitcnt[LGKM_CNT])
1215  Wait.LgkmCnt = 0;
1216  if (ForceEmitWaitcnt[VS_CNT])
1217  Wait.VsCnt = 0;
1218 
1219  if (FlushVmCnt) {
1220  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
1221  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
1222  if (UB - LB != 0)
1223  Wait.VmCnt = 0;
1224  }
1225 
1226  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1227  OldWaitcntInstr);
1228 }
1229 
1230 // Add a waitcnt to flush the vmcnt counter at the end of the given block if
1231 // needed.
1232 bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1233  WaitcntBrackets &ScoreBrackets,
1234  MachineInstr *OldWaitcntInstr) {
1235  AMDGPU::Waitcnt Wait;
1236 
1237  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
1238  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
1239  if (UB - LB == 0)
1240  return false;
1241 
1242  Wait.VmCnt = 0;
1243 
1244  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1245  OldWaitcntInstr);
1246 }
1247 
1248 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1249  MachineBasicBlock::instr_iterator It,
1250  MachineBasicBlock &Block,
1251  WaitcntBrackets &ScoreBrackets,
1252  MachineInstr *OldWaitcntInstr) {
1253  bool Modified = false;
1254  const DebugLoc &DL = Block.findDebugLoc(It);
1255 
1256  if (OldWaitcntInstr)
1257  // Try to merge the required wait with preexisting waitcnt instructions.
1258  // Also erase redundant waitcnt.
1259  Modified =
1260  applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1261  else
1262  ScoreBrackets.applyWaitcnt(Wait);
1263 
1264  // ExpCnt can be merged into VINTERP.
1265  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1266  SIInstrInfo::isVINTERP(*It)) {
1267  MachineOperand *WaitExp =
1268  TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1269  if (Wait.ExpCnt < WaitExp->getImm()) {
1270  WaitExp->setImm(Wait.ExpCnt);
1271  Modified = true;
1272  }
1273  Wait.ExpCnt = ~0u;
1274 
1275  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1276  << "Update Instr: " << *It);
1277  }
1278 
1279  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1280  // instruction was modified to handle the required wait.
1281  if (Wait.hasWaitExceptVsCnt()) {
1282  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1283  auto SWaitInst =
1284  BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1285  TrackedWaitcntSet.insert(SWaitInst);
1286  Modified = true;
1287 
1288  LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1289  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1290  dbgs() << "New Instr: " << *SWaitInst << '\n');
1291  }
1292 
1293  if (Wait.hasWaitVsCnt()) {
1294  assert(ST->hasVscnt());
1295 
1296  auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1297  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1298  .addImm(Wait.VsCnt);
1299  TrackedWaitcntSet.insert(SWaitInst);
1300  Modified = true;
1301 
1302  LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1303  if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1304  dbgs() << "New Instr: " << *SWaitInst << '\n');
1305  }
1306  return Modified;
1307 }
1308 
1309 // This is a flat memory operation. Check to see if it has memory tokens other
1310 // than LDS. Other address spaces supported by flat memory operations involve
1311 // global memory.
1312 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1313  assert(TII->isFLAT(MI));
1314 
1315  // All flat instructions use the VMEM counter.
1316  assert(TII->usesVM_CNT(MI));
1317 
1318  // If there are no memory operands then conservatively assume the flat
1319  // operation may access VMEM.
1320  if (MI.memoperands_empty())
1321  return true;
1322 
1323  // See if any memory operand specifies an address space that involves VMEM.
1324  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1325  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1326  // (GDS) address space is not supported by flat operations. Therefore, simply
1327  // return true unless only the LDS address space is found.
1328  for (const MachineMemOperand *Memop : MI.memoperands()) {
1329  unsigned AS = Memop->getAddrSpace();
1331  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1332  return true;
1333  }
1334 
1335  return false;
1336 }
1337 
1338 // This is a flat memory operation. Check to see if it has memory tokens for
1339 // either LDS or FLAT.
1340 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1341  assert(TII->isFLAT(MI));
1342 
1343  // Flat instructions such as SCRATCH and GLOBAL do not use the LGKM counter.
1344  if (!TII->usesLGKM_CNT(MI))
1345  return false;
1346 
1347  // If in tgsplit mode then there can be no use of LDS.
1348  if (ST->isTgSplitEnabled())
1349  return false;
1350 
1351  // If there are no memory operands then conservatively assume the flat
1352  // operation may access LDS.
1353  if (MI.memoperands_empty())
1354  return true;
1355 
1356  // See if any memory operand specifies an address space that involves LDS.
1357  for (const MachineMemOperand *Memop : MI.memoperands()) {
1358  unsigned AS = Memop->getAddrSpace();
1359  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1360  return true;
1361  }
1362 
1363  return false;
1364 }
1365 
1366 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1367  WaitcntBrackets *ScoreBrackets) {
1368  // Now look at the instruction opcode. If it is a memory access
1369  // instruction, update the upper-bound of the appropriate counter's
1370  // bracket and the destination operand scores.
1371  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1372  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1373  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1374  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1375  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1376  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1377  } else {
1378  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1379  }
1380  } else if (TII->isFLAT(Inst)) {
1381  assert(Inst.mayLoadOrStore());
1382 
1383  int FlatASCount = 0;
1384 
1385  if (mayAccessVMEMThroughFlat(Inst)) {
1386  ++FlatASCount;
1387  if (!ST->hasVscnt())
1388  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1389  else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1390  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1391  else
1392  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1393  }
1394 
1395  if (mayAccessLDSThroughFlat(Inst)) {
1396  ++FlatASCount;
1397  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1398  }
1399 
1400  // A Flat memory operation must access at least one address space.
1401  assert(FlatASCount);
1402 
1403  // This is a flat memory operation that accesses both VMEM and LDS, so note
1404  // it - it will require that both the VM and LGKM counters be flushed to zero
1405  // if it is still pending when a VM or LGKM dependency occurs.
1406  if (FlatASCount > 1)
1407  ScoreBrackets->setPendingFlat();
1408  } else if (SIInstrInfo::isVMEM(Inst) &&
1410  if (!ST->hasVscnt())
1411  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1412  else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1413  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1414  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1415  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1416  else if (Inst.mayStore())
1417  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1418 
1419  if (ST->vmemWriteNeedsExpWaitcnt() &&
1420  (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1421  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1422  }
1423  } else if (TII->isSMRD(Inst)) {
1424  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1425  } else if (Inst.isCall()) {
1426  if (callWaitsOnFunctionReturn(Inst)) {
1427  // Act as a wait on everything
1428  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1429  } else {
1430  // May need to wait for anything.
1431  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1432  }
1433  } else if (SIInstrInfo::isLDSDIR(Inst)) {
1434  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1435  } else if (TII->isVINTERP(Inst)) {
1436  int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1437  ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1438  } else if (SIInstrInfo::isEXP(Inst)) {
1439  unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1440  if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1441  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1442  else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1443  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1444  else
1445  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1446  } else {
1447  switch (Inst.getOpcode()) {
1448  case AMDGPU::S_SENDMSG:
1449  case AMDGPU::S_SENDMSG_RTN_B32:
1450  case AMDGPU::S_SENDMSG_RTN_B64:
1451  case AMDGPU::S_SENDMSGHALT:
1452  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1453  break;
1454  case AMDGPU::S_MEMTIME:
1455  case AMDGPU::S_MEMREALTIME:
1456  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1457  break;
1458  }
1459  }
1460 }
1461 
1462 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1463  unsigned OtherScore) {
1464  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1465  unsigned OtherShifted =
1466  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1467  Score = std::max(MyShifted, OtherShifted);
1468  return OtherShifted > MyShifted;
1469 }
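// Worked example of the re-basing performed here: if this bracket has
// LB == 4 / UB == 7 (3 pending) and Other has LB == 10 / UB == 15 (5 pending),
// merge() below chooses NewUB == 4 + 5 == 9, so MyShift == 2 and OtherShift ==
// 9 - 15 (computed modulo 2^32). A register score of 6 on this side becomes 8,
// a score of 12 in Other becomes 6, and scores at or below their old lower
// bound collapse to 0; each slot then keeps the maximum of the two.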
1470 
1471 /// Merge the pending events and associated score brackets of \p Other into
1472 /// this bracket's status.
1473 ///
1474 /// Returns whether the merge resulted in a change that requires tighter waits
1475 /// (i.e. the merged brackets strictly dominate the original brackets).
1476 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1477  bool StrictDom = false;
1478 
1479  VgprUB = std::max(VgprUB, Other.VgprUB);
1480  SgprUB = std::max(SgprUB, Other.SgprUB);
1481 
1482  for (auto T : inst_counter_types()) {
1483  // Merge event flags for this counter
1484  const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1485  const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1486  if (OtherEvents & ~OldEvents)
1487  StrictDom = true;
1488  PendingEvents |= OtherEvents;
1489 
1490  // Merge scores for this counter
1491  const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1492  const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1493  const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1494  if (NewUB < ScoreLBs[T])
1495  report_fatal_error("waitcnt score overflow");
1496 
1497  MergeInfo M;
1498  M.OldLB = ScoreLBs[T];
1499  M.OtherLB = Other.ScoreLBs[T];
1500  M.MyShift = NewUB - ScoreUBs[T];
1501  M.OtherShift = NewUB - Other.ScoreUBs[T];
1502 
1503  ScoreUBs[T] = NewUB;
1504 
1505  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1506 
1507  bool RegStrictDom = false;
1508  for (int J = 0; J <= VgprUB; J++) {
1509  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1510  }
1511 
1512  if (T == VM_CNT) {
1513  for (int J = 0; J <= VgprUB; J++) {
1514  unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1515  RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1516  VgprVmemTypes[J] = NewVmemTypes;
1517  }
1518  }
1519 
1520  if (T == LGKM_CNT) {
1521  for (int J = 0; J <= SgprUB; J++) {
1522  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1523  }
1524  }
1525 
1526  if (RegStrictDom)
1527  StrictDom = true;
1528  }
1529 
1530  return StrictDom;
1531 }
1532 
1533 // Generate s_waitcnt instructions where needed.
1534 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1535  MachineBasicBlock &Block,
1536  WaitcntBrackets &ScoreBrackets) {
1537  bool Modified = false;
1538 
1539  LLVM_DEBUG({
1540  dbgs() << "*** Block" << Block.getNumber() << " ***";
1541  ScoreBrackets.dump();
1542  });
1543 
1544  // Track the correctness of vccz through this basic block. There are two
1545  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1546  // ST->partialVCCWritesUpdateVCCZ().
1547  bool VCCZCorrect = true;
1548  if (ST->hasReadVCCZBug()) {
1549  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1550  // to vcc and then issued an smem load.
1551  VCCZCorrect = false;
1552  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1553  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1554  // to vcc_lo or vcc_hi.
1555  VCCZCorrect = false;
1556  }
1557 
1558  // Walk over the instructions.
1559  MachineInstr *OldWaitcntInstr = nullptr;
1560 
1561  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1562  E = Block.instr_end();
1563  Iter != E;) {
1564  MachineInstr &Inst = *Iter;
1565 
1566  // Track pre-existing waitcnts that were added in earlier iterations or by
1567  // the memory legalizer.
1568  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1569  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1570  Inst.getOperand(0).isReg() &&
1571  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1572  if (!OldWaitcntInstr)
1573  OldWaitcntInstr = &Inst;
1574  ++Iter;
1575  continue;
1576  }
1577 
1578  bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1579  isPreheaderToFlush(Block, ScoreBrackets);
1580 
1581  // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1582  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1583  FlushVmCnt);
1584  OldWaitcntInstr = nullptr;
1585 
1586  // Restore vccz if it's not known to be correct already.
1587  bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1588 
1589  // Don't examine operands unless we need to track vccz correctness.
1590  if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1591  if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1592  Inst.definesRegister(AMDGPU::VCC_HI)) {
1593  // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1594  if (!ST->partialVCCWritesUpdateVCCZ())
1595  VCCZCorrect = false;
1596  } else if (Inst.definesRegister(AMDGPU::VCC)) {
1597  // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
1598  // the vccz bit, so when we detect that an instruction may read from a
1599  // corrupt vccz bit, we need to:
1600  // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
1601  // operations to complete.
1602  // 2. Restore the correct value of vccz by writing the current value
1603  // of vcc back to vcc.
1604  if (ST->hasReadVCCZBug() &&
1605  ScoreBrackets.getScoreLB(LGKM_CNT) <
1606  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1607  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1608  // Writes to vcc while there's an outstanding smem read may get
1609  // clobbered as soon as any read completes.
1610  VCCZCorrect = false;
1611  } else {
1612  // Writes to vcc will fix any incorrect value in vccz.
1613  VCCZCorrect = true;
1614  }
1615  }
1616  }
1617 
1618  if (TII->isSMRD(Inst)) {
1619  for (const MachineMemOperand *Memop : Inst.memoperands()) {
1620  // No need to handle invariant loads when avoiding WAR conflicts, as
1621  // there cannot be a vector store to the same memory location.
1622  if (!Memop->isInvariant()) {
1623  const Value *Ptr = Memop->getValue();
1624  SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1625  }
1626  }
1627  if (ST->hasReadVCCZBug()) {
1628  // This smem read could complete and clobber vccz at any time.
1629  VCCZCorrect = false;
1630  }
1631  }
1632 
1633  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1634 
1635 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1636  // If this instruction generates a S_SETVSKIP because it is an
1637  // indexed resource, and we are on Tahiti, then it will also force
1638  // an S_WAITCNT vmcnt(0)
1639  if (RequireCheckResourceType(Inst, context)) {
1640  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1641  ScoreBrackets->setScoreLB(VM_CNT,
1642  ScoreBrackets->getScoreUB(VM_CNT));
1643  }
1644 #endif
1645 
1646  LLVM_DEBUG({
1647  Inst.print(dbgs());
1648  ScoreBrackets.dump();
1649  });
1650 
1651  // TODO: Remove this work-around after fixing the scheduler and enable the
1652  // assert above.
1653  if (RestoreVCCZ) {
1654  // Restore the vccz bit. Any time a value is written to vcc, the vccz
1655  // bit is updated, so we can restore the bit by reading the value of
1656  // vcc and then writing it back to the register.
1657  BuildMI(Block, Inst, Inst.getDebugLoc(),
1658  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1659  TRI->getVCC())
1660  .addReg(TRI->getVCC());
1661  VCCZCorrect = true;
1662  Modified = true;
1663  }
1664 
1665  ++Iter;
1666  }
1667 
1668  if (Block.getFirstTerminator() == Block.end() &&
1669  isPreheaderToFlush(Block, ScoreBrackets))
1670  Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1671 
1672  return Modified;
1673 }
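// A condensed, hypothetical restatement of the vccz bookkeeping in
// insertWaitcntInBlock() above, with plain bools standing in for the subtarget
// queries and MachineInstr inspection. It only mirrors when the "vccz can be
// trusted" flag flips; emitting the actual restore sequence is not shown.
struct VcczTrackerSketch {
  bool Correct = true;

  void notePartialVccWrite(bool PartialWritesUpdateVccz) {
    // Up to gfx9, writes to vcc_lo/vcc_hi leave vccz stale.
    if (!PartialWritesUpdateVccz)
      Correct = false;
  }

  void noteFullVccWrite(bool HasReadVcczBug, bool SmemStillPending) {
    // A full vcc write normally refreshes vccz, unless an in-flight SMEM read
    // may clobber it afterwards (the SI/CI hardware bug described above).
    Correct = !(HasReadVcczBug && SmemStillPending);
  }

  void noteSmemRead(bool HasReadVcczBug) {
    // On affected targets the read may complete and corrupt vccz at any time.
    if (HasReadVcczBug)
      Correct = false;
  }
};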
1674 
1675 // Return true if the given machine basic block is a preheader of a loop in
1676 // which we want to flush the vmcnt counter, and false otherwise.
1677 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1678  WaitcntBrackets &ScoreBrackets) {
1679  if (PreheadersToFlush.count(&MBB))
1680  return PreheadersToFlush[&MBB];
1681 
1682  auto UpdateCache = [&](bool val) {
1683  PreheadersToFlush[&MBB] = val;
1684  return val;
1685  };
1686 
1687  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1688  if (!Succ)
1689  return UpdateCache(false);
1690 
1691  MachineLoop *Loop = MLI->getLoopFor(Succ);
1692  if (!Loop)
1693  return UpdateCache(false);
1694 
1695  if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
1696  return UpdateCache(true);
1697 
1698  return UpdateCache(false);
1699 }
1700 
1701 // Return true if it is better to flush the vmcnt counter in the preheader of
1702 // the given loop (see the sketch after this function). We currently decide to
1703 // flush in two situations:
1704 // 1. The loop contains vmem store(s) but no vmem load, and there is at least
1705 // one use of a vgpr containing a value loaded outside of the loop. (Only
1706 // on targets with no vscnt counter.)
1707 // 2. The loop contains vmem load(s) whose results are not used in the loop,
1708 // and there is at least one use of a vgpr containing a value loaded outside of the loop.
1709 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1710  WaitcntBrackets &Brackets) {
1711  bool HasVMemLoad = false;
1712  bool HasVMemStore = false;
1713  bool UsesVgprLoadedOutside = false;
1714  DenseSet<Register> VgprUse;
1715  DenseSet<Register> VgprDef;
1716 
1717  for (MachineBasicBlock *MBB : ML->blocks()) {
1718  for (MachineInstr &MI : *MBB) {
1719  if (SIInstrInfo::isVMEM(MI)) {
1720  if (MI.mayLoad())
1721  HasVMemLoad = true;
1722  if (MI.mayStore())
1723  HasVMemStore = true;
1724  }
1725  for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1726  MachineOperand &Op = MI.getOperand(I);
1727  if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1728  continue;
1729  RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1730  // Vgpr use
1731  if (Op.isUse()) {
1732  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1733  // If we find a register that is loaded inside the loop, 1. and 2.
1734  // are invalidated and we can exit.
1735  if (VgprDef.contains(RegNo))
1736  return false;
1737  VgprUse.insert(RegNo);
1738  // If at least one of Op's registers is in the score brackets, the
1739  // value is likely loaded outside of the loop.
1740  if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
1741  UsesVgprLoadedOutside = true;
1742  break;
1743  }
1744  }
1745  }
1746  // VMem load vgpr def
1747  else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
1748  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1749  // If we find a register that is loaded inside the loop, 1. and 2.
1750  // are invalidated and we can exit.
1751  if (VgprUse.contains(RegNo))
1752  return false;
1753  VgprDef.insert(RegNo);
1754  }
1755  }
1756  }
1757  }
1758  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1759  return true;
1760  return HasVMemLoad && UsesVgprLoadedOutside;
1761 }
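// Hypothetical condensation of the decision above, using the flags gathered by
// the scan; the "loaded values are not used in the loop" requirement of case 2
// is enforced by the early returns during the scan rather than repeated here.
static bool shouldFlushVmCntSketch(bool HasVscnt, bool HasVMemLoad,
                                   bool HasVMemStore,
                                   bool UsesVgprLoadedOutside) {
  // Case 1: stores only, and no separate store counter to wait on instead.
  if (!HasVscnt && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  // Case 2: loads whose results the loop itself never consumes.
  return HasVMemLoad && UsesVgprLoadedOutside;
}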
1762 
1763 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1764  ST = &MF.getSubtarget<GCNSubtarget>();
1765  TII = ST->getInstrInfo();
1766  TRI = &TII->getRegisterInfo();
1767  MRI = &MF.getRegInfo();
1768  IV = AMDGPU::getIsaVersion(ST->getCPU());
1769  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1770  MLI = &getAnalysis<MachineLoopInfo>();
1771  PDT = &getAnalysis<MachinePostDominatorTree>();
1772 
1773  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1774  for (auto T : inst_counter_types())
1775  ForceEmitWaitcnt[T] = false;
1776 
1777  HardwareLimits Limits = {};
1778  Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1779  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1780  Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1781  Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1782 
1783  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1784  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1785  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1786  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1787 
1788  RegisterEncoding Encoding = {};
1789  Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1790  Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1791  Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1792  Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1793 
1794  TrackedWaitcntSet.clear();
1795  BlockInfos.clear();
1796  bool Modified = false;
1797 
1798  if (!MFI->isEntryFunction()) {
1799  // Wait for any outstanding memory operations that the input registers may
1800  // depend on. We can't track them and it's better to do the wait after the
1801  // costly call sequence.
1802 
1803  // TODO: Could insert earlier and schedule more liberally with operations
1804  // that only use caller preserved registers.
1805  MachineBasicBlock &EntryBB = MF.front();
1806  MachineBasicBlock::iterator I = EntryBB.begin();
1807  for (MachineBasicBlock::iterator E = EntryBB.end();
1808  I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1809  ;
1810  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1811  if (ST->hasVscnt())
1812  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1813  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1814  .addImm(0);
1815 
1816  Modified = true;
1817  }
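  // For reference, a sketch of the prologue this emits for a callee on a
  // target with a separate store counter (an immediate of 0 waits on all of
  // vmcnt, expcnt and lgkmcnt):
  //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  //   s_waitcnt_vscnt null, 0x0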
1818 
1819  // Keep iterating over the blocks in reverse post order, inserting and
1820  // updating s_waitcnt where needed, until a fixed point is reached.
1821  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1822  BlockInfos.insert({MBB, BlockInfo(MBB)});
1823 
1824  std::unique_ptr<WaitcntBrackets> Brackets;
1825  bool Repeat;
1826  do {
1827  Repeat = false;
1828 
1829  for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1830  ++BII) {
1831  BlockInfo &BI = BII->second;
1832  if (!BI.Dirty)
1833  continue;
1834 
1835  if (BI.Incoming) {
1836  if (!Brackets)
1837  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1838  else
1839  *Brackets = *BI.Incoming;
1840  } else {
1841  if (!Brackets)
1842  Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1843  else
1844  *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1845  }
1846 
1847  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1848  BI.Dirty = false;
1849 
1850  if (Brackets->hasPending()) {
1851  BlockInfo *MoveBracketsToSucc = nullptr;
1852  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1853  auto SuccBII = BlockInfos.find(Succ);
1854  BlockInfo &SuccBI = SuccBII->second;
1855  if (!SuccBI.Incoming) {
1856  SuccBI.Dirty = true;
1857  if (SuccBII <= BII)
1858  Repeat = true;
1859  if (!MoveBracketsToSucc) {
1860  MoveBracketsToSucc = &SuccBI;
1861  } else {
1862  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1863  }
1864  } else if (SuccBI.Incoming->merge(*Brackets)) {
1865  SuccBI.Dirty = true;
1866  if (SuccBII <= BII)
1867  Repeat = true;
1868  }
1869  }
1870  if (MoveBracketsToSucc)
1871  MoveBracketsToSucc->Incoming = std::move(Brackets);
1872  }
1873  }
1874  } while (Repeat);
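  // Worked example (sketch) of the revisit rule above: with reverse post order
  // A, B, C and a back edge C -> A, the pending counters produced in C merge
  // into A's Incoming state; A does not come after C in the iteration order
  // (SuccBII <= BII), so A is marked Dirty and Repeat forces one more sweep.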
1875 
1876  if (ST->hasScalarStores()) {
1877  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1878  bool HaveScalarStores = false;
1879 
1880  for (MachineBasicBlock &MBB : MF) {
1881  for (MachineInstr &MI : MBB) {
1882  if (!HaveScalarStores && TII->isScalarStore(MI))
1883  HaveScalarStores = true;
1884 
1885  if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1886  MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1887  EndPgmBlocks.push_back(&MBB);
1888  }
1889  }
1890 
1891  if (HaveScalarStores) {
1892  // If scalar writes are used, the cache must be flushed or else the next
1893  // wave to reuse the same scratch memory can be clobbered.
1894  //
1895  // Insert s_dcache_wb at wave termination points if there were any scalar
1896  // stores, and only if the cache hasn't already been flushed. This could
1897  // be improved by looking across blocks for flushes in postdominating
1898  // blocks from the stores but an explicitly requested flush is probably
1899  // very rare.
1900  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1901  bool SeenDCacheWB = false;
1902 
1903  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1904  I != E; ++I) {
1905  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1906  SeenDCacheWB = true;
1907  else if (TII->isScalarStore(*I))
1908  SeenDCacheWB = false;
1909 
1910  // FIXME: It would be better to insert this before a waitcnt if any.
1911  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1912  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1913  !SeenDCacheWB) {
1914  Modified = true;
1915  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1916  }
1917  }
1918  }
1919  }
1920  }
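  // Resulting shape at a wave termination point when scalar stores were seen
  // and no flush already precedes it (sketch):
  //   s_dcache_wb
  //   s_endpgm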
1921 
1922  return Modified;
1923 }