1 //===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert wait instructions for memory reads and writes.
11 ///
12 /// Memory reads and writes are issued asynchronously, so we need to insert
13 /// S_WAITCNT instructions when we want to access any of their results or
14 /// overwrite any register that's used asynchronously.
15 ///
16 /// TODO: This pass currently keeps one timeline per hardware counter. A more
17 /// finely-grained approach that keeps one timeline per event type could
18 /// sometimes get away with generating weaker s_waitcnt instructions. For
19 /// example, when both SMEM and LDS are in flight and we need to wait for
20 /// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21 /// but the pass will currently generate a conservative lgkmcnt(0) because
22 /// multiple event types are in flight.
23 //
24 //===----------------------------------------------------------------------===//
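// Editor's note (illustrative sketch, not part of the original sources): the
// effect of this pass, in rough GFX9-style assembly, is to turn code such as
//
//   buffer_load_dword v0, off, s[0:3], 0   ; issued asynchronously, bumps vmcnt
//   v_add_u32 v1, v0, v2                   ; reads v0
//
// into
//
//   buffer_load_dword v0, off, s[0:3], 0
//   s_waitcnt vmcnt(0)                     ; wait for the outstanding load
//   v_add_u32 v1, v0, v2
//
// The exact opcodes above are only an example; the counters involved (vmcnt,
// lgkmcnt, expcnt, vscnt) are tracked per register by the code below.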
25 
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/CodeGen/MachinePostDominators.h"
34 #include "llvm/InitializePasses.h"
35 #include "llvm/Support/DebugCounter.h"
36 #include "llvm/Support/TargetParser.h"
37 using namespace llvm;
38 
39 #define DEBUG_TYPE "si-insert-waitcnts"
40 
41 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
42  "Force emit s_waitcnt expcnt(0) instrs");
43 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
44  "Force emit s_waitcnt lgkmcnt(0) instrs");
45 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
46  "Force emit s_waitcnt vmcnt(0) instrs");
47 
48 static cl::opt<bool> ForceEmitZeroFlag(
49  "amdgpu-waitcnt-forcezero",
50  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
51  cl::init(false), cl::Hidden);
52 
53 namespace {
54 
55 template <typename EnumT>
56 class enum_iterator
57  : public iterator_facade_base<enum_iterator<EnumT>,
58  std::forward_iterator_tag, const EnumT> {
59  EnumT Value;
60 public:
61  enum_iterator() = default;
62  enum_iterator(EnumT Value) : Value(Value) {}
63 
64  enum_iterator &operator++() {
65  Value = static_cast<EnumT>(Value + 1);
66  return *this;
67  }
68 
69  bool operator==(const enum_iterator &RHS) const { return Value == RHS.Value; }
70 
71  EnumT operator*() const { return Value; }
72 };
73 
74 // Class of object that encapsulates the latest instruction counter score
75 // associated with each operand. Used for determining whether an
76 // s_waitcnt instruction needs to be emitted.
77 
78 #define CNT_MASK(t) (1u << (t))
79 
80 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
81 
82 auto inst_counter_types() {
83  return make_range(enum_iterator<InstCounterType>(VM_CNT),
84  enum_iterator<InstCounterType>(NUM_INST_CNTS));
85 }
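// Note (editorial): enum_iterator and inst_counter_types simply allow
// range-based iteration over the hardware counters, e.g.
//
//   for (auto T : inst_counter_types())
//     ForceEmitWaitcnt[T] = false;
//
// which is how the rest of this file visits VM_CNT..VS_CNT.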
86 
87 using RegInterval = std::pair<int, int>;
88 
89 struct {
90  unsigned VmcntMax;
91  unsigned ExpcntMax;
92  unsigned LgkmcntMax;
93  unsigned VscntMax;
94 } HardwareLimits;
95 
96 struct {
97  unsigned VGPR0;
98  unsigned VGPRL;
99  unsigned SGPR0;
100  unsigned SGPRL;
101 } RegisterEncoding;
102 
103 enum WaitEventType {
104  VMEM_ACCESS, // vector-memory read & write
105  VMEM_READ_ACCESS, // vector-memory read
106  VMEM_WRITE_ACCESS,// vector-memory write
107  LDS_ACCESS, // lds read & write
108  GDS_ACCESS, // gds read & write
109  SQ_MESSAGE, // send message
110  SMEM_ACCESS, // scalar-memory read & write
111  EXP_GPR_LOCK, // export holding on its data src
112  GDS_GPR_LOCK, // GDS holding on its data and addr src
113  EXP_POS_ACCESS, // write to export position
114  EXP_PARAM_ACCESS, // write to export parameter
115  VMW_GPR_LOCK, // vector-memory write holding on its data src
116  NUM_WAIT_EVENTS,
117 };
118 
119 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
120  (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
121  (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
122  (1 << SQ_MESSAGE),
123  (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
124  (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS),
125  (1 << VMEM_WRITE_ACCESS)
126 };
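// Editorial note: the array above is indexed by InstCounterType. Entry
// [VM_CNT] is the set of events that vmcnt tracks, [LGKM_CNT] the events
// lgkmcnt tracks, [EXP_CNT] the export/GDS lock events, and [VS_CNT] the
// vector-memory writes. eventCounter() below inverts this mapping to find the
// counter responsible for a given event.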
127 
128 // The mapping is:
129 // 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
130 // SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
131 // NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
132 // We reserve a fixed number of VGPR slots in the scoring tables for
133 // special tokens like SCMEM_LDS (needed for buffer load to LDS).
134 enum RegisterMapping {
135  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
136  AGPR_OFFSET = 226, // Maximum programmable ArchVGPRs across all targets.
137  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
138  NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
139  EXTRA_VGPR_LDS = 0, // This is a placeholder the Shader algorithm uses.
140  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
141 };
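// In other words, the scoring tables below use one flat index space:
//   ArchVGPR N -> N
//   AGPR N     -> AGPR_OFFSET + N
//   LDS slot   -> SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS
//   SGPR N     -> NUM_ALL_VGPRS + N
// (see getRegInterval and setRegScore below).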
142 
143 // Enumerate different types of result-returning VMEM operations. Although
144 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
145 // s_waitcnt only instructions of the same VmemType are guaranteed to write
146 // their results in order -- so there is no need to insert an s_waitcnt between
147 // two instructions of the same type that write the same vgpr.
148 enum VmemType {
149  // BUF instructions and MIMG instructions without a sampler.
150  VMEM_NOSAMPLER,
151  // MIMG instructions with a sampler.
152  VMEM_SAMPLER,
153 };
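// Illustrative example: two buffer loads writing the same vgpr need no
// intervening s_waitcnt, but a buffer load followed by a sampled image load
// writing the same vgpr does, because the two VmemTypes may return their
// results out of order.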
154 
155 VmemType getVmemType(const MachineInstr &Inst) {
156  assert(SIInstrInfo::isVMEM(Inst));
157  if (!SIInstrInfo::isMIMG(Inst))
158  return VMEM_NOSAMPLER;
159  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
160  return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
161  ? VMEM_SAMPLER
162  : VMEM_NOSAMPLER;
163 }
164 
165 void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
166  switch (T) {
167  case VM_CNT:
168  Wait.VmCnt = std::min(Wait.VmCnt, Count);
169  break;
170  case EXP_CNT:
171  Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
172  break;
173  case LGKM_CNT:
174  Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
175  break;
176  case VS_CNT:
177  Wait.VsCnt = std::min(Wait.VsCnt, Count);
178  break;
179  default:
180  llvm_unreachable("bad InstCounterType");
181  }
182 }
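// Editorial note: an AMDGPU::Waitcnt field of ~0u denotes "no wait required"
// for that counter (see simplifyWaitcnt below), so taking the minimum with
// Count only ever tightens the requested wait.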
183 
184 // This object maintains the current score brackets of each wait counter, and
185 // a per-register scoreboard for each wait counter.
186 //
187 // We also maintain the latest score for every event type that can change the
188 // waitcnt, in order to know if there are multiple types of events within
189 // the brackets. When multiple types of events happen in the brackets, the
190 // wait count may get decremented out of order, so we need to insert an
191 // "s_waitcnt 0" before use.
192 class WaitcntBrackets {
193 public:
194  WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
195 
196  static unsigned getWaitCountMax(InstCounterType T) {
197  switch (T) {
198  case VM_CNT:
199  return HardwareLimits.VmcntMax;
200  case LGKM_CNT:
201  return HardwareLimits.LgkmcntMax;
202  case EXP_CNT:
203  return HardwareLimits.ExpcntMax;
204  case VS_CNT:
205  return HardwareLimits.VscntMax;
206  default:
207  break;
208  }
209  return 0;
210  }
211 
212  unsigned getScoreLB(InstCounterType T) const {
213  assert(T < NUM_INST_CNTS);
214  return ScoreLBs[T];
215  }
216 
217  unsigned getScoreUB(InstCounterType T) const {
218  assert(T < NUM_INST_CNTS);
219  return ScoreUBs[T];
220  }
221 
222  // Mapping from event to counter.
223  InstCounterType eventCounter(WaitEventType E) {
224  if (WaitEventMaskForInst[VM_CNT] & (1 << E))
225  return VM_CNT;
226  if (WaitEventMaskForInst[LGKM_CNT] & (1 << E))
227  return LGKM_CNT;
228  if (WaitEventMaskForInst[VS_CNT] & (1 << E))
229  return VS_CNT;
230  assert(WaitEventMaskForInst[EXP_CNT] & (1 << E));
231  return EXP_CNT;
232  }
233 
234  unsigned getRegScore(int GprNo, InstCounterType T) {
235  if (GprNo < NUM_ALL_VGPRS) {
236  return VgprScores[T][GprNo];
237  }
238  assert(T == LGKM_CNT);
239  return SgprScores[GprNo - NUM_ALL_VGPRS];
240  }
241 
242  bool merge(const WaitcntBrackets &Other);
243 
244  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
245  const MachineRegisterInfo *MRI,
246  const SIRegisterInfo *TRI, unsigned OpNo) const;
247 
248  bool counterOutOfOrder(InstCounterType T) const;
249  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
250  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
251  void determineWait(InstCounterType T, unsigned ScoreToWait,
252  AMDGPU::Waitcnt &Wait) const;
253  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
254  void applyWaitcnt(InstCounterType T, unsigned Count);
255  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
256  const MachineRegisterInfo *MRI, WaitEventType E,
257  MachineInstr &MI);
258 
259  bool hasPending() const { return PendingEvents != 0; }
260  bool hasPendingEvent(WaitEventType E) const {
261  return PendingEvents & (1 << E);
262  }
263 
264  bool hasMixedPendingEvents(InstCounterType T) const {
265  unsigned Events = PendingEvents & WaitEventMaskForInst[T];
266  // Return true if more than one bit is set in Events.
267  return Events & (Events - 1);
268  }
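// Editorial note: (Events & (Events - 1)) clears the lowest set bit, so the
// expression above is nonzero exactly when at least two different event types
// are pending for counter T.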
269 
270  bool hasPendingFlat() const {
271  return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
272  LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
273  (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
274  LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
275  }
276 
277  void setPendingFlat() {
278  LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
279  LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
280  }
281 
282  // Return true if there might be pending writes to the specified vgpr by VMEM
283  // instructions with types different from V.
284  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
285  assert(GprNo < NUM_ALL_VGPRS);
286  return VgprVmemTypes[GprNo] & ~(1 << V);
287  }
288 
289  void clearVgprVmemTypes(int GprNo) {
290  assert(GprNo < NUM_ALL_VGPRS);
291  VgprVmemTypes[GprNo] = 0;
292  }
293 
294  void print(raw_ostream &);
295  void dump() { print(dbgs()); }
296 
297 private:
298  struct MergeInfo {
299  unsigned OldLB;
300  unsigned OtherLB;
301  unsigned MyShift;
302  unsigned OtherShift;
303  };
304  static bool mergeScore(const MergeInfo &M, unsigned &Score,
305  unsigned OtherScore);
306 
307  void setScoreLB(InstCounterType T, unsigned Val) {
308  assert(T < NUM_INST_CNTS);
309  ScoreLBs[T] = Val;
310  }
311 
312  void setScoreUB(InstCounterType T, unsigned Val) {
313  assert(T < NUM_INST_CNTS);
314  ScoreUBs[T] = Val;
315  if (T == EXP_CNT) {
316  unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
317  if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
318  ScoreLBs[T] = UB;
319  }
320  }
321 
322  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
323  if (GprNo < NUM_ALL_VGPRS) {
324  VgprUB = std::max(VgprUB, GprNo);
325  VgprScores[T][GprNo] = Val;
326  } else {
327  assert(T == LGKM_CNT);
328  SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
329  SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
330  }
331  }
332 
333  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
334  const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
335  unsigned OpNo, unsigned Val);
336 
337  const GCNSubtarget *ST = nullptr;
338  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
339  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
340  unsigned PendingEvents = 0;
341  // Remember the last flat memory operation.
342  unsigned LastFlat[NUM_INST_CNTS] = {0};
343  // wait_cnt scores for every vgpr.
344  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
345  int VgprUB = -1;
346  int SgprUB = -1;
347  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
348  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
349  unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
350  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
351  // write to each vgpr.
352  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
353 };
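// A short summary of the bracket model used above (editorial note): for each
// counter T, scores in the half-open range (ScoreLBs[T], ScoreUBs[T]] are
// still outstanding. A register whose score is <= ScoreLBs[T] is known to be
// ready; waiting for a register with score S requires the hardware counter to
// drop to ScoreUBs[T] - S (see determineWait below).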
354 
355 class SIInsertWaitcnts : public MachineFunctionPass {
356 private:
357  const GCNSubtarget *ST = nullptr;
358  const SIInstrInfo *TII = nullptr;
359  const SIRegisterInfo *TRI = nullptr;
360  const MachineRegisterInfo *MRI = nullptr;
361  AMDGPU::IsaVersion IV;
362 
363  DenseSet<MachineInstr *> TrackedWaitcntSet;
364  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
365  MachinePostDominatorTree *PDT;
366 
367  struct BlockInfo {
368  MachineBasicBlock *MBB;
369  std::unique_ptr<WaitcntBrackets> Incoming;
370  bool Dirty = true;
371 
372  explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
373  };
374 
375  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
376 
377  // ForceEmitZeroWaitcnts: force all waitcnt instructions to be emitted as
378  // s_waitcnt 0 because of the amdgpu-waitcnt-forcezero flag.
379  bool ForceEmitZeroWaitcnts;
380  bool ForceEmitWaitcnt[NUM_INST_CNTS];
381 
382 public:
383  static char ID;
384 
385  SIInsertWaitcnts() : MachineFunctionPass(ID) {
386  (void)ForceExpCounter;
387  (void)ForceLgkmCounter;
388  (void)ForceVMCounter;
389  }
390 
391  bool runOnMachineFunction(MachineFunction &MF) override;
392 
393  StringRef getPassName() const override {
394  return "SI insert wait instructions";
395  }
396 
397  void getAnalysisUsage(AnalysisUsage &AU) const override {
398  AU.setPreservesCFG();
399  AU.addRequired<MachinePostDominatorTree>();
400  MachineFunctionPass::getAnalysisUsage(AU);
401  }
402 
403  bool isForceEmitWaitcnt() const {
404  for (auto T : inst_counter_types())
405  if (ForceEmitWaitcnt[T])
406  return true;
407  return false;
408  }
409 
410  void setForceEmitWaitcnt() {
411 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
412 // For debug builds, get the debug counter info and adjust if need be
413 #ifndef NDEBUG
414  if (DebugCounter::isCounterSet(ForceExpCounter) &&
415  DebugCounter::shouldExecute(ForceExpCounter)) {
416  ForceEmitWaitcnt[EXP_CNT] = true;
417  } else {
418  ForceEmitWaitcnt[EXP_CNT] = false;
419  }
420 
421  if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
422  DebugCounter::shouldExecute(ForceLgkmCounter)) {
423  ForceEmitWaitcnt[LGKM_CNT] = true;
424  } else {
425  ForceEmitWaitcnt[LGKM_CNT] = false;
426  }
427 
428  if (DebugCounter::isCounterSet(ForceVMCounter) &&
429  DebugCounter::shouldExecute(ForceVMCounter)) {
430  ForceEmitWaitcnt[VM_CNT] = true;
431  } else {
432  ForceEmitWaitcnt[VM_CNT] = false;
433  }
434 #endif // NDEBUG
435  }
436 
437  bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
438  bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
439  bool generateWaitcntInstBefore(MachineInstr &MI,
440  WaitcntBrackets &ScoreBrackets,
441  MachineInstr *OldWaitcntInstr);
442  void updateEventWaitcntAfter(MachineInstr &Inst,
443  WaitcntBrackets *ScoreBrackets);
444  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
445  WaitcntBrackets &ScoreBrackets);
446  bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
447  MachineInstr &OldWaitcntInstr,
448  AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
449 };
450 
451 } // end anonymous namespace
452 
453 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
454  const SIInstrInfo *TII,
455  const MachineRegisterInfo *MRI,
456  const SIRegisterInfo *TRI,
457  unsigned OpNo) const {
458  const MachineOperand &Op = MI->getOperand(OpNo);
459  if (!TRI->isInAllocatableClass(Op.getReg()))
460  return {-1, -1};
461 
462  // A use via a PW operand does not need a waitcnt.
463  // A partial write is not a WAW.
464  assert(!Op.getSubReg() || !Op.isUndef());
465 
466  RegInterval Result;
467 
468  unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
469 
470  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
471  assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
472  Result.first = Reg - RegisterEncoding.VGPR0;
473  if (TRI->isAGPR(*MRI, Op.getReg()))
474  Result.first += AGPR_OFFSET;
475  assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
476  } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
477  assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
478  Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
479  assert(Result.first >= NUM_ALL_VGPRS &&
480  Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
481  }
482  // TODO: Handle TTMP
483  // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
484  else
485  return {-1, -1};
486 
487  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
488  unsigned Size = TRI->getRegSizeInBits(*RC);
489  Result.second = Result.first + ((Size + 16) / 32);
490 
491  return Result;
492 }
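// Editorial note: the interval returned above is half-open. For example
// (illustrative), a 64-bit operand in v[4:5] yields {4, 6}: Size = 64, so the
// width is (64 + 16) / 32 = 2 slots; the +16 rounds 16-bit registers up to
// one full slot.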
493 
494 void WaitcntBrackets::setExpScore(const MachineInstr *MI,
495  const SIInstrInfo *TII,
496  const SIRegisterInfo *TRI,
497  const MachineRegisterInfo *MRI, unsigned OpNo,
498  unsigned Val) {
499  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
500  assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
501  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
502  setRegScore(RegNo, EXP_CNT, Val);
503  }
504 }
505 
506 void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
507  const SIRegisterInfo *TRI,
508  const MachineRegisterInfo *MRI,
509  WaitEventType E, MachineInstr &Inst) {
510  InstCounterType T = eventCounter(E);
511  unsigned CurrScore = getScoreUB(T) + 1;
512  if (CurrScore == 0)
513  report_fatal_error("InsertWaitcnt score wraparound");
514  // PendingEvents and ScoreUB need to be updated regardless of whether this
515  // event changes the score of a register or not.
516  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
517  PendingEvents |= 1 << E;
518  setScoreUB(T, CurrScore);
519 
520  if (T == EXP_CNT) {
521  // Put score on the source vgprs. If this is a store, just use those
522  // specific register(s).
523  if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
524  int AddrOpIdx =
525  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
526  // All GDS operations must protect their address register (same as
527  // export.)
528  if (AddrOpIdx != -1) {
529  setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
530  }
531 
532  if (Inst.mayStore()) {
533  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
534  AMDGPU::OpName::data0) != -1) {
535  setExpScore(
536  &Inst, TII, TRI, MRI,
537  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
538  CurrScore);
539  }
540  if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
541  AMDGPU::OpName::data1) != -1) {
542  setExpScore(&Inst, TII, TRI, MRI,
543  AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
544  AMDGPU::OpName::data1),
545  CurrScore);
546  }
547  } else if (SIInstrInfo::isAtomicRet(Inst) &&
548  Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
549  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
550  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
551  Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
552  Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
553  Inst.getOpcode() != AMDGPU::DS_APPEND &&
554  Inst.getOpcode() != AMDGPU::DS_CONSUME &&
555  Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
556  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
557  const MachineOperand &Op = Inst.getOperand(I);
558  if (Op.isReg() && !Op.isDef() &&
559  TRI->isVectorRegister(*MRI, Op.getReg())) {
560  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
561  }
562  }
563  }
564  } else if (TII->isFLAT(Inst)) {
565  if (Inst.mayStore()) {
566  setExpScore(
567  &Inst, TII, TRI, MRI,
568  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
569  CurrScore);
570  } else if (SIInstrInfo::isAtomicRet(Inst)) {
571  setExpScore(
572  &Inst, TII, TRI, MRI,
573  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
574  CurrScore);
575  }
576  } else if (TII->isMIMG(Inst)) {
577  if (Inst.mayStore()) {
578  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
579  } else if (SIInstrInfo::isAtomicRet(Inst)) {
580  setExpScore(
581  &Inst, TII, TRI, MRI,
582  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
583  CurrScore);
584  }
585  } else if (TII->isMTBUF(Inst)) {
586  if (Inst.mayStore()) {
587  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
588  }
589  } else if (TII->isMUBUF(Inst)) {
590  if (Inst.mayStore()) {
591  setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
592  } else if (SIInstrInfo::isAtomicRet(Inst)) {
593  setExpScore(
594  &Inst, TII, TRI, MRI,
595  AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
596  CurrScore);
597  }
598  } else {
599  if (TII->isEXP(Inst)) {
600  // For export the destination registers are really temps that
601  // can be used as the actual source after export patching, so
602  // we need to treat them like sources and set the EXP_CNT
603  // score.
604  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
605  MachineOperand &DefMO = Inst.getOperand(I);
606  if (DefMO.isReg() && DefMO.isDef() &&
607  TRI->isVGPR(*MRI, DefMO.getReg())) {
608  setRegScore(
609  TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
610  EXP_CNT, CurrScore);
611  }
612  }
613  }
614  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
615  MachineOperand &MO = Inst.getOperand(I);
616  if (MO.isReg() && !MO.isDef() &&
617  TRI->isVectorRegister(*MRI, MO.getReg())) {
618  setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
619  }
620  }
621  }
622 #if 0 // TODO: check if this is handled by MUBUF code above.
623  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
624  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
625  Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
626  MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
627  unsigned OpNo;//TODO: find the OpNo for this operand;
628  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
629  for (int RegNo = Interval.first; RegNo < Interval.second;
630  ++RegNo) {
631  setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
632  }
633 #endif
634  } else {
635  // Match the score to the destination registers.
636  for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
637  auto &Op = Inst.getOperand(I);
638  if (!Op.isReg() || !Op.isDef())
639  continue;
640  RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
641  if (T == VM_CNT) {
642  if (Interval.first >= NUM_ALL_VGPRS)
643  continue;
644  if (SIInstrInfo::isVMEM(Inst)) {
645  VmemType V = getVmemType(Inst);
646  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
647  VgprVmemTypes[RegNo] |= 1 << V;
648  }
649  }
650  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
651  setRegScore(RegNo, T, CurrScore);
652  }
653  }
654  if (TII->isDS(Inst) && Inst.mayStore()) {
655  setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
656  }
657  }
658 }
659 
660 void WaitcntBrackets::print(raw_ostream &OS) {
661  OS << '\n';
662  for (auto T : inst_counter_types()) {
663  unsigned LB = getScoreLB(T);
664  unsigned UB = getScoreUB(T);
665 
666  switch (T) {
667  case VM_CNT:
668  OS << " VM_CNT(" << UB - LB << "): ";
669  break;
670  case LGKM_CNT:
671  OS << " LGKM_CNT(" << UB - LB << "): ";
672  break;
673  case EXP_CNT:
674  OS << " EXP_CNT(" << UB - LB << "): ";
675  break;
676  case VS_CNT:
677  OS << " VS_CNT(" << UB - LB << "): ";
678  break;
679  default:
680  OS << " UNKNOWN(" << UB - LB << "): ";
681  break;
682  }
683 
684  if (LB < UB) {
685  // Print vgpr scores.
686  for (int J = 0; J <= VgprUB; J++) {
687  unsigned RegScore = getRegScore(J, T);
688  if (RegScore <= LB)
689  continue;
690  unsigned RelScore = RegScore - LB - 1;
691  if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
692  OS << RelScore << ":v" << J << " ";
693  } else {
694  OS << RelScore << ":ds ";
695  }
696  }
697  // Also need to print sgpr scores for lgkm_cnt.
698  if (T == LGKM_CNT) {
699  for (int J = 0; J <= SgprUB; J++) {
700  unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
701  if (RegScore <= LB)
702  continue;
703  unsigned RelScore = RegScore - LB - 1;
704  OS << RelScore << ":s" << J << " ";
705  }
706  }
707  }
708  OS << '\n';
709  }
710  OS << '\n';
711 }
712 
713 /// Simplify the waitcnt, in the sense of removing redundant counts. Counts
714 /// that are found to be redundant are reset to ~0u, meaning no wait is needed.
715 void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
716  simplifyWaitcnt(VM_CNT, Wait.VmCnt);
717  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
718  simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
719  simplifyWaitcnt(VS_CNT, Wait.VsCnt);
720 }
721 
722 void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
723  unsigned &Count) const {
724  const unsigned LB = getScoreLB(T);
725  const unsigned UB = getScoreUB(T);
726 
727  // The number of outstanding events for this type, T, can be calculated
728  // as (UB - LB). If the current Count is greater than or equal to the number
729  // of outstanding events, then the wait for this counter is redundant.
730  if (Count >= UB - LB)
731  Count = ~0u;
732 }
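// Worked example (illustrative): if UB - LB == 3, at most three events of
// type T are outstanding, so a requested wait of vmcnt(3) or looser is
// dropped (Count becomes ~0u), while vmcnt(2) or tighter is kept.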
733 
734 void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
735  AMDGPU::Waitcnt &Wait) const {
736  // If the score of src_operand falls within the bracket, we need an
737  // s_waitcnt instruction.
738  const unsigned LB = getScoreLB(T);
739  const unsigned UB = getScoreUB(T);
740  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
741  if ((T == VM_CNT || T == LGKM_CNT) &&
742  hasPendingFlat() &&
743  !ST->hasFlatLgkmVMemCountInOrder()) {
744  // If there is a pending FLAT operation, and this is a VMem or LGKM
745  // waitcnt and the target can report early completion, then we need
746  // to force a waitcnt 0.
747  addWait(Wait, T, 0);
748  } else if (counterOutOfOrder(T)) {
749  // The counter can get decremented out of order when there are multiple
750  // types of events in the bracket, so emit an s_waitcnt with a
751  // conservative value of 0 for the counter.
752  addWait(Wait, T, 0);
753  } else {
754  // If a counter has been maxed out avoid overflow by waiting for
755  // MAX(CounterType) - 1 instead.
756  unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
757  addWait(Wait, T, NeededWait);
758  }
759  }
760 }
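// Worked example (illustrative): with LB = 5, UB = 10 and a register score of
// 8, the score lies inside the bracket, so we request a wait of UB - 8 = 2,
// i.e. s_waitcnt vmcnt(2) for VM_CNT -- enough for the three oldest of the
// five outstanding operations (scores 6..8) to have completed.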
761 
762 void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
763  applyWaitcnt(VM_CNT, Wait.VmCnt);
764  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
765  applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
766  applyWaitcnt(VS_CNT, Wait.VsCnt);
767 }
768 
769 void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
770  const unsigned UB = getScoreUB(T);
771  if (Count >= UB)
772  return;
773  if (Count != 0) {
774  if (counterOutOfOrder(T))
775  return;
776  setScoreLB(T, std::max(getScoreLB(T), UB - Count));
777  } else {
778  setScoreLB(T, UB);
779  PendingEvents &= ~WaitEventMaskForInst[T];
780  }
781 }
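// In other words: applying a wait of 0 marks everything for counter T as
// complete (LB = UB) and clears its pending events, while a nonzero wait only
// raises the lower bound to UB - Count, and only when the counter is known to
// decrement in order.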
782 
783 // Where there are multiple types of events in the bracket of a counter,
784 // the decrement may go out of order.
785 bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
786  // Scalar memory reads can always complete out of order.
787  if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
788  return true;
789  return hasMixedPendingEvents(T);
790 }
791 
792 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
793  false)
794 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
795 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
796  false)
797 
798 char SIInsertWaitcnts::ID = 0;
799 
800 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
801 
802 FunctionPass *llvm::createSIInsertWaitcntsPass() {
803  return new SIInsertWaitcnts();
804 }
805 
806 /// Combine consecutive waitcnt instructions that precede \p MI and follow
807 /// \p OldWaitcntInstr, and apply any extra waits from waitcnts that were added
808 /// by previous passes. Currently this pass conservatively assumes that these
809 /// preexisting waitcnts are required for correctness.
810 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
811  MachineInstr &OldWaitcntInstr,
812  AMDGPU::Waitcnt &Wait,
813  const MachineInstr *MI) {
814  bool Modified = false;
815  MachineInstr *WaitcntInstr = nullptr;
816  MachineInstr *WaitcntVsCntInstr = nullptr;
817  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
818  &*II != MI; II = NextI, ++NextI) {
819  if (II->isMetaInstruction())
820  continue;
821 
822  if (II->getOpcode() == AMDGPU::S_WAITCNT) {
823  // Conservatively update required wait if this waitcnt was added in an
824  // earlier pass. In this case it will not exist in the tracked waitcnt
825  // set.
826  if (!TrackedWaitcntSet.count(&*II)) {
827  unsigned IEnc = II->getOperand(0).getImm();
828  AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
829  Wait = Wait.combined(OldWait);
830  }
831 
832  // Merge consecutive waitcnt of the same type by erasing multiples.
833  if (!WaitcntInstr) {
834  WaitcntInstr = &*II;
835  } else {
836  II->eraseFromParent();
837  Modified = true;
838  }
839 
840  } else {
841  assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
842  assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
843  if (!TrackedWaitcntSet.count(&*II)) {
844  unsigned OldVSCnt =
845  TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
846  Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
847  }
848 
849  if (!WaitcntVsCntInstr) {
850  WaitcntVsCntInstr = &*II;
851  } else {
852  II->eraseFromParent();
853  Modified = true;
854  }
855  }
856  }
857 
858  // Update the encoding of the merged waitcnt with the required wait.
859  if (WaitcntInstr) {
860  if (Wait.hasWaitExceptVsCnt()) {
861  unsigned NewEnc = AMDGPU::encodeWaitcnt(IV, Wait);
862  unsigned OldEnc = WaitcntInstr->getOperand(0).getImm();
863  if (OldEnc != NewEnc) {
864  WaitcntInstr->getOperand(0).setImm(NewEnc);
865  Modified = true;
866  }
867  ScoreBrackets.applyWaitcnt(Wait);
868  Wait.VmCnt = ~0u;
869  Wait.LgkmCnt = ~0u;
870  Wait.ExpCnt = ~0u;
871 
872  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
873  << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
874  << '\n');
875  } else {
876  WaitcntInstr->eraseFromParent();
877  Modified = true;
878  }
879  }
880 
881  if (WaitcntVsCntInstr) {
882  if (Wait.hasWaitVsCnt()) {
883  assert(ST->hasVscnt());
884  unsigned OldVSCnt =
885  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
886  ->getImm();
887  if (Wait.VsCnt != OldVSCnt) {
888  TII->getNamedOperand(*WaitcntVsCntInstr, AMDGPU::OpName::simm16)
889  ->setImm(Wait.VsCnt);
890  Modified = true;
891  }
892  ScoreBrackets.applyWaitcnt(Wait);
893  Wait.VsCnt = ~0u;
894 
895  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
896  << "Old Instr: " << MI
897  << "New Instr: " << *WaitcntVsCntInstr << '\n');
898  } else {
899  WaitcntVsCntInstr->eraseFromParent();
900  Modified = true;
901  }
902  }
903 
904  return Modified;
905 }
906 
907 static bool readsVCCZ(const MachineInstr &MI) {
908  unsigned Opc = MI.getOpcode();
909  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
910  !MI.getOperand(1).isUndef();
911 }
912 
913 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
914 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
915  // Currently all conventions wait, but this may not always be the case.
916  //
917  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
918  // sense to omit the wait and do it in the caller.
919  return true;
920 }
921 
922 /// \returns true if the callee is expected to wait for any outstanding waits
923 /// before returning.
924 static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
925  return true;
926 }
927 
928 /// Generate s_waitcnt instruction to be placed before cur_Inst.
929 /// Instructions of a given type are returned in order,
930 /// but instructions of different types can complete out of order.
931 /// We rely on this in-order completion
932 /// and simply assign a score to the memory access instructions.
933 /// We keep track of the active "score bracket" to determine
934 /// if an access of a memory read requires an s_waitcnt
935 /// and if so what the value of each counter is.
936 /// The "score bracket" is bound by the lower bound and upper bound
937 /// scores (*_score_LB and *_score_ub respectively).
938 bool SIInsertWaitcnts::generateWaitcntInstBefore(
939  MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
940  MachineInstr *OldWaitcntInstr) {
941  setForceEmitWaitcnt();
942 
943  if (MI.isMetaInstruction())
944  return false;
945 
947  bool Modified = false;
948 
949  // FIXME: This should have already been handled by the memory legalizer.
950  // Removing this currently doesn't affect any lit tests, but we need to
951  // verify that nothing was relying on this. The number of buffer invalidates
952  // being handled here should not be expanded.
953  if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
954  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
955  MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
956  MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
957  MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
958  Wait.VmCnt = 0;
959  }
960 
961  // All waits must be resolved at call return.
962  // NOTE: this could be improved with knowledge of all call sites or
963  // with knowledge of the called routines.
964  if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
965  MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
966  MI.getOpcode() == AMDGPU::S_SETPC_B64_return_gfx ||
967  (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
968  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
969  }
970  // Resolve vm waits before gs-done.
971  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
972  MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
973  ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
974  AMDGPU::SendMsg::ID_GS_DONE)) {
975  Wait.VmCnt = 0;
976  }
977 #if 0 // TODO: the following blocks of logic when we have fence.
978  else if (MI.getOpcode() == SC_FENCE) {
979  const unsigned int group_size =
980  context->shader_info->GetMaxThreadGroupSize();
981  // group_size == 0 means thread group size is unknown at compile time
982  const bool group_is_multi_wave =
983  (group_size == 0 || group_size > target_info->GetWaveFrontSize());
984  const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
985 
986  for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
987  SCRegType src_type = Inst->GetSrcType(i);
988  switch (src_type) {
989  case SCMEM_LDS:
990  if (group_is_multi_wave ||
991  context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
992  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
993  ScoreBrackets->getScoreUB(LGKM_CNT));
994  // LDS may have to wait for VM_CNT after buffer load to LDS
995  if (target_info->HasBufferLoadToLDS()) {
996  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
997  ScoreBrackets->getScoreUB(VM_CNT));
998  }
999  }
1000  break;
1001 
1002  case SCMEM_GDS:
1003  if (group_is_multi_wave || fence_is_global) {
1004  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1005  ScoreBrackets->getScoreUB(EXP_CNT));
1006  EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1007  ScoreBrackets->getScoreUB(LGKM_CNT));
1008  }
1009  break;
1010 
1011  case SCMEM_UAV:
1012  case SCMEM_TFBUF:
1013  case SCMEM_RING:
1014  case SCMEM_SCATTER:
1015  if (group_is_multi_wave || fence_is_global) {
1016  EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1017  ScoreBrackets->getScoreUB(EXP_CNT));
1018  EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1019  ScoreBrackets->getScoreUB(VM_CNT));
1020  }
1021  break;
1022 
1023  case SCMEM_SCRATCH:
1024  default:
1025  break;
1026  }
1027  }
1028  }
1029 #endif
1030 
1031  // Export & GDS instructions do not read the EXEC mask until after the export
1032  // is granted (which can occur well after the instruction is issued).
1033  // The shader program must flush all EXP operations on the export-count
1034  // before overwriting the EXEC mask.
1035  else {
1036  if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1037  // Export and GDS are tracked individually, either may trigger a waitcnt
1038  // for EXEC.
1039  if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1040  ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1041  ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1042  ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1043  Wait.ExpCnt = 0;
1044  }
1045  }
1046 
1047  if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1048  // The function is going to insert a wait on everything in its prolog.
1049  // This still needs to be careful if the call target is a load (e.g. a GOT
1050  // load). We also need to check WAW dependency with the saved PC.
1051  Wait = AMDGPU::Waitcnt();
1052 
1053  int CallAddrOpIdx =
1054  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1055 
1056  if (MI.getOperand(CallAddrOpIdx).isReg()) {
1057  RegInterval CallAddrOpInterval =
1058  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1059 
1060  for (int RegNo = CallAddrOpInterval.first;
1061  RegNo < CallAddrOpInterval.second; ++RegNo)
1062  ScoreBrackets.determineWait(
1063  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1064 
1065  int RtnAddrOpIdx =
1066  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1067  if (RtnAddrOpIdx != -1) {
1068  RegInterval RtnAddrOpInterval =
1069  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1070 
1071  for (int RegNo = RtnAddrOpInterval.first;
1072  RegNo < RtnAddrOpInterval.second; ++RegNo)
1073  ScoreBrackets.determineWait(
1074  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1075  }
1076  }
1077  } else {
1078  // FIXME: Should not be relying on memoperands.
1079  // Look at the source operands of every instruction to see if
1080  // any of them results from a previous memory operation that affects
1081  // its current usage. If so, an s_waitcnt instruction needs to be
1082  // emitted.
1083  // If the source operand was defined by a load, add the s_waitcnt
1084  // instruction.
1085  //
1086  // Two cases are handled for destination operands:
1087  // 1) If the destination operand was defined by a load, add the s_waitcnt
1088  // instruction to guarantee the right WAW order.
1089  // 2) If a destination operand was used by a recent export/store instruction,
1090  // add an s_waitcnt on exp_cnt to guarantee the WAR order.
1091  for (const MachineMemOperand *Memop : MI.memoperands()) {
1092  const Value *Ptr = Memop->getValue();
1093  if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1094  addWait(Wait, LGKM_CNT, 0);
1095  if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1096  SLoadAddresses.erase(Ptr);
1097  }
1098  unsigned AS = Memop->getAddrSpace();
1099  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1100  continue;
1101  unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1102  // VM_CNT is only relevant to vgpr or LDS.
1103  ScoreBrackets.determineWait(
1104  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1105  if (Memop->isStore()) {
1106  ScoreBrackets.determineWait(
1107  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1108  }
1109  }
1110 
1111  // Loop over use and def operands.
1112  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1113  MachineOperand &Op = MI.getOperand(I);
1114  if (!Op.isReg())
1115  continue;
1116  RegInterval Interval =
1117  ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1118 
1119  const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1120  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1121  if (IsVGPR) {
1122  // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1123  // previous write and this write are the same type of VMEM
1124  // instruction, in which case they're guaranteed to write their
1125  // results in order anyway.
1126  if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
1127  ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1128  getVmemType(MI))) {
1129  ScoreBrackets.determineWait(
1130  VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
1131  ScoreBrackets.clearVgprVmemTypes(RegNo);
1132  }
1133  if (Op.isDef()) {
1134  ScoreBrackets.determineWait(
1135  EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
1136  }
1137  }
1138  ScoreBrackets.determineWait(
1139  LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
1140  }
1141  }
1142  }
1143  }
1144 
1145  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
1146  // occurs before the instruction. Doing it here prevents any additional
1147  // S_WAITCNTs from being emitted if the instruction was marked as
1148  // requiring a WAITCNT beforehand.
1149  if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1150  !ST->hasAutoWaitcntBeforeBarrier()) {
1151  Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1152  }
1153 
1154  // TODO: Remove this work-around, enable the assert for Bug 457939
1155  // after fixing the scheduler. Also, the Shader Compiler code is
1156  // independent of target.
1157  if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1158  if (ScoreBrackets.getScoreLB(LGKM_CNT) <
1159  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1160  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1161  Wait.LgkmCnt = 0;
1162  }
1163  }
1164 
1165  // Verify that the wait is actually needed.
1166  ScoreBrackets.simplifyWaitcnt(Wait);
1167 
1168  if (ForceEmitZeroWaitcnts)
1169  Wait = AMDGPU::Waitcnt::allZero(ST->hasVscnt());
1170 
1171  if (ForceEmitWaitcnt[VM_CNT])
1172  Wait.VmCnt = 0;
1173  if (ForceEmitWaitcnt[EXP_CNT])
1174  Wait.ExpCnt = 0;
1175  if (ForceEmitWaitcnt[LGKM_CNT])
1176  Wait.LgkmCnt = 0;
1177  if (ForceEmitWaitcnt[VS_CNT])
1178  Wait.VsCnt = 0;
1179 
1180  if (OldWaitcntInstr) {
1181  // Try to merge the required wait with preexisting waitcnt instructions.
1182  // Also erase redundant waitcnt.
1183  Modified =
1184  applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
1185  } else {
1186  // Update waitcnt brackets after determining the required wait.
1187  ScoreBrackets.applyWaitcnt(Wait);
1188  }
1189 
1190  // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1191  // instruction was modified to handle the required wait.
1192  if (Wait.hasWaitExceptVsCnt()) {
1193  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1194  auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
1195  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
1196  .addImm(Enc);
1197  TrackedWaitcntSet.insert(SWaitInst);
1198  Modified = true;
1199 
1200  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1201  << "Old Instr: " << MI
1202  << "New Instr: " << *SWaitInst << '\n');
1203  }
1204 
1205  if (Wait.hasWaitVsCnt()) {
1206  assert(ST->hasVscnt());
1207 
1208  auto SWaitInst =
1209  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1210  TII->get(AMDGPU::S_WAITCNT_VSCNT))
1211  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1212  .addImm(Wait.VsCnt);
1213  TrackedWaitcntSet.insert(SWaitInst);
1214  Modified = true;
1215 
1216  LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1217  << "Old Instr: " << MI
1218  << "New Instr: " << *SWaitInst << '\n');
1219  }
1220 
1221  return Modified;
1222 }
1223 
1224 // This is a flat memory operation. Check to see if it has memory tokens other
1225 // than LDS. Other address spaces supported by flat memory operations involve
1226 // global memory.
1227 bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1228  assert(TII->isFLAT(MI));
1229 
1230  // All flat instructions use the VMEM counter.
1231  assert(TII->usesVM_CNT(MI));
1232 
1233  // If there are no memory operands then conservatively assume the flat
1234  // operation may access VMEM.
1235  if (MI.memoperands_empty())
1236  return true;
1237 
1238  // See if any memory operand specifies an address space that involves VMEM.
1239  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1240  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1241  // (GDS) address space is not supported by flat operations. Therefore, simply
1242  // return true unless only the LDS address space is found.
1243  for (const MachineMemOperand *Memop : MI.memoperands()) {
1244  unsigned AS = Memop->getAddrSpace();
1246  if (AS != AMDGPUAS::LOCAL_ADDRESS)
1247  return true;
1248  }
1249 
1250  return false;
1251 }
1252 
1253 // This is a flat memory operation. Check to see if it has memory tokens for
1254 // either LDS or FLAT.
1255 bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1256  assert(TII->isFLAT(MI));
1257 
1258  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1259  if (!TII->usesLGKM_CNT(MI))
1260  return false;
1261 
1262  // If in tgsplit mode then there can be no use of LDS.
1263  if (ST->isTgSplitEnabled())
1264  return false;
1265 
1266  // If there are no memory operands then conservatively assume the flat
1267  // operation may access LDS.
1268  if (MI.memoperands_empty())
1269  return true;
1270 
1271  // See if any memory operand specifies an address space that involves LDS.
1272  for (const MachineMemOperand *Memop : MI.memoperands()) {
1273  unsigned AS = Memop->getAddrSpace();
1274  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1275  return true;
1276  }
1277 
1278  return false;
1279 }
1280 
1281 void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1282  WaitcntBrackets *ScoreBrackets) {
1283  // Now look at the instruction opcode. If it is a memory access
1284  // instruction, update the upper-bound of the appropriate counter's
1285  // bracket and the destination operand scores.
1286  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1287  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1288  if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1289  TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1290  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1291  ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1292  } else {
1293  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1294  }
1295  } else if (TII->isFLAT(Inst)) {
1296  assert(Inst.mayLoadOrStore());
1297 
1298  int FlatASCount = 0;
1299 
1300  if (mayAccessVMEMThroughFlat(Inst)) {
1301  ++FlatASCount;
1302  if (!ST->hasVscnt())
1303  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1304  else if (Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst))
1305  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1306  else
1307  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1308  }
1309 
1310  if (mayAccessLDSThroughFlat(Inst)) {
1311  ++FlatASCount;
1312  ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1313  }
1314 
1315  // A Flat memory operation must access at least one address space.
1316  assert(FlatASCount);
1317 
1318  // This is a flat memory operation that accesses both VMEM and LDS, so note it
1319  // - it will require that both the VM and LGKM be flushed to zero if it is
1320  // pending when a VM or LGKM dependency occurs.
1321  if (FlatASCount > 1)
1322  ScoreBrackets->setPendingFlat();
1323  } else if (SIInstrInfo::isVMEM(Inst) &&
1325  if (!ST->hasVscnt())
1326  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
1327  else if ((Inst.mayLoad() && !SIInstrInfo::isAtomicNoRet(Inst)) ||
1328  /* IMAGE_GET_RESINFO / IMAGE_GET_LOD */
1329  (TII->isMIMG(Inst) && !Inst.mayLoad() && !Inst.mayStore()))
1330  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_READ_ACCESS, Inst);
1331  else if (Inst.mayStore())
1332  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_WRITE_ACCESS, Inst);
1333 
1334  if (ST->vmemWriteNeedsExpWaitcnt() &&
1335  (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1336  ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1337  }
1338  } else if (TII->isSMRD(Inst)) {
1339  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1340  } else if (Inst.isCall()) {
1341  if (callWaitsOnFunctionReturn(Inst)) {
1342  // Act as a wait on everything
1343  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1344  } else {
1345  // May need to wait for anything.
1346  ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1347  }
1348  } else if (SIInstrInfo::isEXP(Inst)) {
1349  unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1350  if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1351  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1352  else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1353  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1354  else
1355  ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1356  } else {
1357  switch (Inst.getOpcode()) {
1358  case AMDGPU::S_SENDMSG:
1359  case AMDGPU::S_SENDMSGHALT:
1360  ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1361  break;
1362  case AMDGPU::S_MEMTIME:
1363  case AMDGPU::S_MEMREALTIME:
1364  ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1365  break;
1366  }
1367  }
1368 }
1369 
1370 bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1371  unsigned OtherScore) {
1372  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1373  unsigned OtherShifted =
1374  OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1375  Score = std::max(MyShifted, OtherShifted);
1376  return OtherShifted > MyShifted;
1377 }
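// Worked example for mergeScore (illustrative): suppose this bracket has
// LB = 2, UB = 5 and the other has LB = 0, UB = 4. merge() below picks
// NewUB = 2 + max(3, 4) = 6, so MyShift = 1 and OtherShift = 2. A register
// score of 4 on this side becomes 5, a score of 3 on the other side also
// becomes 5, and scores at or below the old lower bounds collapse to 0
// ("already complete").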
1378 
1379 /// Merge the pending events and associated score brackets of \p Other into
1380 /// this bracket's status.
1381 ///
1382 /// Returns whether the merge resulted in a change that requires tighter waits
1383 /// (i.e. the merged brackets strictly dominate the original brackets).
1384 bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1385  bool StrictDom = false;
1386 
1387  VgprUB = std::max(VgprUB, Other.VgprUB);
1388  SgprUB = std::max(SgprUB, Other.SgprUB);
1389 
1390  for (auto T : inst_counter_types()) {
1391  // Merge event flags for this counter
1392  const bool OldOutOfOrder = counterOutOfOrder(T);
1393  const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1394  const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1395  if (OtherEvents & ~OldEvents)
1396  StrictDom = true;
1397  PendingEvents |= OtherEvents;
1398 
1399  // Merge scores for this counter
1400  const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1401  const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1402  const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1403  if (NewUB < ScoreLBs[T])
1404  report_fatal_error("waitcnt score overflow");
1405 
1406  MergeInfo M;
1407  M.OldLB = ScoreLBs[T];
1408  M.OtherLB = Other.ScoreLBs[T];
1409  M.MyShift = NewUB - ScoreUBs[T];
1410  M.OtherShift = NewUB - Other.ScoreUBs[T];
1411 
1412  ScoreUBs[T] = NewUB;
1413 
1414  StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1415 
1416  bool RegStrictDom = false;
1417  for (int J = 0; J <= VgprUB; J++) {
1418  RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1419  }
1420 
1421  if (T == VM_CNT) {
1422  for (int J = 0; J <= VgprUB; J++) {
1423  unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1424  RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
1425  VgprVmemTypes[J] = NewVmemTypes;
1426  }
1427  }
1428 
1429  if (T == LGKM_CNT) {
1430  for (int J = 0; J <= SgprUB; J++) {
1431  RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1432  }
1433  }
1434 
1435  if (RegStrictDom && !OldOutOfOrder)
1436  StrictDom = true;
1437  }
1438 
1439  return StrictDom;
1440 }
1441 
1442 // Generate s_waitcnt instructions where needed.
1443 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1444  MachineBasicBlock &Block,
1445  WaitcntBrackets &ScoreBrackets) {
1446  bool Modified = false;
1447 
1448  LLVM_DEBUG({
1449  dbgs() << "*** Block" << Block.getNumber() << " ***";
1450  ScoreBrackets.dump();
1451  });
1452 
1453  // Track the correctness of vccz through this basic block. There are two
1454  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1455  // ST->partialVCCWritesUpdateVCCZ().
1456  bool VCCZCorrect = true;
1457  if (ST->hasReadVCCZBug()) {
1458  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1459  // to vcc and then issued an smem load.
1460  VCCZCorrect = false;
1461  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1462  // vccz could be incorrect at a basic block boundary if a predecessor wrote
1463  // to vcc_lo or vcc_hi.
1464  VCCZCorrect = false;
1465  }
1466 
1467  // Walk over the instructions.
1468  MachineInstr *OldWaitcntInstr = nullptr;
1469 
1470  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1471  E = Block.instr_end();
1472  Iter != E;) {
1473  MachineInstr &Inst = *Iter;
1474 
1475  // Track pre-existing waitcnts that were added in earlier iterations or by
1476  // the memory legalizer.
1477  if (Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1478  (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1479  Inst.getOperand(0).isReg() &&
1480  Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL)) {
1481  if (!OldWaitcntInstr)
1482  OldWaitcntInstr = &Inst;
1483  ++Iter;
1484  continue;
1485  }
1486 
1487  // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1488  Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
1489  OldWaitcntInstr = nullptr;
1490 
1491  // Restore vccz if it's not known to be correct already.
1492  bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1493 
1494  // Don't examine operands unless we need to track vccz correctness.
1495  if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1496  if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1497  Inst.definesRegister(AMDGPU::VCC_HI)) {
1498  // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1499  if (!ST->partialVCCWritesUpdateVCCZ())
1500  VCCZCorrect = false;
1501  } else if (Inst.definesRegister(AMDGPU::VCC)) {
1502  // There is a hardware bug on CI/SI where SMRD instruction may corrupt
1503  // vccz bit, so when we detect that an instruction may read from a
1504  // corrupt vccz bit, we need to:
1505  // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
1506  // operations to complete.
1507  // 2. Restore the correct value of vccz by writing the current value
1508  // of vcc back to vcc.
1509  if (ST->hasReadVCCZBug() &&
1510  ScoreBrackets.getScoreLB(LGKM_CNT) <
1511  ScoreBrackets.getScoreUB(LGKM_CNT) &&
1512  ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1513  // Writes to vcc while there's an outstanding smem read may get
1514  // clobbered as soon as any read completes.
1515  VCCZCorrect = false;
1516  } else {
1517  // Writes to vcc will fix any incorrect value in vccz.
1518  VCCZCorrect = true;
1519  }
1520  }
1521  }
1522 
1523  if (TII->isSMRD(Inst)) {
1524  for (const MachineMemOperand *Memop : Inst.memoperands()) {
1525  // No need to handle invariant loads when avoiding WAR conflicts, as
1526  // there cannot be a vector store to the same memory location.
1527  if (!Memop->isInvariant()) {
1528  const Value *Ptr = Memop->getValue();
1529  SLoadAddresses.insert(std::make_pair(Ptr, Inst.getParent()));
1530  }
1531  }
1532  if (ST->hasReadVCCZBug()) {
1533  // This smem read could complete and clobber vccz at any time.
1534  VCCZCorrect = false;
1535  }
1536  }
1537 
1538  updateEventWaitcntAfter(Inst, &ScoreBrackets);
1539 
1540 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1541  // If this instruction generates a S_SETVSKIP because it is an
1542  // indexed resource, and we are on Tahiti, then it will also force
1543  // an S_WAITCNT vmcnt(0)
1544  if (RequireCheckResourceType(Inst, context)) {
1545  // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1546  ScoreBrackets->setScoreLB(VM_CNT,
1547  ScoreBrackets->getScoreUB(VM_CNT));
1548  }
1549 #endif
1550 
1551  LLVM_DEBUG({
1552  Inst.print(dbgs());
1553  ScoreBrackets.dump();
1554  });
1555 
1556  // TODO: Remove this work-around after fixing the scheduler and enable the
1557  // assert above.
1558  if (RestoreVCCZ) {
1559  // Restore the vccz bit. Any time a value is written to vcc, the vcc
1560  // bit is updated, so we can restore the bit by reading the value of
1561  // vcc and then writing it back to the register.
1562  BuildMI(Block, Inst, Inst.getDebugLoc(),
1563  TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1564  TRI->getVCC())
1565  .addReg(TRI->getVCC());
1566  VCCZCorrect = true;
1567  Modified = true;
1568  }
1569 
1570  ++Iter;
1571  }
1572 
1573  return Modified;
1574 }
1575 
1576 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1577  ST = &MF.getSubtarget<GCNSubtarget>();
1578  TII = ST->getInstrInfo();
1579  TRI = &TII->getRegisterInfo();
1580  MRI = &MF.getRegInfo();
1581  IV = AMDGPU::getIsaVersion(ST->getCPU());
1582  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1583  PDT = &getAnalysis<MachinePostDominatorTree>();
1584 
1585  ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1586  for (auto T : inst_counter_types())
1587  ForceEmitWaitcnt[T] = false;
1588 
1589  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1590  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1591  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
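 // Note: 63 below is assumed to be the all-ones value of the vscnt field on
 // targets that have a separate store counter; targets without one get 0.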
1592  HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
1593 
1594  unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1595  unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1596  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1597  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1598 
1599  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1600  RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
1601  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1602  RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
1603 
1604  TrackedWaitcntSet.clear();
1605  BlockInfos.clear();
1606  bool Modified = false;
1607 
1608  if (!MFI->isEntryFunction()) {
1609  // Wait for any outstanding memory operations that the input registers may
1610  // depend on. We can't track them and it's better to do the wait after the
1611  // costly call sequence.
1612 
1613  // TODO: Could insert earlier and schedule more liberally with operations
1614  // that only use caller preserved registers.
1615  MachineBasicBlock &EntryBB = MF.front();
1616  MachineBasicBlock::iterator I = EntryBB.begin();
1617  for (MachineBasicBlock::iterator E = EntryBB.end();
1618  I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1619  ;
1620  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1621  if (ST->hasVscnt())
1622  BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1623  .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1624  .addImm(0);
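 // For a non-entry function on a target with a separate store counter this
 // emits an entry-block preamble roughly like (sketch):
 //   s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 //   s_waitcnt_vscnt null, 0x0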
1625 
1626  Modified = true;
1627  }
1628 
1629  // Keep iterating over the blocks in reverse post order, inserting and
1630  // updating s_waitcnt where needed, until a fixed point is reached.
1631  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1632  BlockInfos.insert({MBB, BlockInfo(MBB)});
1633 
1634  std::unique_ptr<WaitcntBrackets> Brackets;
1635  bool Repeat;
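 // Repeat is set whenever a block at or before the current position in the
 // reverse post order becomes dirty again (e.g. via a loop back edge), so
 // another sweep is needed before the fixed point is reached.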
1636  do {
1637  Repeat = false;
1638 
1639  for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1640  ++BII) {
1641  BlockInfo &BI = BII->second;
1642  if (!BI.Dirty)
1643  continue;
1644 
1645  if (BI.Incoming) {
1646  if (!Brackets)
1647  Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1648  else
1649  *Brackets = *BI.Incoming;
1650  } else {
1651  if (!Brackets)
1652  Brackets = std::make_unique<WaitcntBrackets>(ST);
1653  else
1654  *Brackets = WaitcntBrackets(ST);
1655  }
1656 
1657  Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1658  BI.Dirty = false;
1659 
1660  if (Brackets->hasPending()) {
1661  BlockInfo *MoveBracketsToSucc = nullptr;
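 // At most one successor takes over the current brackets by move; any other
 // successor that still needs them gets its own copy. The move is deferred
 // until after the loop so the brackets stay valid while copies are made.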
1662  for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1663  auto SuccBII = BlockInfos.find(Succ);
1664  BlockInfo &SuccBI = SuccBII->second;
1665  if (!SuccBI.Incoming) {
1666  SuccBI.Dirty = true;
1667  if (SuccBII <= BII)
1668  Repeat = true;
1669  if (!MoveBracketsToSucc) {
1670  MoveBracketsToSucc = &SuccBI;
1671  } else {
1672  SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1673  }
1674  } else if (SuccBI.Incoming->merge(*Brackets)) {
1675  SuccBI.Dirty = true;
1676  if (SuccBII <= BII)
1677  Repeat = true;
1678  }
1679  }
1680  if (MoveBracketsToSucc)
1681  MoveBracketsToSucc->Incoming = std::move(Brackets);
1682  }
1683  }
1684  } while (Repeat);
1685 
1686  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1687 
1688  bool HaveScalarStores = false;
1689 
1690  for (MachineBasicBlock &MBB : MF) {
1691  for (MachineInstr &MI : MBB) {
1692  if (!HaveScalarStores && TII->isScalarStore(MI))
1693  HaveScalarStores = true;
1694 
1695  if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1696  MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1697  EndPgmBlocks.push_back(&MBB);
1698  }
1699  }
1700 
1701  if (HaveScalarStores) {
1702  // If scalar writes are used, the cache must be flushed or else the next
1703  // wave to reuse the same scratch memory can have its data clobbered.
1704  //
1705  // Insert s_dcache_wb at wave termination points if there were any scalar
1706  // stores, and only if the cache hasn't already been flushed. This could be
1707  // improved by looking across blocks for flushes in postdominating blocks
1708  // from the stores but an explicitly requested flush is probably very rare.
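 // Concretely: in each end-of-program block, insert an s_dcache_wb right
 // before s_endpgm / SI_RETURN_TO_EPILOG unless a flush is already pending
 // with no scalar store in between.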
1709  for (MachineBasicBlock *MBB : EndPgmBlocks) {
1710  bool SeenDCacheWB = false;
1711 
1712  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
1713  ++I) {
1714  if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1715  SeenDCacheWB = true;
1716  else if (TII->isScalarStore(*I))
1717  SeenDCacheWB = false;
1718 
1719  // FIXME: It would be better to insert this before a waitcnt if any.
1720  if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1721  I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1722  !SeenDCacheWB) {
1723  Modified = true;
1724  BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1725  }
1726  }
1727  }
1728  }
1729 
1730  return Modified;
1731 }