1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
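//
// For example, if a buffer load writes v0 and a later VALU instruction reads
// v0, the pass inserts an "s_waitcnt vmcnt(0)" (or a weaker vmcnt(N) when N
// newer vector-memory operations are allowed to remain outstanding)
// immediately before the use, so the load's result is guaranteed to have
// landed.
//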
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39using namespace llvm;
40
41#define DEBUG_TYPE "si-insert-waitcnts"
42
43DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44 "Force emit s_waitcnt expcnt(0) instrs");
45DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
47DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48 "Force emit s_waitcnt vmcnt(0) instrs");
49
50static cl::opt<bool> ForceEmitZeroFlag(
51 "amdgpu-waitcnt-forcezero",
52 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53 cl::init(false), cl::Hidden);
54
55namespace {
56// Class of object that encapsulates the latest instruction counter score
57// associated with the operand. Used to determine whether an
58// s_waitcnt instruction needs to be emitted.
59
60enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
61} // namespace
62
63namespace llvm {
64template <> struct enum_iteration_traits<InstCounterType> {
65 static constexpr bool is_iterable = true;
66};
67} // namespace llvm
68
69namespace {
70auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
71
72using RegInterval = std::pair<int, int>;
73
74struct HardwareLimits {
75 unsigned VmcntMax;
76 unsigned ExpcntMax;
77 unsigned LgkmcntMax;
78 unsigned VscntMax;
79};
80
81struct RegisterEncoding {
82 unsigned VGPR0;
83 unsigned VGPRL;
84 unsigned SGPR0;
85 unsigned SGPRL;
86};
87
88enum WaitEventType {
89 VMEM_ACCESS, // vector-memory read & write
90 VMEM_READ_ACCESS, // vector-memory read
91 VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
92 SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
93 LDS_ACCESS, // lds read & write
94 GDS_ACCESS, // gds read & write
95 SQ_MESSAGE, // send message
96 SMEM_ACCESS, // scalar-memory read & write
97 EXP_GPR_LOCK, // export holding on its data src
98 GDS_GPR_LOCK, // GDS holding on its data and addr src
99 EXP_POS_ACCESS, // write to export position
100 EXP_PARAM_ACCESS, // write to export parameter
101 VMW_GPR_LOCK, // vector-memory write holding on its data src
102 EXP_LDS_ACCESS, // read by ldsdir counting as export
103 NUM_WAIT_EVENTS,
104};
105
106static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
107 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
108 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
109 (1 << SQ_MESSAGE),
110 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
111 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
112 (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
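// Note: WaitEventMaskForInst is indexed by InstCounterType, so the four
// initializers above correspond to VM_CNT, LGKM_CNT, EXP_CNT and VS_CNT in
// that order; e.g. WaitEventMaskForInst[LGKM_CNT] covers the SMEM, LDS, GDS
// and SQ_MESSAGE events.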
113
114// The mapping is:
115// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
116// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
117// NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
118// We reserve a fixed number of VGPR slots in the scoring tables for
119// special tokens like SCMEM_LDS (needed for buffer load to LDS).
120enum RegisterMapping {
121 SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
122 AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
123 SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
124 NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
125 EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
126 NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
127};
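// With the values above, the flat score-table index space looks like this:
// ArchVGPR v7 -> 7, AGPR a3 -> AGPR_OFFSET + 3 = 259, the artificial LDS
// slot -> SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS = 512, and SGPR s5 ->
// NUM_ALL_VGPRS + 5 = 518 (getRegInterval below produces these indices).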
128
129// Enumerate different types of result-returning VMEM operations. Although
130// s_waitcnt orders them all with a single vmcnt counter, in the absence of
131// s_waitcnt only instructions of the same VmemType are guaranteed to write
132// their results in order -- so there is no need to insert an s_waitcnt between
133// two instructions of the same type that write the same vgpr.
134enum VmemType {
135 // BUF instructions and MIMG instructions without a sampler.
136 VMEM_NOSAMPLER,
137 // MIMG instructions with a sampler.
138 VMEM_SAMPLER,
139 // BVH instructions
140 VMEM_BVH
141};
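// For example, two sampler loads (MIMG with a sampler) that write the same
// vgpr do not need an intervening s_waitcnt, but a sampler load followed by a
// buffer load (VMEM_NOSAMPLER) writing the same vgpr does, because results of
// different VmemTypes may return out of order.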
142
143static bool updateVMCntOnly(const MachineInstr &Inst) {
144 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
145 SIInstrInfo::isFLATScratch(Inst);
146}
147
148VmemType getVmemType(const MachineInstr &Inst) {
149 assert(updateVMCntOnly(Inst));
150 if (!SIInstrInfo::isMIMG(Inst))
151 return VMEM_NOSAMPLER;
152 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
153 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
154 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
155 return BaseInfo->BVH ? VMEM_BVH
156 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
157}
158
159void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
160 switch (T) {
161 case VM_CNT:
162 Wait.VmCnt = std::min(Wait.VmCnt, Count);
163 break;
164 case EXP_CNT:
165 Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
166 break;
167 case LGKM_CNT:
168 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
169 break;
170 case VS_CNT:
171 Wait.VsCnt = std::min(Wait.VsCnt, Count);
172 break;
173 default:
174 llvm_unreachable("bad InstCounterType");
175 }
176}
177
178// This object maintains the current score brackets of each wait counter, and
179// a per-register scoreboard for each wait counter.
180//
181// We also maintain the latest score for every event type that can change the
182// waitcnt in order to know whether there are multiple types of events within
183// the brackets. When multiple event types are pending within a bracket, the
184// wait count may be decremented out of order, therefore we need to put in an
185// "s_waitcnt 0" before use.
186class WaitcntBrackets {
187public:
188 WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
189 RegisterEncoding Encoding)
190 : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
191
192 unsigned getWaitCountMax(InstCounterType T) const {
193 switch (T) {
194 case VM_CNT:
195 return Limits.VmcntMax;
196 case LGKM_CNT:
197 return Limits.LgkmcntMax;
198 case EXP_CNT:
199 return Limits.ExpcntMax;
200 case VS_CNT:
201 return Limits.VscntMax;
202 default:
203 break;
204 }
205 return 0;
206 }
207
208 unsigned getScoreLB(InstCounterType T) const {
209 assert(T < NUM_INST_CNTS);
210 return ScoreLBs[T];
211 }
212
213 unsigned getScoreUB(InstCounterType T) const {
214 assert(T < NUM_INST_CNTS);
215 return ScoreUBs[T];
216 }
217
218 unsigned getScoreRange(InstCounterType T) const {
219 return getScoreUB(T) - getScoreLB(T);
220 }
221
222 // Mapping from event to counter.
223 InstCounterType eventCounter(WaitEventType E) const {
224 for (auto T : inst_counter_types()) {
225 if (WaitEventMaskForInst[T] & (1 << E))
226 return T;
227 }
228 llvm_unreachable("event type has no associated counter");
229 }
230
231 unsigned getRegScore(int GprNo, InstCounterType T) const {
232 if (GprNo < NUM_ALL_VGPRS) {
233 return VgprScores[T][GprNo];
234 }
235 assert(T == LGKM_CNT);
236 return SgprScores[GprNo - NUM_ALL_VGPRS];
237 }
238
239 bool merge(const WaitcntBrackets &Other);
240
241 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
242 const MachineRegisterInfo *MRI,
243 const SIRegisterInfo *TRI, unsigned OpNo) const;
244
245 bool counterOutOfOrder(InstCounterType T) const;
246 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
247 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
248 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
249 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
250 void applyWaitcnt(InstCounterType T, unsigned Count);
251 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
252 const MachineRegisterInfo *MRI, WaitEventType E,
253 MachineInstr &Inst);
254
255 unsigned hasPendingEvent() const { return PendingEvents; }
256 unsigned hasPendingEvent(WaitEventType E) const {
257 return PendingEvents & (1 << E);
258 }
259 unsigned hasPendingEvent(InstCounterType T) const {
260 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
261 assert((HasPending != 0) == (getScoreRange(T) != 0));
262 return HasPending;
263 }
264
265 bool hasMixedPendingEvents(InstCounterType T) const {
266 unsigned Events = hasPendingEvent(T);
267 // Return true if more than one bit is set in Events.
268 return Events & (Events - 1);
269 }
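// The expression above is the classic "more than one bit set" test: e.g. for
// Events = 0b0101, Events - 1 = 0b0100 and the AND is nonzero, while for a
// single pending event type (0b0100) the AND is zero.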
270
271 bool hasPendingFlat() const {
272 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
273 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
274 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
275 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
276 }
277
278 void setPendingFlat() {
279 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
280 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
281 }
282
283 // Return true if there might be pending writes to the specified vgpr by VMEM
284 // instructions with types different from V.
285 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
286 assert(GprNo < NUM_ALL_VGPRS);
287 return VgprVmemTypes[GprNo] & ~(1 << V);
288 }
289
290 void clearVgprVmemTypes(int GprNo) {
291 assert(GprNo < NUM_ALL_VGPRS);
292 VgprVmemTypes[GprNo] = 0;
293 }
294
295 void print(raw_ostream &);
296 void dump() { print(dbgs()); }
297
298private:
299 struct MergeInfo {
300 unsigned OldLB;
301 unsigned OtherLB;
302 unsigned MyShift;
303 unsigned OtherShift;
304 };
305 static bool mergeScore(const MergeInfo &M, unsigned &Score,
306 unsigned OtherScore);
307
308 void setScoreLB(InstCounterType T, unsigned Val) {
309 assert(T < NUM_INST_CNTS);
310 ScoreLBs[T] = Val;
311 }
312
313 void setScoreUB(InstCounterType T, unsigned Val) {
314 assert(T < NUM_INST_CNTS);
315 ScoreUBs[T] = Val;
316
317 if (T != EXP_CNT)
318 return;
319
320 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
321 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
322 }
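// The EXP_CNT adjustment above keeps the bracket no wider than the counter's
// hardware maximum: e.g. with an expcnt limit of 7, once more than 7 export
// events are outstanding the lower bound is advanced so that getScoreRange()
// never exceeds getWaitCountMax(EXP_CNT).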
323
324 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
325 if (GprNo < NUM_ALL_VGPRS) {
326 VgprUB = std::max(VgprUB, GprNo);
327 VgprScores[T][GprNo] = Val;
328 } else {
329 assert(T == LGKM_CNT);
330 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
331 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
332 }
333 }
334
335 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
336 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
337 unsigned OpNo, unsigned Val);
338
339 const GCNSubtarget *ST = nullptr;
340 HardwareLimits Limits = {};
341 RegisterEncoding Encoding = {};
342 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
343 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
344 unsigned PendingEvents = 0;
345 // Remember the last flat memory operation.
346 unsigned LastFlat[NUM_INST_CNTS] = {0};
347 // wait_cnt scores for every vgpr.
348 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
349 int VgprUB = -1;
350 int SgprUB = -1;
351 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
352 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
353 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
354 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
355 // write to each vgpr.
356 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
357};
358
359class SIInsertWaitcnts : public MachineFunctionPass {
360private:
361 const GCNSubtarget *ST = nullptr;
362 const SIInstrInfo *TII = nullptr;
363 const SIRegisterInfo *TRI = nullptr;
364 const MachineRegisterInfo *MRI = nullptr;
365 AMDGPU::IsaVersion IV;
366
367 DenseSet<MachineInstr *> TrackedWaitcntSet;
368 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
369 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
370 MachineLoopInfo *MLI;
371 MachinePostDominatorTree *PDT;
372
373 struct BlockInfo {
374 std::unique_ptr<WaitcntBrackets> Incoming;
375 bool Dirty = true;
376 };
377
379
380 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
381 // because of amdgpu-waitcnt-forcezero flag
382 bool ForceEmitZeroWaitcnts;
383 bool ForceEmitWaitcnt[NUM_INST_CNTS];
384
385 bool OptNone;
386
387 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
388 // message.
389 DenseSet<MachineInstr *> ReleaseVGPRInsts;
390
391public:
392 static char ID;
393
394 SIInsertWaitcnts() : MachineFunctionPass(ID) {
395 (void)ForceExpCounter;
396 (void)ForceLgkmCounter;
397 (void)ForceVMCounter;
398 }
399
400 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
401 bool isPreheaderToFlush(MachineBasicBlock &MBB,
402 WaitcntBrackets &ScoreBrackets);
403 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
404 bool runOnMachineFunction(MachineFunction &MF) override;
405
406 StringRef getPassName() const override {
407 return "SI insert wait instructions";
408 }
409
410 void getAnalysisUsage(AnalysisUsage &AU) const override {
411 AU.setPreservesCFG();
412 AU.addRequired<MachineLoopInfo>();
413 AU.addRequired<MachinePostDominatorTree>();
414 MachineFunctionPass::getAnalysisUsage(AU);
415 }
416
417 bool isForceEmitWaitcnt() const {
418 for (auto T : inst_counter_types())
419 if (ForceEmitWaitcnt[T])
420 return true;
421 return false;
422 }
423
424 void setForceEmitWaitcnt() {
425// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
426// For debug builds, get the debug counter info and adjust if need be
427#ifndef NDEBUG
428 if (DebugCounter::isCounterSet(ForceExpCounter) &&
429 DebugCounter::shouldExecute(ForceExpCounter)) {
430 ForceEmitWaitcnt[EXP_CNT] = true;
431 } else {
432 ForceEmitWaitcnt[EXP_CNT] = false;
433 }
434
435 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
436 DebugCounter::shouldExecute(ForceLgkmCounter)) {
437 ForceEmitWaitcnt[LGKM_CNT] = true;
438 } else {
439 ForceEmitWaitcnt[LGKM_CNT] = false;
440 }
441
442 if (DebugCounter::isCounterSet(ForceVMCounter) &&
443 DebugCounter::shouldExecute(ForceVMCounter)) {
444 ForceEmitWaitcnt[VM_CNT] = true;
445 } else {
446 ForceEmitWaitcnt[VM_CNT] = false;
447 }
448#endif // NDEBUG
449 }
450
451 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
452 // FLAT instruction.
453 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
454 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
455 if (!ST->hasVscnt())
456 return VMEM_ACCESS;
457 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
458 // FLAT and SCRATCH instructions may access scratch. Other VMEM
459 // instructions do not.
460 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
461 return SCRATCH_WRITE_ACCESS;
462 return VMEM_WRITE_ACCESS;
463 }
464 return VMEM_READ_ACCESS;
465 }
466
467 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
468 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
469 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
470 bool generateWaitcntInstBefore(MachineInstr &MI,
471 WaitcntBrackets &ScoreBrackets,
472 MachineInstr *OldWaitcntInstr,
473 bool FlushVmCnt);
474 bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
475 WaitcntBrackets &ScoreBrackets,
476 MachineInstr *OldWaitcntInstr);
477 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
478 MachineBasicBlock::instr_iterator It,
479 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
480 MachineInstr *OldWaitcntInstr);
481 void updateEventWaitcntAfter(MachineInstr &Inst,
482 WaitcntBrackets *ScoreBrackets);
483 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
484 WaitcntBrackets &ScoreBrackets);
485 bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
486 MachineInstr &OldWaitcntInstr,
487 AMDGPU::Waitcnt &Wait,
488 MachineBasicBlock::instr_iterator It) const;
489};
490
491} // end anonymous namespace
492
493RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
494 const SIInstrInfo *TII,
495 const MachineRegisterInfo *MRI,
496 const SIRegisterInfo *TRI,
497 unsigned OpNo) const {
498 const MachineOperand &Op = MI->getOperand(OpNo);
499 if (!TRI->isInAllocatableClass(Op.getReg()))
500 return {-1, -1};
501
502 // A use via a PW operand does not need a waitcnt.
503 // A partial write is not a WAW.
504 assert(!Op.getSubReg() || !Op.isUndef());
505
506 RegInterval Result;
507
508 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
509 AMDGPU::HWEncoding::REG_IDX_MASK;
510
511 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
512 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
513 Result.first = Reg - Encoding.VGPR0;
514 if (TRI->isAGPR(*MRI, Op.getReg()))
515 Result.first += AGPR_OFFSET;
516 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
517 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
518 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
519 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
520 assert(Result.first >= NUM_ALL_VGPRS &&
521 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
522 }
523 // TODO: Handle TTMP
524 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
525 else
526 return {-1, -1};
527
528 const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
529 unsigned Size = TRI->getRegSizeInBits(*RC);
530 Result.second = Result.first + ((Size + 16) / 32);
531
532 return Result;
533}
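// The returned interval is half-open in the flat index space described by
// RegisterMapping: a 64-bit VGPR operand starting at v4 yields [4, 6), i.e.
// v4 and v5. The "(Size + 16) / 32" rounds 16-bit registers up so that they
// still occupy one 32-bit slot.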
534
535void WaitcntBrackets::setExpScore(const MachineInstr *MI,
536 const SIInstrInfo *TII,
537 const SIRegisterInfo *TRI,
538 const MachineRegisterInfo *MRI, unsigned OpNo,
539 unsigned Val) {
540 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
541 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
542 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
543 setRegScore(RegNo, EXP_CNT, Val);
544 }
545}
546
547 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
548 // they wrote can be accessed. A load from LDS to VMEM does not need a wait.
549static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
550 return SIInstrInfo::isVALU(MI) &&
551 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
552 MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
553}
554
555void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
556 const SIRegisterInfo *TRI,
558 WaitEventType E, MachineInstr &Inst) {
559 InstCounterType T = eventCounter(E);
560 unsigned CurrScore = getScoreUB(T) + 1;
561 if (CurrScore == 0)
562 report_fatal_error("InsertWaitcnt score wraparound");
563 // PendingEvents and ScoreUB need to be updated regardless of whether this
564 // event changes the score of a register or not.
565 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
566 PendingEvents |= 1 << E;
567 setScoreUB(T, CurrScore);
568
569 if (T == EXP_CNT) {
570 // Put score on the source vgprs. If this is a store, just use those
571 // specific register(s).
572 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
573 int AddrOpIdx =
574 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
575 // All GDS operations must protect their address register (same as
576 // export.)
577 if (AddrOpIdx != -1) {
578 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
579 }
580
581 if (Inst.mayStore()) {
582 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
583 setExpScore(
584 &Inst, TII, TRI, MRI,
585 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
586 CurrScore);
587 }
588 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
589 setExpScore(&Inst, TII, TRI, MRI,
590 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
591 AMDGPU::OpName::data1),
592 CurrScore);
593 }
594 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
595 Inst.getOpcode() != AMDGPU::DS_APPEND &&
596 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
597 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
598 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
599 const MachineOperand &Op = Inst.getOperand(I);
600 if (Op.isReg() && !Op.isDef() &&
601 TRI->isVectorRegister(*MRI, Op.getReg())) {
602 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
603 }
604 }
605 }
606 } else if (TII->isFLAT(Inst)) {
607 if (Inst.mayStore()) {
608 setExpScore(
609 &Inst, TII, TRI, MRI,
610 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
611 CurrScore);
612 } else if (SIInstrInfo::isAtomicRet(Inst)) {
613 setExpScore(
614 &Inst, TII, TRI, MRI,
615 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
616 CurrScore);
617 }
618 } else if (TII->isMIMG(Inst)) {
619 if (Inst.mayStore()) {
620 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
621 } else if (SIInstrInfo::isAtomicRet(Inst)) {
622 setExpScore(
623 &Inst, TII, TRI, MRI,
624 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
625 CurrScore);
626 }
627 } else if (TII->isMTBUF(Inst)) {
628 if (Inst.mayStore()) {
629 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
630 }
631 } else if (TII->isMUBUF(Inst)) {
632 if (Inst.mayStore()) {
633 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
634 } else if (SIInstrInfo::isAtomicRet(Inst)) {
635 setExpScore(
636 &Inst, TII, TRI, MRI,
637 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
638 CurrScore);
639 }
640 } else if (TII->isLDSDIR(Inst)) {
641 // LDSDIR instructions attach the score to the destination.
642 setExpScore(
643 &Inst, TII, TRI, MRI,
644 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
645 CurrScore);
646 } else {
647 if (TII->isEXP(Inst)) {
648 // For export the destination registers are really temps that
649 // can be used as the actual source after export patching, so
650 // we need to treat them like sources and set the EXP_CNT
651 // score.
652 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
653 MachineOperand &DefMO = Inst.getOperand(I);
654 if (DefMO.isReg() && DefMO.isDef() &&
655 TRI->isVGPR(*MRI, DefMO.getReg())) {
656 setRegScore(
657 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
658 EXP_CNT, CurrScore);
659 }
660 }
661 }
662 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
663 MachineOperand &MO = Inst.getOperand(I);
664 if (MO.isReg() && !MO.isDef() &&
665 TRI->isVectorRegister(*MRI, MO.getReg())) {
666 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
667 }
668 }
669 }
670#if 0 // TODO: check if this is handled by MUBUF code above.
671 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
672 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
673 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
674 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
675 unsigned OpNo;//TODO: find the OpNo for this operand;
676 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
677 for (int RegNo = Interval.first; RegNo < Interval.second;
678 ++RegNo) {
679 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
680 }
681#endif
682 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
683 // Match the score to the destination registers.
684 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
685 auto &Op = Inst.getOperand(I);
686 if (!Op.isReg() || !Op.isDef())
687 continue;
688 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
689 if (T == VM_CNT) {
690 if (Interval.first >= NUM_ALL_VGPRS)
691 continue;
692 if (updateVMCntOnly(Inst)) {
693 // updateVMCntOnly should only leave us with VGPRs:
694 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
695 // defs. That is required for a sane index into `VgprVmemTypes` below.
696 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
697 VmemType V = getVmemType(Inst);
698 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
699 VgprVmemTypes[RegNo] |= 1 << V;
700 }
701 }
702 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
703 setRegScore(RegNo, T, CurrScore);
704 }
705 }
706 if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
707 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
708 }
709 }
710}
711
712void WaitcntBrackets::print(raw_ostream &OS) {
713 OS << '\n';
714 for (auto T : inst_counter_types()) {
715 unsigned SR = getScoreRange(T);
716
717 switch (T) {
718 case VM_CNT:
719 OS << " VM_CNT(" << SR << "): ";
720 break;
721 case LGKM_CNT:
722 OS << " LGKM_CNT(" << SR << "): ";
723 break;
724 case EXP_CNT:
725 OS << " EXP_CNT(" << SR << "): ";
726 break;
727 case VS_CNT:
728 OS << " VS_CNT(" << SR << "): ";
729 break;
730 default:
731 OS << " UNKNOWN(" << SR << "): ";
732 break;
733 }
734
735 if (SR != 0) {
736 // Print vgpr scores.
737 unsigned LB = getScoreLB(T);
738
739 for (int J = 0; J <= VgprUB; J++) {
740 unsigned RegScore = getRegScore(J, T);
741 if (RegScore <= LB)
742 continue;
743 unsigned RelScore = RegScore - LB - 1;
744 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
745 OS << RelScore << ":v" << J << " ";
746 } else {
747 OS << RelScore << ":ds ";
748 }
749 }
750 // Also need to print sgpr scores for lgkm_cnt.
751 if (T == LGKM_CNT) {
752 for (int J = 0; J <= SgprUB; J++) {
753 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
754 if (RegScore <= LB)
755 continue;
756 unsigned RelScore = RegScore - LB - 1;
757 OS << RelScore << ":s" << J << " ";
758 }
759 }
760 }
761 OS << '\n';
762 }
763 OS << '\n';
764}
765
766/// Simplify the waitcnt, in the sense of removing redundant counts. Counts
767/// that are found to be redundant are reset to ~0u (i.e. no wait required).
768void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
769 simplifyWaitcnt(VM_CNT, Wait.VmCnt);
770 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
771 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
772 simplifyWaitcnt(VS_CNT, Wait.VsCnt);
773}
774
775void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
776 unsigned &Count) const {
777 // The number of outstanding events for this type, T, can be calculated
778 // as (UB - LB). If the current Count is greater than or equal to the number
779 // of outstanding events, then the wait for this counter is redundant.
780 if (Count >= getScoreRange(T))
781 Count = ~0u;
782}
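// For example, if three vector-memory operations are outstanding
// (UB - LB == 3), a requested vmcnt(3) cannot retire anything that is not
// already guaranteed and is dropped (set to ~0u), whereas vmcnt(2) is kept.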
783
784void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
785 AMDGPU::Waitcnt &Wait) const {
786 unsigned ScoreToWait = getRegScore(RegNo, T);
787
788 // If the score of src_operand falls within the bracket, we need an
789 // s_waitcnt instruction.
790 const unsigned LB = getScoreLB(T);
791 const unsigned UB = getScoreUB(T);
792 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
793 if ((T == VM_CNT || T == LGKM_CNT) &&
794 hasPendingFlat() &&
795 !ST->hasFlatLgkmVMemCountInOrder()) {
796 // If there is a pending FLAT operation, and this is a VMem or LGKM
797 // waitcnt and the target can report early completion, then we need
798 // to force a waitcnt 0.
799 addWait(Wait, T, 0);
800 } else if (counterOutOfOrder(T)) {
801 // Counter can get decremented out-of-order when there
802 // are multiple event types in the bracket, so emit an s_waitcnt
803 // with a conservative value of 0 for the counter.
804 addWait(Wait, T, 0);
805 } else {
806 // If a counter has been maxed out avoid overflow by waiting for
807 // MAX(CounterType) - 1 instead.
808 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
809 addWait(Wait, T, NeededWait);
810 }
811 }
812}
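// Example (for the in-order case handled by the final branch above): with
// UB == 10 and a register whose score is 8, two younger events were issued
// after the producer, so NeededWait == 2 and waiting for a count of 2 lets
// those two stay in flight while still forcing the producer to complete.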
813
814void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
815 applyWaitcnt(VM_CNT, Wait.VmCnt);
816 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
817 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
818 applyWaitcnt(VS_CNT, Wait.VsCnt);
819}
820
821void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
822 const unsigned UB = getScoreUB(T);
823 if (Count >= UB)
824 return;
825 if (Count != 0) {
826 if (counterOutOfOrder(T))
827 return;
828 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
829 } else {
830 setScoreLB(T, UB);
831 PendingEvents &= ~WaitEventMaskForInst[T];
832 }
833}
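// In other words, observing an s_waitcnt with count N guarantees that all but
// the N youngest events of this type have completed, so the lower bound can be
// raised to UB - N; a count of 0 clears the whole bracket and its pending
// events. If the counter can complete out of order, a nonzero count proves
// nothing and the bracket is left unchanged.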
834
835// Where there are multiple types of event in the bracket of a counter, the
836// decrement may go out of order.
837bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
838 // Scalar memory reads can always go out of order.
839 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
840 return true;
841 return hasMixedPendingEvents(T);
842}
843
844INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
845 false)
846INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
847INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
848INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
849 false)
850
851char SIInsertWaitcnts::ID = 0;
852
853char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
854
855FunctionPass *llvm::createSIInsertWaitcntsPass() {
856 return new SIInsertWaitcnts();
857}
858
859static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
860 unsigned NewEnc) {
861 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
862 assert(OpIdx >= 0);
863
864 MachineOperand &MO = MI.getOperand(OpIdx);
865
866 if (NewEnc == MO.getImm())
867 return false;
868
869 MO.setImm(NewEnc);
870 return true;
871}
872
873/// Combine consecutive waitcnt instructions that precede \p It and follow
874/// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
875/// by previous passes. Currently this pass conservatively assumes that these
876/// preexisting waitcnts are required for correctness.
877bool SIInsertWaitcnts::applyPreexistingWaitcnt(
878 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
879 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
880 bool Modified = false;
881 MachineInstr *WaitcntInstr = nullptr;
882 MachineInstr *WaitcntVsCntInstr = nullptr;
883
884 for (auto &II :
885 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
886 if (II.isMetaInstruction())
887 continue;
888
889 if (II.getOpcode() == AMDGPU::S_WAITCNT) {
890 // Conservatively update required wait if this waitcnt was added in an
891 // earlier pass. In this case it will not exist in the tracked waitcnt
892 // set.
893 if (!TrackedWaitcntSet.count(&II)) {
894 unsigned IEnc = II.getOperand(0).getImm();
895 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
896 Wait = Wait.combined(OldWait);
897 }
898
899 // Merge consecutive waitcnt of the same type by erasing multiples.
900 if (!WaitcntInstr) {
901 WaitcntInstr = &II;
902 } else {
903 II.eraseFromParent();
904 Modified = true;
905 }
906
907 } else {
908 assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
909 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
910 if (!TrackedWaitcntSet.count(&II)) {
911 unsigned OldVSCnt =
912 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
913 Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
914 }
915
916 if (!WaitcntVsCntInstr) {
917 WaitcntVsCntInstr = &II;
918 } else {
919 II.eraseFromParent();
920 Modified = true;
921 }
922 }
923 }
924
925 // Update the encoding of the merged waitcnt with the required wait.
926 if (WaitcntInstr) {
927 if (Wait.hasWaitExceptVsCnt()) {
928 Modified |=
929 updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
930 AMDGPU::encodeWaitcnt(IV, Wait));
931 ScoreBrackets.applyWaitcnt(Wait);
932 Wait.VmCnt = ~0u;
933 Wait.LgkmCnt = ~0u;
934 Wait.ExpCnt = ~0u;
935
936 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
937 ? dbgs() << "applyPreexistingWaitcnt\n"
938 << "New Instr at block end: " << *WaitcntInstr
939 << '\n'
940 : dbgs() << "applyPreexistingWaitcnt\n"
941 << "Old Instr: " << *It
942 << "New Instr: " << *WaitcntInstr << '\n');
943
944 } else {
945 WaitcntInstr->eraseFromParent();
946 Modified = true;
947 }
948 }
949
950 if (WaitcntVsCntInstr) {
951 if (Wait.hasWaitVsCnt()) {
952 assert(ST->hasVscnt());
953 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
954 AMDGPU::OpName::simm16, Wait.VsCnt);
955 ScoreBrackets.applyWaitcnt(Wait);
956 Wait.VsCnt = ~0u;
957
958 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
959 ? dbgs() << "applyPreexistingWaitcnt\n"
960 << "New Instr at block end: "
961 << *WaitcntVsCntInstr << '\n'
962 : dbgs() << "applyPreexistingWaitcnt\n"
963 << "Old Instr: " << *It
964 << "New Instr: " << *WaitcntVsCntInstr << '\n');
965 } else {
966 WaitcntVsCntInstr->eraseFromParent();
967 Modified = true;
968 }
969 }
970
971 return Modified;
972}
973
974static bool readsVCCZ(const MachineInstr &MI) {
975 unsigned Opc = MI.getOpcode();
976 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
977 !MI.getOperand(1).isUndef();
978}
979
980/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
981static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
982 // Currently all conventions wait, but this may not always be the case.
983 //
984 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
985 // sense to omit the wait and do it in the caller.
986 return true;
987}
988
989/// \returns true if the callee is expected to wait for any outstanding waits
990/// before returning.
991static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
992 return true;
993}
994
995/// Generate an s_waitcnt instruction to be placed before \p MI.
996/// Instructions of a given type are returned in order,
997/// but instructions of different types can complete out of order.
998/// We rely on this in-order completion
999/// and simply assign a score to the memory access instructions.
1000/// We keep track of the active "score bracket" to determine
1001/// whether a memory access requires an s_waitcnt,
1002/// and if so what the value of each counter is.
1003/// The "score bracket" is bounded by the lower bound and upper bound
1004/// scores (*_score_LB and *_score_ub respectively).
1005/// If FlushVmCnt is true, that means we want to generate an s_waitcnt to
1006/// flush the vmcnt counter here.
1007bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1008 WaitcntBrackets &ScoreBrackets,
1009 MachineInstr *OldWaitcntInstr,
1010 bool FlushVmCnt) {
1011 setForceEmitWaitcnt();
1012
1013 if (MI.isMetaInstruction())
1014 return false;
1015
1016 AMDGPU::Waitcnt Wait;
1017
1018 // FIXME: This should have already been handled by the memory legalizer.
1019 // Removing this currently doesn't affect any lit tests, but we need to
1020 // verify that nothing was relying on this. The number of buffer invalidates
1021 // being handled here should not be expanded.
1022 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1023 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1024 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1025 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1026 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1027 Wait.VmCnt = 0;
1028 }
1029
1030 // All waits must be resolved at call return.
1031 // NOTE: this could be improved with knowledge of all call sites or
1032 // with knowledge of the called routines.
1033 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1034 MI.getOpcode() == AMDGPU::SI_RETURN ||
1035 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1036 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1037 Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1038 }
1039 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1040 // stores. In this case it can be useful to send a message to explicitly
1041 // release all VGPRs before the stores have completed, but it is only safe to
1042 // do this if there are no outstanding scratch stores.
1043 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1044 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1045 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1046 ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
1047 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1048 ReleaseVGPRInsts.insert(&MI);
1049 }
1050 // Resolve vm waits before gs-done.
1051 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1052 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1053 ST->hasLegacyGeometry() &&
1054 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1055 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1056 Wait.VmCnt = 0;
1057 }
1058#if 0 // TODO: the following blocks of logic when we have fence.
1059 else if (MI.getOpcode() == SC_FENCE) {
1060 const unsigned int group_size =
1061 context->shader_info->GetMaxThreadGroupSize();
1062 // group_size == 0 means thread group size is unknown at compile time
1063 const bool group_is_multi_wave =
1064 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1065 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1066
1067 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1068 SCRegType src_type = Inst->GetSrcType(i);
1069 switch (src_type) {
1070 case SCMEM_LDS:
1071 if (group_is_multi_wave ||
1072 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1073 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1074 ScoreBrackets->getScoreUB(LGKM_CNT));
1075 // LDS may have to wait for VM_CNT after buffer load to LDS
1076 if (target_info->HasBufferLoadToLDS()) {
1077 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1078 ScoreBrackets->getScoreUB(VM_CNT));
1079 }
1080 }
1081 break;
1082
1083 case SCMEM_GDS:
1084 if (group_is_multi_wave || fence_is_global) {
1085 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1086 ScoreBrackets->getScoreUB(EXP_CNT));
1087 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1088 ScoreBrackets->getScoreUB(LGKM_CNT));
1089 }
1090 break;
1091
1092 case SCMEM_UAV:
1093 case SCMEM_TFBUF:
1094 case SCMEM_RING:
1095 case SCMEM_SCATTER:
1096 if (group_is_multi_wave || fence_is_global) {
1097 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1098 ScoreBrackets->getScoreUB(EXP_CNT));
1099 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1100 ScoreBrackets->getScoreUB(VM_CNT));
1101 }
1102 break;
1103
1104 case SCMEM_SCRATCH:
1105 default:
1106 break;
1107 }
1108 }
1109 }
1110#endif
1111
1112 // Export & GDS instructions do not read the EXEC mask until after the export
1113 // is granted (which can occur well after the instruction is issued).
1114 // The shader program must flush all EXP operations on the export-count
1115 // before overwriting the EXEC mask.
1116 else {
1117 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1118 // Export and GDS are tracked individually, either may trigger a waitcnt
1119 // for EXEC.
1120 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1121 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1122 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1123 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1124 Wait.ExpCnt = 0;
1125 }
1126 }
1127
1128 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1129 // The function is going to insert a wait on everything in its prolog.
1130 // We still need to be careful if the call target is a load (e.g. a GOT
1131 // load). We also need to check the WAW dependency with the saved PC.
1132 Wait.LgkmCnt = 0;
1133
1134 int CallAddrOpIdx =
1135 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1136
1137 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1138 RegInterval CallAddrOpInterval =
1139 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1140
1141 for (int RegNo = CallAddrOpInterval.first;
1142 RegNo < CallAddrOpInterval.second; ++RegNo)
1143 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1144
1145 int RtnAddrOpIdx =
1146 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1147 if (RtnAddrOpIdx != -1) {
1148 RegInterval RtnAddrOpInterval =
1149 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1150
1151 for (int RegNo = RtnAddrOpInterval.first;
1152 RegNo < RtnAddrOpInterval.second; ++RegNo)
1153 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1154 }
1155 }
1156 } else {
1157 // FIXME: Should not be relying on memoperands.
1158 // Look at the source operands of every instruction to see if
1159 // any of them results from a previous memory operation that affects
1160 // its current usage. If so, an s_waitcnt instruction needs to be
1161 // emitted.
1162 // If the source operand was defined by a load, add the s_waitcnt
1163 // instruction.
1164 //
1165 // Two cases are handled for destination operands:
1166 // 1) If the destination operand was defined by a load, add the s_waitcnt
1167 // instruction to guarantee the right WAW order.
1168 // 2) If a destination operand was used by a recent export/store instruction,
1169 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1170 for (const MachineMemOperand *Memop : MI.memoperands()) {
1171 const Value *Ptr = Memop->getValue();
1172 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1173 addWait(Wait, LGKM_CNT, 0);
1174 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1175 SLoadAddresses.erase(Ptr);
1176 }
1177 unsigned AS = Memop->getAddrSpace();
1178 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1179 continue;
1180 // No need to wait before load from VMEM to LDS.
1181 if (mayWriteLDSThroughDMA(MI))
1182 continue;
1183 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1184 // VM_CNT is only relevant to vgpr or LDS.
1185 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1186 if (Memop->isStore()) {
1187 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1188 }
1189 }
1190
1191 // Loop over use and def operands.
1192 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1193 MachineOperand &Op = MI.getOperand(I);
1194 if (!Op.isReg())
1195 continue;
1196
1197 // If the instruction does not read tied source, skip the operand.
1198 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1199 continue;
1200
1201 RegInterval Interval =
1202 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1203
1204 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1205 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1206 if (IsVGPR) {
1207 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1208 // previous write and this write are the same type of VMEM
1209 // instruction, in which case they're guaranteed to write their
1210 // results in order anyway.
1211 if (Op.isUse() || !updateVMCntOnly(MI) ||
1212 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1213 getVmemType(MI))) {
1214 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1215 ScoreBrackets.clearVgprVmemTypes(RegNo);
1216 }
1217 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1218 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1219 }
1220 }
1221 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1222 }
1223 }
1224 }
1225 }
1226
1227 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1228 // not, we need to ensure the subtarget is capable of backing off barrier
1229 // instructions in case there are any outstanding memory operations that may
1230 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1231 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1232 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1233 Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
1234 }
1235
1236 // TODO: Remove this work-around, enable the assert for Bug 457939
1237 // after fixing the scheduler. Also, the Shader Compiler code is
1238 // independent of target.
1239 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1240 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1241 Wait.LgkmCnt = 0;
1242 }
1243 }
1244
1245 // Verify that the wait is actually needed.
1246 ScoreBrackets.simplifyWaitcnt(Wait);
1247
1248 if (ForceEmitZeroWaitcnts)
1249 Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();
1250
1251 if (ForceEmitWaitcnt[VM_CNT])
1252 Wait.VmCnt = 0;
1253 if (ForceEmitWaitcnt[EXP_CNT])
1254 Wait.ExpCnt = 0;
1255 if (ForceEmitWaitcnt[LGKM_CNT])
1256 Wait.LgkmCnt = 0;
1257
1258 if (FlushVmCnt) {
1259 if (ScoreBrackets.hasPendingEvent(VM_CNT))
1260 Wait.VmCnt = 0;
1261 }
1262
1263 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1264 OldWaitcntInstr);
1265}
1266
1267// Add a waitcnt to flush the vmcnt counter at the end of the given block if
1268// needed.
1269bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1270 WaitcntBrackets &ScoreBrackets,
1271 MachineInstr *OldWaitcntInstr) {
1272 AMDGPU::Waitcnt Wait;
1273
1274 if (!ScoreBrackets.hasPendingEvent(VM_CNT))
1275 return false;
1276
1277 Wait.VmCnt = 0;
1278
1279 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1280 OldWaitcntInstr);
1281}
1282
1283 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1284 MachineBasicBlock::instr_iterator It,
1285 MachineBasicBlock &Block,
1286 WaitcntBrackets &ScoreBrackets,
1287 MachineInstr *OldWaitcntInstr) {
1288 bool Modified = false;
1289 const DebugLoc &DL = Block.findDebugLoc(It);
1290
1291 if (OldWaitcntInstr)
1292 // Try to merge the required wait with preexisting waitcnt instructions.
1293 // Also erase redundant waitcnt.
1294 Modified =
1295 applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1296 else
1297 ScoreBrackets.applyWaitcnt(Wait);
1298
1299 // ExpCnt can be merged into VINTERP.
1300 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1301 SIInstrInfo::isVINTERP(*It)) {
1302 MachineOperand *WaitExp =
1303 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1304 if (Wait.ExpCnt < WaitExp->getImm()) {
1305 WaitExp->setImm(Wait.ExpCnt);
1306 Modified = true;
1307 }
1308 Wait.ExpCnt = ~0u;
1309
1310 LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1311 << "Update Instr: " << *It);
1312 }
1313
1314 // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1315 // instruction was modified to handle the required wait.
1316 if (Wait.hasWaitExceptVsCnt()) {
1317 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1318 auto SWaitInst =
1319 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1320 TrackedWaitcntSet.insert(SWaitInst);
1321 Modified = true;
1322
1323 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1324 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1325 dbgs() << "New Instr: " << *SWaitInst << '\n');
1326 }
1327
1328 if (Wait.hasWaitVsCnt()) {
1329 assert(ST->hasVscnt());
1330
1331 auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1332 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1333 .addImm(Wait.VsCnt);
1334 TrackedWaitcntSet.insert(SWaitInst);
1335 Modified = true;
1336
1337 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1338 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1339 dbgs() << "New Instr: " << *SWaitInst << '\n');
1340 }
1341 return Modified;
1342}
1343
1344// This is a flat memory operation. Check to see if it has memory tokens other
1345// than LDS. Other address spaces supported by flat memory operations involve
1346// global memory.
1347bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1348 assert(TII->isFLAT(MI));
1349
1350 // All flat instructions use the VMEM counter.
1351 assert(TII->usesVM_CNT(MI));
1352
1353 // If there are no memory operands then conservatively assume the flat
1354 // operation may access VMEM.
1355 if (MI.memoperands_empty())
1356 return true;
1357
1358 // See if any memory operand specifies an address space that involves VMEM.
1359 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1360 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1361 // (GDS) address space is not supported by flat operations. Therefore, simply
1362 // return true unless only the LDS address space is found.
1363 for (const MachineMemOperand *Memop : MI.memoperands()) {
1364 unsigned AS = Memop->getAddrSpace();
1366 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1367 return true;
1368 }
1369
1370 return false;
1371}
1372
1373// This is a flat memory operation. Check to see if it has memory tokens for
1374// either LDS or FLAT.
1375bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1376 assert(TII->isFLAT(MI));
1377
1378 // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1379 if (!TII->usesLGKM_CNT(MI))
1380 return false;
1381
1382 // If in tgsplit mode then there can be no use of LDS.
1383 if (ST->isTgSplitEnabled())
1384 return false;
1385
1386 // If there are no memory operands then conservatively assume the flat
1387 // operation may access LDS.
1388 if (MI.memoperands_empty())
1389 return true;
1390
1391 // See if any memory operand specifies an address space that involves LDS.
1392 for (const MachineMemOperand *Memop : MI.memoperands()) {
1393 unsigned AS = Memop->getAddrSpace();
1394 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1395 return true;
1396 }
1397
1398 return false;
1399}
1400
1401// This is a flat memory operation. Check to see if it has memory tokens for
1402// either scratch or FLAT.
1403bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1404 const MachineInstr &MI) const {
1405 assert(TII->isFLAT(MI));
1406
1407 // SCRATCH instructions always access scratch.
1408 if (TII->isFLATScratch(MI))
1409 return true;
1410
1411 // GLOBAL instructions never access scratch.
1412 if (TII->isFLATGlobal(MI))
1413 return false;
1414
1415 // If there are no memory operands then conservatively assume the flat
1416 // operation may access scratch.
1417 if (MI.memoperands_empty())
1418 return true;
1419
1420 // See if any memory operand specifies an address space that involves scratch.
1421 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
1422 unsigned AS = Memop->getAddrSpace();
1423 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
1424 });
1425}
1426
1427void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1428 WaitcntBrackets *ScoreBrackets) {
1429 // Now look at the instruction opcode. If it is a memory access
1430 // instruction, update the upper-bound of the appropriate counter's
1431 // bracket and the destination operand scores.
1432 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1433 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1434 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1435 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1436 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1437 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1438 } else {
1439 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1440 }
1441 } else if (TII->isFLAT(Inst)) {
1442 assert(Inst.mayLoadOrStore());
1443
1444 int FlatASCount = 0;
1445
1446 if (mayAccessVMEMThroughFlat(Inst)) {
1447 ++FlatASCount;
1448 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1449 Inst);
1450 }
1451
1452 if (mayAccessLDSThroughFlat(Inst)) {
1453 ++FlatASCount;
1454 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1455 }
1456
1457 // A Flat memory operation must access at least one address space.
1458 assert(FlatASCount);
1459
1460 // This is a flat memory operation that accesses both VMEM and LDS, so note
1461 // it - it will require that both the VM and LGKM be flushed to zero if it is
1462 // pending when a VM or LGKM dependency occurs.
1463 if (FlatASCount > 1)
1464 ScoreBrackets->setPendingFlat();
1465 } else if (SIInstrInfo::isVMEM(Inst) &&
1467 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1468 Inst);
1469
1470 if (ST->vmemWriteNeedsExpWaitcnt() &&
1471 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1472 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1473 }
1474 } else if (TII->isSMRD(Inst)) {
1475 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1476 } else if (Inst.isCall()) {
1477 if (callWaitsOnFunctionReturn(Inst)) {
1478 // Act as a wait on everything
1479 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
1480 } else {
1481 // May need to wait for anything.
1482 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1483 }
1484 } else if (SIInstrInfo::isLDSDIR(Inst)) {
1485 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1486 } else if (TII->isVINTERP(Inst)) {
1487 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1488 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1489 } else if (SIInstrInfo::isEXP(Inst)) {
1490 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1491 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1492 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1493 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1494 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1495 else
1496 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1497 } else {
1498 switch (Inst.getOpcode()) {
1499 case AMDGPU::S_SENDMSG:
1500 case AMDGPU::S_SENDMSG_RTN_B32:
1501 case AMDGPU::S_SENDMSG_RTN_B64:
1502 case AMDGPU::S_SENDMSGHALT:
1503 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1504 break;
1505 case AMDGPU::S_MEMTIME:
1506 case AMDGPU::S_MEMREALTIME:
1507 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1508 break;
1509 }
1510 }
1511}
1512
1513bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1514 unsigned OtherScore) {
1515 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1516 unsigned OtherShifted =
1517 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1518 Score = std::max(MyShifted, OtherShifted);
1519 return OtherShifted > MyShifted;
1520}
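// mergeScore rebases both incoming scores into the merged bracket: scores at
// or below their old lower bound carry no pending information and collapse to
// 0, everything else is shifted by the amount its upper bound moved, and the
// maximum of the two rebased scores is kept. The return value reports whether
// the other block contributed a strictly newer score.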
1521
1522/// Merge the pending events and associated score brackets of \p Other into
1523/// this brackets status.
1524///
1525/// Returns whether the merge resulted in a change that requires tighter waits
1526/// (i.e. the merged brackets strictly dominate the original brackets).
1527bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1528 bool StrictDom = false;
1529
1530 VgprUB = std::max(VgprUB, Other.VgprUB);
1531 SgprUB = std::max(SgprUB, Other.SgprUB);
1532
1533 for (auto T : inst_counter_types()) {
1534 // Merge event flags for this counter
1535 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1536 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1537 if (OtherEvents & ~OldEvents)
1538 StrictDom = true;
1539 PendingEvents |= OtherEvents;
1540
1541 // Merge scores for this counter
1542 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1543 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1544 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1545 if (NewUB < ScoreLBs[T])
1546 report_fatal_error("waitcnt score overflow");
1547
1548 MergeInfo M;
1549 M.OldLB = ScoreLBs[T];
1550 M.OtherLB = Other.ScoreLBs[T];
1551 M.MyShift = NewUB - ScoreUBs[T];
1552 M.OtherShift = NewUB - Other.ScoreUBs[T];
1553
1554 ScoreUBs[T] = NewUB;
1555
1556 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1557
1558 for (int J = 0; J <= VgprUB; J++)
1559 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1560
1561 if (T == LGKM_CNT) {
1562 for (int J = 0; J <= SgprUB; J++)
1563 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1564 }
1565 }
1566
1567 for (int J = 0; J <= VgprUB; J++) {
1568 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1569 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
1570 VgprVmemTypes[J] = NewVmemTypes;
1571 }
1572
1573 return StrictDom;
1574}
1575
1576static bool isWaitInstr(MachineInstr &Inst) {
1577 return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1578 (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1579 Inst.getOperand(0).isReg() &&
1580 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1581}
1582
1583// Generate s_waitcnt instructions where needed.
1584 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1585 MachineBasicBlock &Block,
1586 WaitcntBrackets &ScoreBrackets) {
1587 bool Modified = false;
1588
1589 LLVM_DEBUG({
1590 dbgs() << "*** Block" << Block.getNumber() << " ***";
1591 ScoreBrackets.dump();
1592 });
1593
1594 // Track the correctness of vccz through this basic block. There are two
1595 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1596 // ST->partialVCCWritesUpdateVCCZ().
1597 bool VCCZCorrect = true;
1598 if (ST->hasReadVCCZBug()) {
1599 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1600 // to vcc and then issued an smem load.
1601 VCCZCorrect = false;
1602 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1603 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1604 // to vcc_lo or vcc_hi.
1605 VCCZCorrect = false;
1606 }
1607
1608 // Walk over the instructions.
1609 MachineInstr *OldWaitcntInstr = nullptr;
1610
1611 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1612 E = Block.instr_end();
1613 Iter != E;) {
1614 MachineInstr &Inst = *Iter;
1615
1616 // Track pre-existing waitcnts that were added in earlier iterations or by
1617 // the memory legalizer.
1618 if (isWaitInstr(Inst)) {
1619 if (!OldWaitcntInstr)
1620 OldWaitcntInstr = &Inst;
1621 ++Iter;
1622 continue;
1623 }
1624
1625 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1626 isPreheaderToFlush(Block, ScoreBrackets);
1627
1628 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1629 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1630 FlushVmCnt);
1631 OldWaitcntInstr = nullptr;
1632
1633 // Restore vccz if it's not known to be correct already.
1634 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1635
1636 // Don't examine operands unless we need to track vccz correctness.
1637 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1638 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1639 Inst.definesRegister(AMDGPU::VCC_HI)) {
1640 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1641 if (!ST->partialVCCWritesUpdateVCCZ())
1642 VCCZCorrect = false;
1643 } else if (Inst.definesRegister(AMDGPU::VCC)) {
1644 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
1645 // the vccz bit, so when we detect that an instruction may read from a
1646 // corrupt vccz bit, we need to:
1647 // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
1648 // operations to complete.
1649 // 2. Restore the correct value of vccz by writing the current value
1650 // of vcc back to vcc.
1651 if (ST->hasReadVCCZBug() &&
1652 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1653 // Writes to vcc while there's an outstanding smem read may get
1654 // clobbered as soon as any read completes.
1655 VCCZCorrect = false;
1656 } else {
1657 // Writes to vcc will fix any incorrect value in vccz.
1658 VCCZCorrect = true;
1659 }
1660 }
1661 }
1662
1663 if (TII->isSMRD(Inst)) {
1664 for (const MachineMemOperand *Memop : Inst.memoperands()) {
1665 // No need to handle invariant loads when avoiding WAR conflicts, as
1666 // there cannot be a vector store to the same memory location.
1667 if (!Memop->isInvariant()) {
1668 const Value *Ptr = Memop->getValue();
1669 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
1670 }
1671 }
1672 if (ST->hasReadVCCZBug()) {
1673 // This smem read could complete and clobber vccz at any time.
1674 VCCZCorrect = false;
1675 }
1676 }
1677
1678 updateEventWaitcntAfter(Inst, &ScoreBrackets);
1679
1680#if 0 // TODO: implement resource type check controlled by options with ub = LB.
1681 // If this instruction generates an S_SETVSKIP because it is an
1682 // indexed resource, and we are on Tahiti, then it will also force
1683 // an S_WAITCNT vmcnt(0)
1684 if (RequireCheckResourceType(Inst, context)) {
1685 // Force the score as if an S_WAITCNT vmcnt(0) had been emitted.
1686 ScoreBrackets->setScoreLB(VM_CNT,
1687 ScoreBrackets->getScoreUB(VM_CNT));
1688 }
1689#endif
1690
1691 LLVM_DEBUG({
1692 Inst.print(dbgs());
1693 ScoreBrackets.dump();
1694 });
1695
1696 // TODO: Remove this work-around after fixing the scheduler and enable the
1697 // assert above.
1698 if (RestoreVCCZ) {
1699 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1700 // bit is updated, so we can restore the bit by reading the value of
1701 // vcc and then writing it back to the register.
1702 BuildMI(Block, Inst, Inst.getDebugLoc(),
1703 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1704 TRI->getVCC())
1705 .addReg(TRI->getVCC());
1706 VCCZCorrect = true;
1707 Modified = true;
1708 }
1709
1710 ++Iter;
1711 }
1712
1713 if (Block.getFirstTerminator() == Block.end() &&
1714 isPreheaderToFlush(Block, ScoreBrackets))
1715 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1716
1717 return Modified;
1718}
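For reference (not part of this file): the per-counter requirements computed by generateWaitcntInstBefore end up packed into a single s_waitcnt immediate via the AMDGPU helpers this pass uses. A rough sketch, assuming it is compiled inside the AMDGPU backend where these headers are visible; "gfx900" is only an example target:

#include "MCTargetDesc/AMDGPUBaseInfo.h"    // encodeWaitcnt, decodeWaitcnt, bit masks
#include "llvm/TargetParser/TargetParser.h" // AMDGPU::IsaVersion, getIsaVersion

using namespace llvm;

static unsigned packWaitcntSketch() {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx900"); // example target
  unsigned Vmcnt = 3;                             // allow 3 vector-memory ops in flight
  unsigned Expcnt = AMDGPU::getExpcntBitMask(IV); // maximum value, i.e. no export wait
  unsigned Lgkmcnt = 0;                           // drain all LDS/GDS/SMEM ops
  unsigned Imm = AMDGPU::encodeWaitcnt(IV, Vmcnt, Expcnt, Lgkmcnt);

  // Round-trip to recover the individual fields from the immediate.
  unsigned V, E, L;
  AMDGPU::decodeWaitcnt(IV, Imm, V, E, L);
  return Imm;
}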
1719
1720// Return true if the given machine basic block is a preheader of a loop in
1721// which we want to flush the vmcnt counter, and false otherwise.
1722bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1723 WaitcntBrackets &ScoreBrackets) {
1724 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
1725 if (!IsInserted)
1726 return Iterator->second;
1727
1728 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1729 if (!Succ)
1730 return false;
1731
1732 MachineLoop *Loop = MLI->getLoopFor(Succ);
1733 if (!Loop)
1734 return false;
1735
1736 if (Loop->getLoopPreheader() == &MBB &&
1737 shouldFlushVmCnt(Loop, ScoreBrackets)) {
1738 Iterator->second = true;
1739 return true;
1740 }
1741
1742 return false;
1743}
1744
1745bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
1746 return SIInstrInfo::isVMEM(MI) ||
1747 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
1748}
1749
1750// Return true if it is better to flush the vmcnt counter in the preheader of
1751// the given loop. We currently decide to flush in two situations:
1752// 1. The loop contains vmem store(s), no vmem load and at least one use of a
1753// vgpr containing a value that is loaded outside of the loop. (Only on
1754// targets with no vscnt counter).
1755// 2. The loop contains vmem load(s), but the loaded values are not used in the
1756// loop, and at least one use of a vgpr containing a value that is loaded
1757// outside of the loop.
1758bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1759 WaitcntBrackets &Brackets) {
1760 bool HasVMemLoad = false;
1761 bool HasVMemStore = false;
1762 bool UsesVgprLoadedOutside = false;
1763 DenseSet<Register> VgprUse;
1764 DenseSet<Register> VgprDef;
1765
1766 for (MachineBasicBlock *MBB : ML->blocks()) {
1767 for (MachineInstr &MI : *MBB) {
1768 if (isVMEMOrFlatVMEM(MI)) {
1769 if (MI.mayLoad())
1770 HasVMemLoad = true;
1771 if (MI.mayStore())
1772 HasVMemStore = true;
1773 }
1774 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1775 MachineOperand &Op = MI.getOperand(I);
1776 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1777 continue;
1778 RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1779 // Vgpr use
1780 if (Op.isUse()) {
1781 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1782 // If we find a register that is loaded inside the loop, 1. and 2.
1783 // are invalidated and we can exit.
1784 if (VgprDef.contains(RegNo))
1785 return false;
1786 VgprUse.insert(RegNo);
1787 // If at least one of Op's registers is in the score brackets, the
1788 // value is likely loaded outside of the loop.
1789 if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
1790 UsesVgprLoadedOutside = true;
1791 break;
1792 }
1793 }
1794 }
1795 // VMem load vgpr def
1796 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
1797 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1798 // If we find a register that is loaded inside the loop, 1. and 2.
1799 // are invalidated and we can exit.
1800 if (VgprUse.contains(RegNo))
1801 return false;
1802 VgprDef.insert(RegNo);
1803 }
1804 }
1805 }
1806 }
1807 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1808 return true;
1809 return HasVMemLoad && UsesVgprLoadedOutside;
1810}
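Distilled into a standalone predicate (illustrative only; the flag names match the locals above), the two situations described before shouldFlushVmCnt reduce to:

// Situation 1: store-only loops matter only when stores count against vmcnt,
// i.e. on targets without a separate vscnt counter.
// Situation 2: the loop issues vmem loads but the vgprs it reads were loaded
// outside the loop.
static bool shouldFlushVmCntDecision(bool TargetHasVscnt, bool HasVMemLoad,
                                     bool HasVMemStore,
                                     bool UsesVgprLoadedOutside) {
  if (!TargetHasVscnt && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside;
}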
1811
1812bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1813 ST = &MF.getSubtarget<GCNSubtarget>();
1814 TII = ST->getInstrInfo();
1815 TRI = &TII->getRegisterInfo();
1816 MRI = &MF.getRegInfo();
1817 IV = AMDGPU::getIsaVersion(ST->getCPU());
1818 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1819 MLI = &getAnalysis<MachineLoopInfo>();
1820 PDT = &getAnalysis<MachinePostDominatorTree>();
1821
1822 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1823 for (auto T : inst_counter_types())
1824 ForceEmitWaitcnt[T] = false;
1825
1826 OptNone = MF.getFunction().hasOptNone() ||
1827 MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
1828
1829 HardwareLimits Limits = {};
1830 Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1831 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1832 Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1833 Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1834
1835 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1836 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1837 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1838 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1839
1840 RegisterEncoding Encoding = {};
1841 Encoding.VGPR0 =
1842 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::EncValues::REG_IDX_MASK;
1843 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1844 Encoding.SGPR0 =
1845 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::EncValues::REG_IDX_MASK;
1846 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1847
1848 TrackedWaitcntSet.clear();
1849 BlockInfos.clear();
1850 bool Modified = false;
1851
1852 if (!MFI->isEntryFunction()) {
1853 // Wait for any outstanding memory operations that the input registers may
1854 // depend on. We can't track them and it's better to do the wait after the
1855 // costly call sequence.
1856
1857 // TODO: Could insert earlier and schedule more liberally with operations
1858 // that only use caller preserved registers.
1859 MachineBasicBlock &EntryBB = MF.front();
1860 MachineBasicBlock::iterator I = EntryBB.begin();
1861 for (MachineBasicBlock::iterator E = EntryBB.end();
1862 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1863 ;
1864 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1865
1866 Modified = true;
1867 }
1868
1869 // Keep iterating over the blocks in reverse post order, inserting and
1870 // updating s_waitcnt where needed, until a fixed point is reached.
1871 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1872 BlockInfos.insert({MBB, BlockInfo()});
1873
1874 std::unique_ptr<WaitcntBrackets> Brackets;
1875 bool Repeat;
1876 do {
1877 Repeat = false;
1878
1879 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1880 ++BII) {
1881 MachineBasicBlock *MBB = BII->first;
1882 BlockInfo &BI = BII->second;
1883 if (!BI.Dirty)
1884 continue;
1885
1886 if (BI.Incoming) {
1887 if (!Brackets)
1888 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1889 else
1890 *Brackets = *BI.Incoming;
1891 } else {
1892 if (!Brackets)
1893 Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1894 else
1895 *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1896 }
1897
1898 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
1899 BI.Dirty = false;
1900
1901 if (Brackets->hasPendingEvent()) {
1902 BlockInfo *MoveBracketsToSucc = nullptr;
1903 for (MachineBasicBlock *Succ : MBB->successors()) {
1904 auto SuccBII = BlockInfos.find(Succ);
1905 BlockInfo &SuccBI = SuccBII->second;
1906 if (!SuccBI.Incoming) {
1907 SuccBI.Dirty = true;
1908 if (SuccBII <= BII)
1909 Repeat = true;
1910 if (!MoveBracketsToSucc) {
1911 MoveBracketsToSucc = &SuccBI;
1912 } else {
1913 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1914 }
1915 } else if (SuccBI.Incoming->merge(*Brackets)) {
1916 SuccBI.Dirty = true;
1917 if (SuccBII <= BII)
1918 Repeat = true;
1919 }
1920 }
1921 if (MoveBracketsToSucc)
1922 MoveBracketsToSucc->Incoming = std::move(Brackets);
1923 }
1924 }
1925 } while (Repeat);
1926
1927 if (ST->hasScalarStores()) {
1928 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1929 bool HaveScalarStores = false;
1930
1931 for (MachineBasicBlock &MBB : MF) {
1932 for (MachineInstr &MI : MBB) {
1933 if (!HaveScalarStores && TII->isScalarStore(MI))
1934 HaveScalarStores = true;
1935
1936 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1937 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1938 EndPgmBlocks.push_back(&MBB);
1939 }
1940 }
1941
1942 if (HaveScalarStores) {
1943 // If scalar writes are used, the cache must be flushed or else the next
1944 // wave to reuse the same scratch memory can be clobbered.
1945 //
1946 // Insert s_dcache_wb at wave termination points if there were any scalar
1947 // stores, and only if the cache hasn't already been flushed. This could
1948 // be improved by looking across blocks for flushes in postdominating
1949 // blocks from the stores, but an explicitly requested flush is probably
1950 // very rare.
1951 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1952 bool SeenDCacheWB = false;
1953
1954 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1955 I != E; ++I) {
1956 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1957 SeenDCacheWB = true;
1958 else if (TII->isScalarStore(*I))
1959 SeenDCacheWB = false;
1960
1961 // FIXME: It would be better to insert this before a waitcnt if any.
1962 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1963 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1964 !SeenDCacheWB) {
1965 Modified = true;
1966 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1967 }
1968 }
1969 }
1970 }
1971 }
1972
1973 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
1974 // instructions.
1975 for (MachineInstr *MI : ReleaseVGPRInsts) {
1976 if (ST->requiresNopBeforeDeallocVGPRs()) {
1977 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
1978 .addImm(0);
1979 }
1980 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
1981 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
1982 Modified = true;
1983 }
1984 ReleaseVGPRInsts.clear();
1985
1986 return Modified;
1987}
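Aside (not from this file): createSIInsertWaitcntsPass, declared in AMDGPU.h, is the factory the target uses to put this pass into the GCN codegen pipeline. A minimal sketch of that wiring; GCNPassConfigLike and its addPass helper are hypothetical stand-ins for the real pass-config machinery in AMDGPUTargetMachine.cpp:

#include "AMDGPU.h" // createSIInsertWaitcntsPass()

// Hypothetical stand-in for the target's TargetPassConfig subclass.
struct GCNPassConfigLike {
  void addPass(llvm::FunctionPass *P); // assumed helper, provided elsewhere
  void addPreEmitPasses() {
    // Waitcnt insertion runs late, after scheduling, shortly before emission.
    addPass(llvm::createSIInsertWaitcntsPass());
  }
};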