LLVM 20.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
//
//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39using namespace llvm;
40
41#define DEBUG_TYPE "si-insert-waitcnts"
42
43DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
44 "Force emit s_waitcnt expcnt(0) instrs");
45DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
47DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
48 "Force emit s_waitcnt vmcnt(0) instrs");
49
50static cl::opt<bool>
51 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
52 cl::desc("Force all waitcnt instrs to be emitted as "
53 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56namespace {
57// Class of object that encapsulates latest instruction counter score
58// associated with the operand. Used for determining whether
59// s_waitcnt instruction needs to be emitted.
60
/// Wait counters tracked by this pass. Entries before NUM_NORMAL_INST_CNTS
/// form the "normal" set; SAMPLE_CNT onwards are gfx12+ only. The enumerators
/// are used as array indices, so their order and values must not change.
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Expcnt.
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
82// Return an iterator over all counters between LOAD_CNT (the first counter)
83// and \c MaxCounter (exclusive, default value yields an enumeration over
84// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87}
88
// Range of register slots [first, second); {-1, -1} is used for "no
// interval" (see getRegInterval()).
using RegInterval = std::pair<int, int>;
90
// Maximum value of each wait counter. Member order is part of the aggregate
// interface and must be kept.
struct HardwareLimits {
  unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;     // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;  // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax; // gfx12+ only.
  unsigned BvhcntMax;    // gfx12+ only.
  unsigned KmcntMax;     // gfx12+ only.
};
100
// Hardware encoding values bounding the VGPR and SGPR ranges; used by
// getRegInterval() to turn an encoding into a scoreboard slot.
struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};
107
// Kinds of events that bump a wait counter. Values are used as bit positions
// in event masks (1 << E), so the order must not change.
enum WaitEventType {
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch
  LDS_ACCESS,               // lds read & write
  GDS_ACCESS,               // gds read & write
  SQ_MESSAGE,               // send message
  SMEM_ACCESS,              // scalar-memory read & write
  EXP_GPR_LOCK,             // export holding on its data src
  GDS_GPR_LOCK,             // GDS holding on its data and addr src
  EXP_POS_ACCESS,           // write to export position
  EXP_PARAM_ACCESS,         // write to export parameter
  VMW_GPR_LOCK,             // vector-memory write holding on its data src
  EXP_LDS_ACCESS,           // read by ldsdir counting as export
  NUM_WAIT_EVENTS,
};
127
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 9,    // Reserved slots for DS.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or location is
  // unknown use the first slot. The first slot is also always updated in
  // addition to known location's slot to properly generate waits if dependent
  // instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
147
// Enumerate different types of result-returning VMEM operations. Although
// s_waitcnt orders them all with a single vmcnt counter, in the absence of
// s_waitcnt only instructions of the same VmemType are guaranteed to write
// their results in order -- so there is no need to insert an s_waitcnt between
// two instructions of the same type that write the same vgpr.
//
// Values are used as bit positions in the per-VGPR VgprVmemTypes mask.
enum VmemType {
  // BUF instructions and MIMG instructions without a sampler.
  VMEM_NOSAMPLER,
  // MIMG instructions with a sampler.
  VMEM_SAMPLER,
  // BVH instructions
  VMEM_BVH,
  NUM_VMEM_TYPES
};
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
// Classify \p Inst, which must satisfy updateVMCntOnly() (asserted), as
// VMEM_NOSAMPLER, VMEM_SAMPLER or VMEM_BVH.
// NOTE(review): listing lines 185, 187 and 189 are absent from this copy, so
// the tail of the first condition and the initializer of BaseInfo are not
// visible here -- restore them from the upstream file before building.
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
186 return VMEM_NOSAMPLER;
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
190 // We have to make an additional check for isVSAMPLE here since some
191 // instructions don't have a sampler, but are still classified as sampler
192 // instructions for the purposes of e.g. waitcnt.
// BVH takes precedence over the sampler classification.
193 return BaseInfo->BVH ? VMEM_BVH
194 : (BaseInfo->Sampler || SIInstrInfo::isVSAMPLE(Inst)) ? VMEM_SAMPLER
195 : VMEM_NOSAMPLER;
196}
197
198unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
199 switch (T) {
200 case LOAD_CNT:
201 return Wait.LoadCnt;
202 case EXP_CNT:
203 return Wait.ExpCnt;
204 case DS_CNT:
205 return Wait.DsCnt;
206 case STORE_CNT:
207 return Wait.StoreCnt;
208 case SAMPLE_CNT:
209 return Wait.SampleCnt;
210 case BVH_CNT:
211 return Wait.BvhCnt;
212 case KM_CNT:
213 return Wait.KmCnt;
214 default:
215 llvm_unreachable("bad InstCounterType");
216 }
217}
218
219void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
220 unsigned &WC = getCounterRef(Wait, T);
221 WC = std::min(WC, Count);
222}
223
224void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 getCounterRef(Wait, T) = ~0u;
226}
227
228unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
229 return getCounterRef(Wait, T);
230}
231
232// Mapping from event to counter according to the table masks.
233InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
234 for (auto T : inst_counter_types()) {
235 if (masks[T] & (1 << E))
236 return T;
237 }
238 llvm_unreachable("event type has no associated counter");
239}
240
241// This objects maintains the current score brackets of each wait counter, and
242// a per-register scoreboard for each wait counter.
243//
244// We also maintain the latest score for every event type that can change the
245// waitcnt in order to know if there are multiple types of events within
246// the brackets. When multiple types of event happen in the bracket,
247// wait count may get decreased out of order, therefore we need to put in
248// "s_waitcnt 0" before use.
249class WaitcntBrackets {
250public:
251 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
252 HardwareLimits Limits, RegisterEncoding Encoding,
253 const unsigned *WaitEventMaskForInst,
254 InstCounterType SmemAccessCounter)
255 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
256 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
257 SmemAccessCounter(SmemAccessCounter) {}
258
259 unsigned getWaitCountMax(InstCounterType T) const {
260 switch (T) {
261 case LOAD_CNT:
262 return Limits.LoadcntMax;
263 case DS_CNT:
264 return Limits.DscntMax;
265 case EXP_CNT:
266 return Limits.ExpcntMax;
267 case STORE_CNT:
268 return Limits.StorecntMax;
269 case SAMPLE_CNT:
270 return Limits.SamplecntMax;
271 case BVH_CNT:
272 return Limits.BvhcntMax;
273 case KM_CNT:
274 return Limits.KmcntMax;
275 default:
276 break;
277 }
278 return 0;
279 }
280
281 unsigned getScoreLB(InstCounterType T) const {
282 assert(T < NUM_INST_CNTS);
283 return ScoreLBs[T];
284 }
285
286 unsigned getScoreUB(InstCounterType T) const {
287 assert(T < NUM_INST_CNTS);
288 return ScoreUBs[T];
289 }
290
291 unsigned getScoreRange(InstCounterType T) const {
292 return getScoreUB(T) - getScoreLB(T);
293 }
294
295 unsigned getRegScore(int GprNo, InstCounterType T) const {
296 if (GprNo < NUM_ALL_VGPRS) {
297 return VgprScores[T][GprNo];
298 }
299 assert(T == SmemAccessCounter);
300 return SgprScores[GprNo - NUM_ALL_VGPRS];
301 }
302
303 bool merge(const WaitcntBrackets &Other);
304
305 RegInterval getRegInterval(const MachineInstr *MI,
307 const SIRegisterInfo *TRI,
308 const MachineOperand &Op) const;
309
310 bool counterOutOfOrder(InstCounterType T) const;
311 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
312 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
313
314 void determineWait(InstCounterType T, RegInterval Interval,
315 AMDGPU::Waitcnt &Wait) const;
316 void determineWait(InstCounterType T, int RegNo,
317 AMDGPU::Waitcnt &Wait) const {
318 determineWait(T, {RegNo, RegNo + 1}, Wait);
319 }
320
321 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
322 void applyWaitcnt(InstCounterType T, unsigned Count);
323 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
324 const MachineRegisterInfo *MRI, WaitEventType E,
326
327 unsigned hasPendingEvent() const { return PendingEvents; }
328 unsigned hasPendingEvent(WaitEventType E) const {
329 return PendingEvents & (1 << E);
330 }
331 unsigned hasPendingEvent(InstCounterType T) const {
332 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
333 assert((HasPending != 0) == (getScoreRange(T) != 0));
334 return HasPending;
335 }
336
337 bool hasMixedPendingEvents(InstCounterType T) const {
338 unsigned Events = hasPendingEvent(T);
339 // Return true if more than one bit is set in Events.
340 return Events & (Events - 1);
341 }
342
343 bool hasPendingFlat() const {
344 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
345 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
346 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
347 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
348 }
349
350 void setPendingFlat() {
351 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
352 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
353 }
354
355 // Return true if there might be pending writes to the vgpr-interval by VMEM
356 // instructions with types different from V.
357 bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
358 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
359 assert(RegNo < NUM_ALL_VGPRS);
360 if (VgprVmemTypes[RegNo] & ~(1 << V))
361 return true;
362 }
363 return false;
364 }
365
366 void clearVgprVmemTypes(RegInterval Interval) {
367 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
368 assert(RegNo < NUM_ALL_VGPRS);
369 VgprVmemTypes[RegNo] = 0;
370 }
371 }
372
373 void setStateOnFunctionEntryOrReturn() {
374 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
375 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
376 }
377
378 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
379 return LDSDMAStores;
380 }
381
382 void print(raw_ostream &);
383 void dump() { print(dbgs()); }
384
385private:
386 struct MergeInfo {
387 unsigned OldLB;
388 unsigned OtherLB;
389 unsigned MyShift;
390 unsigned OtherShift;
391 };
392 static bool mergeScore(const MergeInfo &M, unsigned &Score,
393 unsigned OtherScore);
394
395 void setScoreLB(InstCounterType T, unsigned Val) {
396 assert(T < NUM_INST_CNTS);
397 ScoreLBs[T] = Val;
398 }
399
400 void setScoreUB(InstCounterType T, unsigned Val) {
401 assert(T < NUM_INST_CNTS);
402 ScoreUBs[T] = Val;
403
404 if (T != EXP_CNT)
405 return;
406
407 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
408 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
409 }
410
411 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
412 setScoreByInterval({GprNo, GprNo + 1}, T, Val);
413 }
414
415 void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
416 unsigned Score);
417
418 void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI,
420 const MachineOperand &Op, InstCounterType CntTy,
421 unsigned Val);
422
423 const GCNSubtarget *ST = nullptr;
424 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
425 HardwareLimits Limits = {};
426 RegisterEncoding Encoding = {};
427 const unsigned *WaitEventMaskForInst;
428 InstCounterType SmemAccessCounter;
429 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
430 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
431 unsigned PendingEvents = 0;
432 // Remember the last flat memory operation.
433 unsigned LastFlat[NUM_INST_CNTS] = {0};
434 // wait_cnt scores for every vgpr.
435 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
436 int VgprUB = -1;
437 int SgprUB = -1;
438 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
439 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
440 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
441 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
442 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
443 // write to each vgpr.
444 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
445 // Store representative LDS DMA operations. The only useful info here is
446 // alias info. One store is kept per unique AAInfo.
447 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
448};
449
450// This abstracts the logic for generating and updating S_WAIT* instructions
451// away from the analysis that determines where they are needed. This was
452// done because the set of counters and instructions for waiting on them
453// underwent a major shift with gfx12, sufficiently so that having this
454// abstraction allows the main analysis logic to be simpler than it would
455// otherwise have had to become.
// Abstract base for the per-generation wait-instruction emitters; the two
// subclasses below supply the pre-gfx12 and gfx12+ implementations.
456class WaitcntGenerator {
457protected:
458 const GCNSubtarget *ST = nullptr;
459 const SIInstrInfo *TII = nullptr;
// NOTE(review): listing line 460 is absent from this copy. The constructor
// below initializes a member named IV from AMDGPU::getIsaVersion(), so its
// declaration presumably belonged here -- restore from upstream.
461 InstCounterType MaxCounter;
462 bool OptNone;
463
464public:
465 WaitcntGenerator() = default;
466 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
467 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
468 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
469 OptNone(MF.getFunction().hasOptNone() ||
470 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
471
472 // Return true if the current function should be compiled with no
473 // optimization.
474 bool isOptNone() const { return OptNone; }
475
476 // Edits an existing sequence of wait count instructions according
477 // to an incoming Waitcnt value, which is itself updated to reflect
478 // any new wait count instructions which may need to be generated by
479 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
480 // were made.
481 //
482 // This editing will usually be merely updated operands, but it may also
483 // delete instructions if the incoming Wait value indicates they are not
484 // needed. It may also remove existing instructions for which a wait
485 // is needed if it can be determined that it is better to generate new
486 // instructions later, as can happen on gfx12.
487 virtual bool
488 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
489 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
// NOTE(review): listing line 490 (the end of this declaration) is absent;
// the overrides in the subclasses end with
// "MachineBasicBlock::instr_iterator It) const override;".
491
492 // Transform a soft waitcnt into a normal one.
493 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
494
495 // Generates new wait count instructions according to the value of
496 // Wait, returning true if any new instructions were created.
497 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
// NOTE(review): listing lines 498-499 (the remaining parameters of
// createNewWaitcnt) are absent from this copy -- restore from upstream.
500
501 // Returns an array of bit masks which can be used to map values in
502 // WaitEventType to corresponding counter values in InstCounterType.
503 virtual const unsigned *getWaitEventMask() const = 0;
504
505 // Returns a new waitcnt with all counters except VScnt set to 0. If
506 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
507 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
508
509 virtual ~WaitcntGenerator() = default;
510
511 // Create a mask value from the initializer list of wait event types.
// Each event contributes bit (1 << E); an empty list yields 0.
512 static constexpr unsigned
513 eventMask(std::initializer_list<WaitEventType> Events) {
514 unsigned Mask = 0;
515 for (auto &E : Events)
516 Mask |= 1 << E;
517
518 return Mask;
519 }
520};
521
// Generator for targets before gfx12; it is always constructed with
// NUM_NORMAL_INST_CNTS as the counter limit.
522class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
523public:
524 WaitcntGeneratorPreGFX12() = default;
525 WaitcntGeneratorPreGFX12(const MachineFunction &MF)
526 : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}
527
528 bool
529 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
530 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
531 MachineBasicBlock::instr_iterator It) const override;
532
// NOTE(review): listing line 534 (a parameter between Block and Wait) is
// absent from this copy -- restore from upstream.
533 bool createNewWaitcnt(MachineBasicBlock &Block,
535 AMDGPU::Waitcnt Wait) override;
536
// Event-to-counter table, indexed by InstCounterType. The three trailing
// zeros are the gfx12+-only counters, which receive no events here; sampler
// and BVH reads are folded into the first (LOAD_CNT) entry and SMEM/message
// events into the second (DS_CNT) entry.
537 const unsigned *getWaitEventMask() const override {
538 assert(ST);
539
540 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
541 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
542 VMEM_BVH_READ_ACCESS}),
543 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
544 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
545 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
546 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
547 0,
548 0,
549 0};
550
551 return WaitEventMaskForInstPreGFX12;
552 }
553
554 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
555};
556
// Generator for gfx12 and later; the counter limit is supplied by the caller.
557class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
558public:
559 WaitcntGeneratorGFX12Plus() = default;
560 WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
561 InstCounterType MaxCounter)
562 : WaitcntGenerator(MF, MaxCounter) {}
563
564 bool
565 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
566 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
567 MachineBasicBlock::instr_iterator It) const override;
568
// NOTE(review): listing line 570 (a parameter between Block and Wait) is
// absent from this copy -- restore from upstream.
569 bool createNewWaitcnt(MachineBasicBlock &Block,
571 AMDGPU::Waitcnt Wait) override;
572
// Event-to-counter table, indexed by InstCounterType. Unlike the pre-gfx12
// table, sampler reads, BVH reads and SMEM/message events each map to their
// own counter (SAMPLE_CNT, BVH_CNT and KM_CNT respectively).
573 const unsigned *getWaitEventMask() const override {
574 assert(ST);
575
576 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
577 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
578 eventMask({LDS_ACCESS, GDS_ACCESS}),
579 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
580 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
581 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
582 eventMask({VMEM_SAMPLER_READ_ACCESS}),
583 eventMask({VMEM_BVH_READ_ACCESS}),
584 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
585
586 return WaitEventMaskForInstGFX12Plus;
587 }
588
589 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
590};
591
// The machine-function pass itself. It owns one generator object per
// hardware generation and points WCG at the one in use.
592class SIInsertWaitcnts : public MachineFunctionPass {
593private:
594 const GCNSubtarget *ST = nullptr;
595 const SIInstrInfo *TII = nullptr;
596 const SIRegisterInfo *TRI = nullptr;
597 const MachineRegisterInfo *MRI = nullptr;
598
// NOTE(review): listing lines 599 and 602 (member declarations) are absent
// from this copy -- restore from upstream.
600 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
601 MachineLoopInfo *MLI;
603 AliasAnalysis *AA = nullptr;
604
// Per-block state: the incoming brackets plus a flag that starts out true.
605 struct BlockInfo {
606 std::unique_ptr<WaitcntBrackets> Incoming;
607 bool Dirty = true;
608 };
609
610 InstCounterType SmemAccessCounter;
611
// NOTE(review): listing line 612 is absent; presumably the container of
// BlockInfo was declared here -- confirm against upstream.
613
// NOTE(review): this array has no in-class initializer; verify it is cleared
// elsewhere before isForceEmitWaitcnt() reads it in NDEBUG builds, where
// setForceEmitWaitcnt() writes nothing.
614 bool ForceEmitWaitcnt[NUM_INST_CNTS];
615
616 // In any given run of this pass, WCG will point to one of these two
617 // generator objects, which must have been re-initialised before use
618 // from a value made using a subtarget constructor.
619 WaitcntGeneratorPreGFX12 WCGPreGFX12;
620 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
621
622 WaitcntGenerator *WCG = nullptr;
623
624 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
625 // message.
626 DenseSet<MachineInstr *> ReleaseVGPRInsts;
627
628 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
629
630public:
631 static char ID;
632
633 SIInsertWaitcnts() : MachineFunctionPass(ID) {
// The void casts reference the debug counters so they count as used.
634 (void)ForceExpCounter;
635 (void)ForceLgkmCounter;
636 (void)ForceVMCounter;
637 }
638
639 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
640 bool isPreheaderToFlush(MachineBasicBlock &MBB,
641 WaitcntBrackets &ScoreBrackets);
642 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
643 bool runOnMachineFunction(MachineFunction &MF) override;
644
645 StringRef getPassName() const override {
646 return "SI insert wait instructions";
647 }
648
649 void getAnalysisUsage(AnalysisUsage &AU) const override {
650 AU.setPreservesCFG();
// NOTE(review): listing lines 651-655 (the rest of this body) are absent
// from this copy -- restore from upstream.
656 }
657
// True if any counter currently has its force-emit flag set.
658 bool isForceEmitWaitcnt() const {
659 for (auto T : inst_counter_types())
660 if (ForceEmitWaitcnt[T])
661 return true;
662 return false;
663 }
664
// Debug builds only: refresh the force-emit flags from the three debug
// counters. The lgkm counter drives DS_CNT and KM_CNT, the vm counter drives
// LOAD_CNT, SAMPLE_CNT and BVH_CNT; STORE_CNT is not written here.
665 void setForceEmitWaitcnt() {
666// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
667// For debug builds, get the debug counter info and adjust if need be
668#ifndef NDEBUG
669 if (DebugCounter::isCounterSet(ForceExpCounter) &&
670 DebugCounter::shouldExecute(ForceExpCounter)) {
671 ForceEmitWaitcnt[EXP_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[EXP_CNT] = false;
674 }
675
676 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
677 DebugCounter::shouldExecute(ForceLgkmCounter)) {
678 ForceEmitWaitcnt[DS_CNT] = true;
679 ForceEmitWaitcnt[KM_CNT] = true;
680 } else {
681 ForceEmitWaitcnt[DS_CNT] = false;
682 ForceEmitWaitcnt[KM_CNT] = false;
683 }
684
685 if (DebugCounter::isCounterSet(ForceVMCounter) &&
686 DebugCounter::shouldExecute(ForceVMCounter)) {
687 ForceEmitWaitcnt[LOAD_CNT] = true;
688 ForceEmitWaitcnt[SAMPLE_CNT] = true;
689 ForceEmitWaitcnt[BVH_CNT] = true;
690 } else {
691 ForceEmitWaitcnt[LOAD_CNT] = false;
692 ForceEmitWaitcnt[SAMPLE_CNT] = false;
693 ForceEmitWaitcnt[BVH_CNT] = false;
694 }
695#endif // NDEBUG
696 }
697
698 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
699 // FLAT instruction.
700 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
701 // Maps VMEM access types to their corresponding WaitEventType.
702 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
703 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
704
// NOTE(review): listing line 705 is absent from this copy.
706 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
707 // these should use VM_CNT.
708 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
709 return VMEM_ACCESS;
// Pure stores and no-return atomics are classified as writes.
710 if (Inst.mayStore() &&
711 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
712 // FLAT and SCRATCH instructions may access scratch. Other VMEM
713 // instructions do not.
714 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
715 return SCRATCH_WRITE_ACCESS;
716 return VMEM_WRITE_ACCESS;
717 }
// Without extended wait counts, or for FLAT, every read is a plain
// VMEM_READ_ACCESS; otherwise the read event is picked by VmemType.
718 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
719 return VMEM_READ_ACCESS;
720 return VmemReadMapping[getVmemType(Inst)];
721 }
722
723 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
724 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
725 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
726 bool generateWaitcntInstBefore(MachineInstr &MI,
727 WaitcntBrackets &ScoreBrackets,
728 MachineInstr *OldWaitcntInstr,
729 bool FlushVmCnt);
// NOTE(review): listing line 731 (a parameter between Wait and Block) is
// absent from this copy -- restore from upstream.
730 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
732 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
733 MachineInstr *OldWaitcntInstr);
734 void updateEventWaitcntAfter(MachineInstr &Inst,
735 WaitcntBrackets *ScoreBrackets);
736 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
737 WaitcntBrackets &ScoreBrackets);
738};
739
740} // end anonymous namespace
741
// Map register operand \p Op to a half-open interval of scoreboard slots.
// Vector registers land in [0, SQ_MAX_PGM_VGPRS) (AGPRs shifted up by
// AGPR_OFFSET); SGPRs land at NUM_ALL_VGPRS and above. Returns {-1, -1} for
// registers outside an allocatable class and for anything that is neither a
// vector register nor an SGPR.
// NOTE(review): listing line 743 (a parameter; the body dereferences MRI) and
// line 756 (the right-hand operand of the '&' that computes Reg) are absent
// from this copy -- restore from upstream.
742RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
744 const SIRegisterInfo *TRI,
745 const MachineOperand &Op) const {
746 if (!TRI->isInAllocatableClass(Op.getReg()))
747 return {-1, -1};
748
749 // A use via a PW operand does not need a waitcnt.
750 // A partial write is not a WAW.
751 assert(!Op.getSubReg() || !Op.isUndef());
752
753 RegInterval Result;
754
755 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
757
758 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
759 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
760 Result.first = Reg - Encoding.VGPR0;
761 if (TRI->isAGPR(*MRI, Op.getReg()))
762 Result.first += AGPR_OFFSET;
763 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
764 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
765 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
766 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
767 assert(Result.first >= NUM_ALL_VGPRS &&
768 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
769 }
770 // TODO: Handle TTMP
771 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
772 else
773 return {-1, -1};
774
// One slot per 32 bits of register size; the +16 rounds a 16-bit register
// up to a full slot.
775 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
776 unsigned Size = TRI->getRegSizeInBits(*RC);
777 Result.second = Result.first + ((Size + 16) / 32);
778
779 return Result;
780}
781
782void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
783 InstCounterType CntTy,
784 unsigned Score) {
785 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
786 if (RegNo < NUM_ALL_VGPRS) {
787 VgprUB = std::max(VgprUB, RegNo);
788 VgprScores[CntTy][RegNo] = Score;
789 } else {
790 assert(CntTy == SmemAccessCounter);
791 SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
792 SgprScores[RegNo - NUM_ALL_VGPRS] = Score;
793 }
794 }
795}
796
797void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
798 const SIRegisterInfo *TRI,
800 const MachineOperand &Op,
801 InstCounterType CntTy, unsigned Score) {
802 RegInterval Interval = getRegInterval(MI, MRI, TRI, Op);
803 setScoreByInterval(Interval, CntTy, Score);
804}
805
806void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
807 const SIRegisterInfo *TRI,
809 WaitEventType E, MachineInstr &Inst) {
810 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
811
812 unsigned UB = getScoreUB(T);
813 unsigned CurrScore = UB + 1;
814 if (CurrScore == 0)
815 report_fatal_error("InsertWaitcnt score wraparound");
816 // PendingEvents and ScoreUB need to be update regardless if this event
817 // changes the score of a register or not.
818 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
819 PendingEvents |= 1 << E;
820 setScoreUB(T, CurrScore);
821
822 if (T == EXP_CNT) {
823 // Put score on the source vgprs. If this is a store, just use those
824 // specific register(s).
825 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
826 // All GDS operations must protect their address register (same as
827 // export.)
828 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
829 setScoreByOperand(&Inst, TRI, MRI, *AddrOp, EXP_CNT, CurrScore);
830
831 if (Inst.mayStore()) {
832 if (const auto *Data0 =
833 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
834 setScoreByOperand(&Inst, TRI, MRI, *Data0, EXP_CNT, CurrScore);
835 if (const auto *Data1 =
836 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
837 setScoreByOperand(&Inst, TRI, MRI, *Data1, EXP_CNT, CurrScore);
838 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
839 Inst.getOpcode() != AMDGPU::DS_APPEND &&
840 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
841 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
842 for (const MachineOperand &Op : Inst.all_uses()) {
843 if (TRI->isVectorRegister(*MRI, Op.getReg()))
844 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
845 }
846 }
847 } else if (TII->isFLAT(Inst)) {
848 if (Inst.mayStore()) {
849 setScoreByOperand(&Inst, TRI, MRI,
850 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
851 EXP_CNT, CurrScore);
852 } else if (SIInstrInfo::isAtomicRet(Inst)) {
853 setScoreByOperand(&Inst, TRI, MRI,
854 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
855 EXP_CNT, CurrScore);
856 }
857 } else if (TII->isMIMG(Inst)) {
858 if (Inst.mayStore()) {
859 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
860 CurrScore);
861 } else if (SIInstrInfo::isAtomicRet(Inst)) {
862 setScoreByOperand(&Inst, TRI, MRI,
863 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
864 EXP_CNT, CurrScore);
865 }
866 } else if (TII->isMTBUF(Inst)) {
867 if (Inst.mayStore())
868 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
869 CurrScore);
870 } else if (TII->isMUBUF(Inst)) {
871 if (Inst.mayStore()) {
872 setScoreByOperand(&Inst, TRI, MRI, Inst.getOperand(0), EXP_CNT,
873 CurrScore);
874 } else if (SIInstrInfo::isAtomicRet(Inst)) {
875 setScoreByOperand(&Inst, TRI, MRI,
876 *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
877 EXP_CNT, CurrScore);
878 }
879 } else if (TII->isLDSDIR(Inst)) {
880 // LDSDIR instructions attach the score to the destination.
881 setScoreByOperand(&Inst, TRI, MRI,
882 *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
883 EXP_CNT, CurrScore);
884 } else {
885 if (TII->isEXP(Inst)) {
886 // For export the destination registers are really temps that
887 // can be used as the actual source after export patching, so
888 // we need to treat them like sources and set the EXP_CNT
889 // score.
890 for (MachineOperand &DefMO : Inst.all_defs()) {
891 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
892 setScoreByOperand(&Inst, TRI, MRI, DefMO, EXP_CNT, CurrScore);
893 }
894 }
895 }
896 for (const MachineOperand &Op : Inst.all_uses()) {
897 if (TRI->isVectorRegister(*MRI, Op.getReg()))
898 setScoreByOperand(&Inst, TRI, MRI, Op, EXP_CNT, CurrScore);
899 }
900 }
901 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
902 // Match the score to the destination registers.
903 //
904 // Check only explicit operands. Stores, especially spill stores, include
905 // implicit uses and defs of their super registers which would create an
906 // artificial dependency, while these are there only for register liveness
907 // accounting purposes.
908 //
909 // Special cases where implicit register defs exists, such as M0 or VCC,
910 // but none with memory instructions.
911 for (const MachineOperand &Op : Inst.defs()) {
912 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, Op);
913 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
914 if (Interval.first >= NUM_ALL_VGPRS)
915 continue;
916 if (updateVMCntOnly(Inst)) {
917 // updateVMCntOnly should only leave us with VGPRs
918 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
919 // defs. That's required for a sane index into `VgprMemTypes` below
920 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
921 VmemType V = getVmemType(Inst);
922 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
923 VgprVmemTypes[RegNo] |= 1 << V;
924 }
925 }
926 setScoreByInterval(Interval, T, CurrScore);
927 }
928 if (Inst.mayStore() &&
929 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
930 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
931 // written can be accessed. A load from LDS to VMEM does not need a wait.
932 unsigned Slot = 0;
933 for (const auto *MemOp : Inst.memoperands()) {
934 if (!MemOp->isStore() ||
935 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
936 continue;
937 // Comparing just AA info does not guarantee memoperands are equal
938 // in general, but this is so for LDS DMA in practice.
939 auto AAI = MemOp->getAAInfo();
940 // Alias scope information gives a way to definitely identify an
941 // original memory object and practically produced in the module LDS
942 // lowering pass. If there is no scope available we will not be able
943 // to disambiguate LDS aliasing as after the module lowering all LDS
944 // is squashed into a single big object. Do not attempt to use one of
945 // the limited LDSDMAStores for something we will not be able to use
946 // anyway.
947 if (!AAI || !AAI.Scope)
948 break;
949 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
950 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
951 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
952 Slot = I + 1;
953 break;
954 }
955 }
956 }
957 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
958 break;
959 LDSDMAStores.push_back(&Inst);
960 Slot = LDSDMAStores.size();
961 break;
962 }
963 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
964 if (Slot)
965 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
966 }
967 }
968}
969
970void WaitcntBrackets::print(raw_ostream &OS) {
971 OS << '\n';
972 for (auto T : inst_counter_types(MaxCounter)) {
973 unsigned SR = getScoreRange(T);
974
975 switch (T) {
976 case LOAD_CNT:
977 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
978 << SR << "): ";
979 break;
980 case DS_CNT:
981 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
982 << SR << "): ";
983 break;
984 case EXP_CNT:
985 OS << " EXP_CNT(" << SR << "): ";
986 break;
987 case STORE_CNT:
988 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
989 << SR << "): ";
990 break;
991 case SAMPLE_CNT:
992 OS << " SAMPLE_CNT(" << SR << "): ";
993 break;
994 case BVH_CNT:
995 OS << " BVH_CNT(" << SR << "): ";
996 break;
997 case KM_CNT:
998 OS << " KM_CNT(" << SR << "): ";
999 break;
1000 default:
1001 OS << " UNKNOWN(" << SR << "): ";
1002 break;
1003 }
1004
1005 if (SR != 0) {
1006 // Print vgpr scores.
1007 unsigned LB = getScoreLB(T);
1008
1009 for (int J = 0; J <= VgprUB; J++) {
1010 unsigned RegScore = getRegScore(J, T);
1011 if (RegScore <= LB)
1012 continue;
1013 unsigned RelScore = RegScore - LB - 1;
1014 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1015 OS << RelScore << ":v" << J << " ";
1016 } else {
1017 OS << RelScore << ":ds ";
1018 }
1019 }
1020 // Also need to print sgpr scores for lgkm_cnt.
1021 if (T == SmemAccessCounter) {
1022 for (int J = 0; J <= SgprUB; J++) {
1023 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1024 if (RegScore <= LB)
1025 continue;
1026 unsigned RelScore = RegScore - LB - 1;
1027 OS << RelScore << ":s" << J << " ";
1028 }
1029 }
1030 }
1031 OS << '\n';
1032 }
1033 OS << '\n';
1034}
1035
1036/// Simplify the waitcnt, in the sense of removing redundant counts, and return
1037/// whether a waitcnt instruction is needed at all.
1038void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1039 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1040 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1041 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1042 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1043 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1044 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1045 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1046}
1047
1048void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1049 unsigned &Count) const {
1050 // The number of outstanding events for this type, T, can be calculated
1051 // as (UB - LB). If the current Count is greater than or equal to the number
1052 // of outstanding events, then the wait for this counter is redundant.
1053 if (Count >= getScoreRange(T))
1054 Count = ~0u;
1055}
1056
1057void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
1058 AMDGPU::Waitcnt &Wait) const {
1059 const unsigned LB = getScoreLB(T);
1060 const unsigned UB = getScoreUB(T);
1061 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1062 unsigned ScoreToWait = getRegScore(RegNo, T);
1063
1064 // If the score of src_operand falls within the bracket, we need an
1065 // s_waitcnt instruction.
1066 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1067 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1068 !ST->hasFlatLgkmVMemCountInOrder()) {
1069 // If there is a pending FLAT operation, and this is a VMem or LGKM
1070 // waitcnt and the target can report early completion, then we need
1071 // to force a waitcnt 0.
1072 addWait(Wait, T, 0);
1073 } else if (counterOutOfOrder(T)) {
1074 // Counter can get decremented out-of-order when there
1075 // are multiple types event in the bracket. Also emit an s_wait counter
1076 // with a conservative value of 0 for the counter.
1077 addWait(Wait, T, 0);
1078 } else {
1079 // If a counter has been maxed out avoid overflow by waiting for
1080 // MAX(CounterType) - 1 instead.
1081 unsigned NeededWait =
1082 std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1083 addWait(Wait, T, NeededWait);
1084 }
1085 }
1086 }
1087}
1088
1089void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1090 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1091 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1092 applyWaitcnt(DS_CNT, Wait.DsCnt);
1093 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1094 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1095 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1096 applyWaitcnt(KM_CNT, Wait.KmCnt);
1097}
1098
1099void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1100 const unsigned UB = getScoreUB(T);
1101 if (Count >= UB)
1102 return;
1103 if (Count != 0) {
1104 if (counterOutOfOrder(T))
1105 return;
1106 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1107 } else {
1108 setScoreLB(T, UB);
1109 PendingEvents &= ~WaitEventMaskForInst[T];
1110 }
1111}
1112
1113// Where there are multiple types of event in the bracket of a counter,
1114// the decrement may go out of order.
1115bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1116 // Scalar memory read always can go out of order.
1117 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1118 return true;
1119 return hasMixedPendingEvents(T);
1120}
1121
// Pass registration for SIInsertWaitcnts under the name DEBUG_TYPE
// ("si-insert-waitcnts") with the description "SI Insert Waitcnts".
// NOTE(review): listing lines 1124-1125 between BEGIN and END are missing
// here; presumably they declare the pass dependencies -- confirm against the
// real source file.
1122INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1123 false)
1126INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1127 false)
1128
// Definition of the pass's static ID member.
1129char SIInsertWaitcnts::ID = 0;
1130
// llvm::SIInsertWaitcntsID is a reference bound to that same ID object.
1131char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1132
// NOTE(review): the signature line of this function (listing line 1133) is
// missing from this listing. The visible body returns a newly heap-allocated
// SIInsertWaitcnts instance.
1134 return new SIInsertWaitcnts();
1135}
1136
// Set the immediate of the operand named OpName on MI to NewEnc, unless it
// already has that value. Returns true if the operand was changed.
// NOTE(review): the first signature line (listing line 1137) is missing from
// this listing; the name is inferred from the call sites, which use this as
// updateOperandIfDifferent(MI, OpName, NewEnc) -- confirm.
1138 unsigned NewEnc) {
     // The named operand must exist on this opcode.
1139 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1140 assert(OpIdx >= 0);
1141
1142 MachineOperand &MO = MI.getOperand(OpIdx);
1143
     // Report "not modified" when the encoding is already in place.
1144 if (NewEnc == MO.getImm())
1145 return false;
1146
1147 MO.setImm(NewEnc);
1148 return true;
1149}
1150
1151/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1152/// and if so, which counter it is waiting on.
1153static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1154 switch (Opcode) {
1155 case AMDGPU::S_WAIT_LOADCNT:
1156 return LOAD_CNT;
1157 case AMDGPU::S_WAIT_EXPCNT:
1158 return EXP_CNT;
1159 case AMDGPU::S_WAIT_STORECNT:
1160 return STORE_CNT;
1161 case AMDGPU::S_WAIT_SAMPLECNT:
1162 return SAMPLE_CNT;
1163 case AMDGPU::S_WAIT_BVHCNT:
1164 return BVH_CNT;
1165 case AMDGPU::S_WAIT_DSCNT:
1166 return DS_CNT;
1167 case AMDGPU::S_WAIT_KMCNT:
1168 return KM_CNT;
1169 default:
1170 return {};
1171 }
1172}
1173
1174bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1175 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1176 if (Opcode == Waitcnt->getOpcode())
1177 return false;
1178
1179 Waitcnt->setDesc(TII->get(Opcode));
1180 return true;
1181}
1182
1183/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1184/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1185/// from \p Wait that were added by previous passes. Currently this pass
1186/// conservatively assumes that these preexisting waits are required for
1187/// correctness.
///
/// \returns true if any instruction was erased, re-encoded or promoted.
// NOTE(review): listing line 1190 (the rest of the parameter list, which must
// declare the `Wait` and `It` used below) is missing from this listing.
1188bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1189 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1191 assert(ST);
1192 assert(isNormalMode(MaxCounter));
1193
1194 bool Modified = false;
     // The first S_WAITCNT / S_WAITCNT_VSCNT that is kept; later ones of the
     // same kind are merged into it and erased.
1195 MachineInstr *WaitcntInstr = nullptr;
1196 MachineInstr *WaitcntVsCntInstr = nullptr;
1197
     // Visit the instructions in [OldWaitcntInstr, It). The early-inc range
     // allows the current instruction to be erased while iterating.
1198 for (auto &II :
1199 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1200 if (II.isMetaInstruction())
1201 continue;
1202
     // Opcode is the non-soft form of II's opcode. If it differs from the
     // actual opcode, II is a soft waitcnt, and only those may be simplified
     // (and never when OptNone is set).
1203 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1204 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1205
1206 // Update required wait count. If this is a soft waitcnt (= it was added
1207 // by an earlier pass), it may be entirely removed.
1208 if (Opcode == AMDGPU::S_WAITCNT) {
1209 unsigned IEnc = II.getOperand(0).getImm();
1210 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1211 if (TrySimplify)
1212 ScoreBrackets.simplifyWaitcnt(OldWait);
1213 Wait = Wait.combined(OldWait);
1214
1215 // Merge consecutive waitcnt of the same type by erasing multiples.
1216 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1217 II.eraseFromParent();
1218 Modified = true;
1219 } else
1220 WaitcntInstr = &II;
1221 } else {
     // Anything else in the range is expected to be an S_WAITCNT_VSCNT whose
     // register operand is SGPR_NULL.
1222 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1223 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1224
1225 unsigned OldVSCnt =
1226 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1227 if (TrySimplify)
1228 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
     // Keep the stricter (smaller) of the requested and preexisting counts.
1229 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1230
1231 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1232 II.eraseFromParent();
1233 Modified = true;
1234 } else
1235 WaitcntVsCntInstr = &II;
1236 }
1237 }
1238
     // Rewrite the surviving S_WAITCNT with the combined wait and record the
     // wait in the score brackets.
     // NOTE(review): listing line 1241 (the final argument of the
     // updateOperandIfDifferent call, i.e. the new encoding) is missing from
     // this listing.
1239 if (WaitcntInstr) {
1240 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1242 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1243
1244 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1245 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1246 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
     // These counters are now covered by WaitcntInstr; ~0u means "no wait".
1247 Wait.LoadCnt = ~0u;
1248 Wait.ExpCnt = ~0u;
1249 Wait.DsCnt = ~0u;
1250
1251 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1252 ? dbgs()
1253 << "applyPreexistingWaitcnt\n"
1254 << "New Instr at block end: " << *WaitcntInstr << '\n'
1255 : dbgs() << "applyPreexistingWaitcnt\n"
1256 << "Old Instr: " << *It
1257 << "New Instr: " << *WaitcntInstr << '\n');
1258 }
1259
     // Same for the surviving S_WAITCNT_VSCNT and the store counter.
1260 if (WaitcntVsCntInstr) {
1261 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1262 AMDGPU::OpName::simm16, Wait.StoreCnt);
1263 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1264
1265 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1266 Wait.StoreCnt = ~0u;
1267
1268 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1269 ? dbgs() << "applyPreexistingWaitcnt\n"
1270 << "New Instr at block end: " << *WaitcntVsCntInstr
1271 << '\n'
1272 : dbgs() << "applyPreexistingWaitcnt\n"
1273 << "Old Instr: " << *It
1274 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1275 }
1276
1277 return Modified;
1278}
1279
1280/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1281/// required counters in \p Wait
///
/// \returns true if at least one instruction was inserted.
// NOTE(review): listing lines 1283-1284 (the parameter list, which must
// declare the `Block`, `It` and `Wait` used below) are missing from this
// listing.
1282bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1285 assert(ST);
1286 assert(isNormalMode(MaxCounter));
1287
1288 bool Modified = false;
1289 const DebugLoc &DL = Block.findDebugLoc(It);
1290
1291 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1292 // single instruction while VScnt has its own instruction.
1293 if (Wait.hasWaitExceptStoreCnt()) {
1294 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
     // SWaitInst is only read inside LLVM_DEBUG, hence [[maybe_unused]].
1295 [[maybe_unused]] auto SWaitInst =
1296 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1297 Modified = true;
1298
1299 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1300 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1301 dbgs() << "New Instr: " << *SWaitInst << '\n');
1302 }
1303
1304 if (Wait.hasWaitStoreCnt()) {
     // A store-counter wait is only expected on subtargets that have vscnt.
1305 assert(ST->hasVscnt());
1306
     // S_WAITCNT_VSCNT takes an (undef) SGPR_NULL register operand followed
     // by the count.
1307 [[maybe_unused]] auto SWaitInst =
1308 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1309 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1310 .addImm(Wait.StoreCnt);
1311 Modified = true;
1312
1313 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1314 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1315 dbgs() << "New Instr: " << *SWaitInst << '\n');
1316 }
1317
1318 return Modified;
1319}
1320
// Build a Waitcnt whose first three counts are zero. The fourth count is zero
// only when IncludeVSCnt is set and the subtarget has vscnt; otherwise it is
// ~0u (no wait).
// NOTE(review): the return-type line (listing line 1321) is missing from this
// listing; the fourth constructor argument is presumably the store counter --
// confirm against the AMDGPU::Waitcnt constructor.
1322WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1323 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1324}
1325
// Build a seven-count Waitcnt in which every count is zero, except that the
// fourth count is ~0u (no wait) when IncludeVSCnt is false.
// NOTE(review): the return-type line (listing line 1326) is missing from this
// listing; the fourth constructor argument is presumably the store counter --
// confirm against the AMDGPU::Waitcnt constructor.
1327WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1328 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1329}
1330
1331/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1332/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1333/// were added by previous passes. Currently this pass conservatively
1334/// assumes that these preexisting waits are required for correctness.
///
/// \returns true if any instruction was erased, re-encoded or promoted.
// NOTE(review): several lines are missing from this listing: 1337 (the rest
// of the parameter list, which must declare `Wait` and `It`), 1367 and 1375
// (the declarations of `OldWait`, presumably decoding `OldEnc`), and 1465
// (the declaration of `WaitsToErase`). Confirm against the real source file.
1335bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1336 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1338 assert(ST);
1339 assert(!isNormalMode(MaxCounter));
1340
1341 bool Modified = false;
     // First instruction seen of each kind: the two combined forms, plus one
     // slot per single-counter wait, indexed by counter type.
1342 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1343 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1344 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1345
     // Visit the instructions in [OldWaitcntInstr, It). The early-inc range
     // allows the current instruction to be erased while iterating.
1346 for (auto &II :
1347 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1348 if (II.isMetaInstruction())
1349 continue;
1350
     // Points at the slot (one of the three variables above) that tracks this
     // kind of wait instruction.
1351 MachineInstr **UpdatableInstr;
1352
1353 // Update required wait count. If this is a soft waitcnt (= it was added
1354 // by an earlier pass), it may be entirely removed.
1355
1356 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1357 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1358
1359 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1360 // attempt to do more than that either.
1361 if (Opcode == AMDGPU::S_WAITCNT)
1362 continue;
1363
1364 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1365 unsigned OldEnc =
1366 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1368 if (TrySimplify)
1369 ScoreBrackets.simplifyWaitcnt(OldWait);
1370 Wait = Wait.combined(OldWait);
1371 UpdatableInstr = &CombinedLoadDsCntInstr;
1372 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1373 unsigned OldEnc =
1374 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1376 if (TrySimplify)
1377 ScoreBrackets.simplifyWaitcnt(OldWait);
1378 Wait = Wait.combined(OldWait);
1379 UpdatableInstr = &CombinedStoreDsCntInstr;
1380 } else {
     // Every remaining opcode must be a single-counter S_WAIT_*CNT.
1381 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1382 assert(CT.has_value());
1383 unsigned OldCnt =
1384 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1385 if (TrySimplify)
1386 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1387 addWait(Wait, CT.value(), OldCnt);
1388 UpdatableInstr = &WaitInstrs[CT.value()];
1389 }
1390
1391 // Merge consecutive waitcnt of the same type by erasing multiples.
1392 if (!*UpdatableInstr) {
1393 *UpdatableInstr = &II;
1394 } else {
1395 II.eraseFromParent();
1396 Modified = true;
1397 }
1398 }
1399
1400 if (CombinedLoadDsCntInstr) {
1401 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1402 // to be waited for. Otherwise, let the instruction be deleted so
1403 // the appropriate single counter wait instruction can be inserted
1404 // instead, when new S_WAIT_*CNT instructions are inserted by
1405 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1406 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1407 // the loop below that deals with single counter instructions.
1408 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1409 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1410 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1411 AMDGPU::OpName::simm16, NewEnc);
1412 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1413 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1414 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
     // Both counters are now covered by the combined instruction.
1415 Wait.LoadCnt = ~0u;
1416 Wait.DsCnt = ~0u;
1417
1418 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1419 ? dbgs() << "applyPreexistingWaitcnt\n"
1420 << "New Instr at block end: "
1421 << *CombinedLoadDsCntInstr << '\n'
1422 : dbgs() << "applyPreexistingWaitcnt\n"
1423 << "Old Instr: " << *It << "New Instr: "
1424 << *CombinedLoadDsCntInstr << '\n');
1425 } else {
1426 CombinedLoadDsCntInstr->eraseFromParent();
1427 Modified = true;
1428 }
1429 }
1430
1431 if (CombinedStoreDsCntInstr) {
1432 // Similarly for S_WAIT_STORECNT_DSCNT.
1433 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1434 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1435 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1436 AMDGPU::OpName::simm16, NewEnc);
1437 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1438 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1439 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1440 Wait.StoreCnt = ~0u;
1441 Wait.DsCnt = ~0u;
1442
1443 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1444 ? dbgs() << "applyPreexistingWaitcnt\n"
1445 << "New Instr at block end: "
1446 << *CombinedStoreDsCntInstr << '\n'
1447 : dbgs() << "applyPreexistingWaitcnt\n"
1448 << "Old Instr: " << *It << "New Instr: "
1449 << *CombinedStoreDsCntInstr << '\n');
1450 } else {
1451 CombinedStoreDsCntInstr->eraseFromParent();
1452 Modified = true;
1453 }
1454 }
1455
1456 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1457 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1458 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1459 // instructions so that createNewWaitcnt() will create new combined
1460 // instructions to replace them.
1461
1462 if (Wait.DsCnt != ~0u) {
1463 // This is a vector of addresses in WaitInstrs pointing to instructions
1464 // that should be removed if they are present.
1466
1467 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1468 // both) need to be waited for, ensure that there are no existing
1469 // individual wait count instructions for these.
1470
1471 if (Wait.LoadCnt != ~0u) {
1472 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1473 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1474 } else if (Wait.StoreCnt != ~0u) {
1475 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1476 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1477 }
1478
     // Erase the selected instructions and clear their slots so the loop
     // below skips them.
1479 for (MachineInstr **WI : WaitsToErase) {
1480 if (!*WI)
1481 continue;
1482
1483 (*WI)->eraseFromParent();
1484 *WI = nullptr;
1485 Modified = true;
1486 }
1487 }
1488
     // Finally handle each surviving single-counter instruction: update it
     // when its counter still needs a wait, erase it otherwise.
1489 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1490 if (!WaitInstrs[CT])
1491 continue;
1492
1493 unsigned NewCnt = getWait(Wait, CT);
1494 if (NewCnt != ~0u) {
1495 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1496 AMDGPU::OpName::simm16, NewCnt);
1497 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1498
1499 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1500 setNoWait(Wait, CT);
1501
1502 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1503 ? dbgs() << "applyPreexistingWaitcnt\n"
1504 << "New Instr at block end: " << *WaitInstrs[CT]
1505 << '\n'
1506 : dbgs() << "applyPreexistingWaitcnt\n"
1507 << "Old Instr: " << *It
1508 << "New Instr: " << *WaitInstrs[CT] << '\n');
1509 } else {
1510 WaitInstrs[CT]->eraseFromParent();
1511 Modified = true;
1512 }
1513 }
1514
1515 return Modified;
1516}
1517
1518/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// \returns true if at least one instruction was inserted.
// NOTE(review): listing lines 1520-1521 (the parameter list, which must
// declare the `Block`, `It` and `Wait` used below) are missing from this
// listing. `Wait` is modified in this body, so it is not a const reference.
1519bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1522 assert(ST);
1523 assert(!isNormalMode(MaxCounter));
1524
1525 bool Modified = false;
1526 const DebugLoc &DL = Block.findDebugLoc(It);
1527
1528 // Check for opportunities to use combined wait instructions.
1529 if (Wait.DsCnt != ~0u) {
1530 MachineInstr *SWaitInst = nullptr;
1531
     // DScnt is paired with LOADcnt when that is needed, otherwise with
     // STOREcnt. The counters emitted in a combined instruction are reset to
     // ~0u so the per-counter loop below skips them.
1532 if (Wait.LoadCnt != ~0u) {
1533 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1534
1535 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1536 .addImm(Enc);
1537
1538 Wait.LoadCnt = ~0u;
1539 Wait.DsCnt = ~0u;
1540 } else if (Wait.StoreCnt != ~0u) {
1541 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1542
1543 SWaitInst =
1544 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1545 .addImm(Enc);
1546
1547 Wait.StoreCnt = ~0u;
1548 Wait.DsCnt = ~0u;
1549 }
1550
1551 if (SWaitInst) {
1552 Modified = true;
1553
1554 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1555 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1556 dbgs() << "New Instr: " << *SWaitInst << '\n');
1557 }
1558 }
1559
1560 // Generate an instruction for any remaining counter that needs
1561 // waiting for.
1562
1563 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1564 unsigned Count = getWait(Wait, CT);
     // ~0u means this counter needs no wait.
1565 if (Count == ~0u)
1566 continue;
1567
     // instrsForExtendedCounterTypes is indexed by counter type to pick the
     // opcode. SWaitInst is only read inside LLVM_DEBUG.
1568 [[maybe_unused]] auto SWaitInst =
1569 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1570 .addImm(Count);
1571
1572 Modified = true;
1573
1574 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1575 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1576 dbgs() << "New Instr: " << *SWaitInst << '\n');
1577 }
1578
1579 return Modified;
1580}
1581
1582static bool readsVCCZ(const MachineInstr &MI) {
1583 unsigned Opc = MI.getOpcode();
1584 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1585 !MI.getOperand(1).isUndef();
1586}
1587
1588/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
// NOTE(review): the signature line (listing line 1589) is missing from this
// listing; the call sites invoke this as callWaitsOnFunctionEntry(MI).
1590 // Currently all conventions wait, but this may not always be the case.
1591 //
1592 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1593 // sense to omit the wait and do it in the caller.
1594 return true;
1595}
1596
1597/// \returns true if the callee is expected to wait for any outstanding waits
1598/// before returning.
1599static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1600
1601/// Generate s_waitcnt instruction to be placed before cur_Inst.
1602/// Instructions of a given type are returned in order,
1603/// but instructions of different types can complete out of order.
1604/// We rely on this in-order completion
1605/// and simply assign a score to the memory access instructions.
1606/// We keep track of the active "score bracket" to determine
1607/// if an access of a memory read requires an s_waitcnt
1608/// and if so what the value of each counter is.
1609/// The "score bracket" is bound by the lower bound and upper bound
1610/// scores (*_score_LB and *_score_ub respectively).
1611/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1612/// flush the vmcnt counter here.
1613bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1614 WaitcntBrackets &ScoreBrackets,
1615 MachineInstr *OldWaitcntInstr,
1616 bool FlushVmCnt) {
1617 setForceEmitWaitcnt();
1618
1619 if (MI.isMetaInstruction())
1620 return false;
1621
1623
1624 // FIXME: This should have already been handled by the memory legalizer.
1625 // Removing this currently doesn't affect any lit tests, but we need to
1626 // verify that nothing was relying on this. The number of buffer invalidates
1627 // being handled here should not be expanded.
1628 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1629 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1630 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1631 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1632 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1633 Wait.LoadCnt = 0;
1634 }
1635
1636 // All waits must be resolved at call return.
1637 // NOTE: this could be improved with knowledge of all call sites or
1638 // with knowledge of the called routines.
1639 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1640 MI.getOpcode() == AMDGPU::SI_RETURN ||
1641 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1642 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1643 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1644 }
1645 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1646 // stores. In this case it can be useful to send a message to explicitly
1647 // release all VGPRs before the stores have completed, but it is only safe to
1648 // do this if:
1649 // * there are no outstanding scratch stores
1650 // * we are not in Dynamic VGPR mode
1651 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1652 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1653 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1654 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1655 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1656 ReleaseVGPRInsts.insert(&MI);
1657 }
1658 // Resolve vm waits before gs-done.
1659 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1660 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1661 ST->hasLegacyGeometry() &&
1662 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1664 Wait.LoadCnt = 0;
1665 }
1666
1667 // Export & GDS instructions do not read the EXEC mask until after the export
1668 // is granted (which can occur well after the instruction is issued).
1669 // The shader program must flush all EXP operations on the export-count
1670 // before overwriting the EXEC mask.
1671 else {
1672 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1673 // Export and GDS are tracked individually, either may trigger a waitcnt
1674 // for EXEC.
1675 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1676 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1677 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1678 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1679 Wait.ExpCnt = 0;
1680 }
1681 }
1682
1683 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1684 // The function is going to insert a wait on everything in its prolog.
1685 // This still needs to be careful if the call target is a load (e.g. a GOT
1686 // load). We also need to check WAW dependency with saved PC.
1688
1689 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1690 if (CallAddrOp.isReg()) {
1691 RegInterval CallAddrOpInterval =
1692 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOp);
1693
1694 ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
1695 Wait);
1696
1697 if (const auto *RtnAddrOp =
1698 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
1699 RegInterval RtnAddrOpInterval =
1700 ScoreBrackets.getRegInterval(&MI, MRI, TRI, *RtnAddrOp);
1701
1702 ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
1703 Wait);
1704 }
1705 }
1706 } else {
1707 // FIXME: Should not be relying on memoperands.
1708 // Look at the source operands of every instruction to see if
1709 // any of them results from a previous memory operation that affects
1710 // its current usage. If so, an s_waitcnt instruction needs to be
1711 // emitted.
1712 // If the source operand was defined by a load, add the s_waitcnt
1713 // instruction.
1714 //
1715 // Two cases are handled for destination operands:
1716 // 1) If the destination operand was defined by a load, add the s_waitcnt
1717 // instruction to guarantee the right WAW order.
1718 // 2) If a destination operand that was used by a recent export/store ins,
1719 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1720
1721 for (const MachineMemOperand *Memop : MI.memoperands()) {
1722 const Value *Ptr = Memop->getValue();
1723 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1724 addWait(Wait, SmemAccessCounter, 0);
1725 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1726 SLoadAddresses.erase(Ptr);
1727 }
1728 unsigned AS = Memop->getAddrSpace();
1730 continue;
1731 // No need to wait before load from VMEM to LDS.
1732 if (TII->mayWriteLDSThroughDMA(MI))
1733 continue;
1734
1735 // LOAD_CNT is only relevant to vgpr or LDS.
1736 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1737 bool FoundAliasingStore = false;
1738 // Only objects with alias scope info were added to the LDSDMAStores array.
1739 // In the absence of the scope info we will not be able to disambiguate
1740 // aliasing here. There is no need to try searching for a corresponding
1741 // store slot. This is conservatively correct because in that case we
1742 // will produce a wait using the first (general) LDS DMA wait slot which
1743 // will wait on all of them anyway.
1744 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1745 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1746 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1747 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1748 FoundAliasingStore = true;
1749 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1750 }
1751 }
1752 }
1753 if (!FoundAliasingStore)
1754 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1755 if (Memop->isStore()) {
1756 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1757 }
1758 }
1759
1760 // Loop over use and def operands.
1761 for (const MachineOperand &Op : MI.operands()) {
1762 if (!Op.isReg())
1763 continue;
1764
1765 // If the instruction does not read tied source, skip the operand.
1766 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1767 continue;
1768
1769 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, Op);
1770
1771 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1772 if (IsVGPR) {
1773 // Implicit VGPR defs and uses are never a part of the memory
1774 // instructions description and usually present to account for
1775 // super-register liveness.
1776 // TODO: Most of the other instructions also have implicit uses
1777 // for the liveness accounting only.
1778 if (Op.isImplicit() && MI.mayLoadOrStore())
1779 continue;
1780
1781 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1782 // previous write and this write are the same type of VMEM
1783 // instruction, in which case they are (in some architectures)
1784 // guaranteed to write their results in order anyway.
1785 if (Op.isUse() || !updateVMCntOnly(MI) ||
1786 ScoreBrackets.hasOtherPendingVmemTypes(Interval,
1787 getVmemType(MI)) ||
1788 !ST->hasVmemWriteVgprInOrder()) {
1789 ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
1790 ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
1791 ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
1792 ScoreBrackets.clearVgprVmemTypes(Interval);
1793 }
1794 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1795 ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
1796 }
1797 ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
1798 } else {
1799 ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
1800 }
1801 }
1802 }
1803 }
1804
1805 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1806 // not, we need to ensure the subtarget is capable of backing off barrier
1807 // instructions in case there are any outstanding memory operations that may
1808 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1809 if (TII->isBarrierStart(MI.getOpcode()) &&
1810 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1811 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1812 }
1813
1814 // TODO: Remove this work-around, enable the assert for Bug 457939
1815 // after fixing the scheduler. Also, the Shader Compiler code is
1816 // independent of target.
1817 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1818 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1819 Wait.DsCnt = 0;
1820 }
1821 }
1822
1823 // Verify that the wait is actually needed.
1824 ScoreBrackets.simplifyWaitcnt(Wait);
1825
1826 // When forcing emit, we need to skip terminators because that would break the
1827 // terminators of the MBB if we emit a waitcnt between terminators.
1828 if (ForceEmitZeroFlag && !MI.isTerminator())
1829 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1830
1831 if (ForceEmitWaitcnt[LOAD_CNT])
1832 Wait.LoadCnt = 0;
1833 if (ForceEmitWaitcnt[EXP_CNT])
1834 Wait.ExpCnt = 0;
1835 if (ForceEmitWaitcnt[DS_CNT])
1836 Wait.DsCnt = 0;
1837 if (ForceEmitWaitcnt[SAMPLE_CNT])
1838 Wait.SampleCnt = 0;
1839 if (ForceEmitWaitcnt[BVH_CNT])
1840 Wait.BvhCnt = 0;
1841 if (ForceEmitWaitcnt[KM_CNT])
1842 Wait.KmCnt = 0;
1843
1844 if (FlushVmCnt) {
1845 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1846 Wait.LoadCnt = 0;
1847 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1848 Wait.SampleCnt = 0;
1849 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1850 Wait.BvhCnt = 0;
1851 }
1852
1853 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1854 OldWaitcntInstr);
1855}
1856
// Emit the wait described by \p Wait. When \p OldWaitcntInstr is non-null the
// wait is first offered to WCG->applyPreexistingWaitcnt; it is then recorded in
// \p ScoreBrackets, an ExpCnt requirement may be folded into the waitexp
// operand of the instruction at `It`, and the remainder is passed to
// WCG->createNewWaitcnt. Returns true if any of these steps changed something.
// NOTE(review): the embedded listing numbers skip 1858-1859 and 1876, so the
// declarations of `It` and `Block` (presumably parameters) and the rest of the
// VINTERP condition are not visible here - confirm against the full source.
1857bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1860 WaitcntBrackets &ScoreBrackets,
1861 MachineInstr *OldWaitcntInstr) {
1862 bool Modified = false;
1863
1864 if (OldWaitcntInstr)
1865 // Try to merge the required wait with preexisting waitcnt instructions.
1866 // Also erase redundant waitcnt.
1867 Modified =
1868 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1869
1870 // Any counts that could have been applied to any existing waitcnt
1871 // instructions will have been done so, now deal with any remaining.
1872 ScoreBrackets.applyWaitcnt(Wait);
1873
1874 // ExpCnt can be merged into VINTERP.
1875 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1877 MachineOperand *WaitExp =
1878 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
// Only ever lower the immediate already present on the instruction.
1879 if (Wait.ExpCnt < WaitExp->getImm()) {
1880 WaitExp->setImm(Wait.ExpCnt);
1881 Modified = true;
1882 }
// ~0u is the value the guard above treats as "nothing to do" for ExpCnt, so
// the requirement is not handed on to createNewWaitcnt below.
1883 Wait.ExpCnt = ~0u;
1884
1885 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1886 << "Update Instr: " << *It);
1887 }
1888
1889 if (WCG->createNewWaitcnt(Block, It, Wait))
1890 Modified = true;
1891
1892 return Modified;
1893}
1894
1895// This is a flat memory operation. Check to see if it has memory tokens other
1896// than LDS. Other address spaces supported by flat memory operations involve
1897// global memory.
// Returns true when MI has no memory operands, or when at least one memory
// operand has an address space other than LOCAL_ADDRESS; returns false only
// when every memory operand is LOCAL_ADDRESS. Asserts that MI is a FLAT
// instruction that uses VM_CNT.
1898bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1899 assert(TII->isFLAT(MI));
1900
1901 // All flat instructions use the VMEM counter.
1902 assert(TII->usesVM_CNT(MI));
1903
1904 // If there are no memory operands then conservatively assume the flat
1905 // operation may access VMEM.
1906 if (MI.memoperands_empty())
1907 return true;
1908
1909 // See if any memory operand specifies an address space that involves VMEM.
1910 // Flat operations only supported FLAT, LOCAL (LDS), or address spaces
1911 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1912 // (GDS) address space is not supported by flat operations. Therefore, simply
1913 // return true unless only the LDS address space is found.
1914 for (const MachineMemOperand *Memop : MI.memoperands()) {
1915 unsigned AS = Memop->getAddrSpace();
1917 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1918 return true;
1919 }
1920
1921 return false;
1922}
1923
1924// This is a flat memory operation. Check to see if it has memory tokens for
1925// either LDS or FLAT.
// Returns false early when MI does not use LGKM_CNT or when the subtarget has
// tgsplit enabled; returns true when MI has no memory operands.
// NOTE(review): embedded listing line 1945 is missing between the `AS`
// declaration and `return true` inside the loop. As shown, the loop returns
// true for the first memory operand unconditionally; the header comment implies
// a test of AS against the LDS / FLAT address spaces - confirm against the full
// source.
1926bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1927 assert(TII->isFLAT(MI));
1928
1929 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1930 if (!TII->usesLGKM_CNT(MI))
1931 return false;
1932
1933 // If in tgsplit mode then there can be no use of LDS.
1934 if (ST->isTgSplitEnabled())
1935 return false;
1936
1937 // If there are no memory operands then conservatively assume the flat
1938 // operation may access LDS.
1939 if (MI.memoperands_empty())
1940 return true;
1941
1942 // See if any memory operand specifies an address space that involves LDS.
1943 for (const MachineMemOperand *Memop : MI.memoperands()) {
1944 unsigned AS = Memop->getAddrSpace();
1946 return true;
1947 }
1948
1949 return false;
1950}
1951
1952// This is a flat memory operation. Check to see if it has memory tokens for
1953// either scratch or FLAT.
// Decision order: FLAT scratch instructions -> true; FLAT global instructions
// -> false; no memory operands -> true (conservative); otherwise true iff some
// memory operand has address space PRIVATE_ADDRESS or FLAT_ADDRESS.
1954bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
1955 const MachineInstr &MI) const {
1956 assert(TII->isFLAT(MI));
1957
1958 // SCRATCH instructions always access scratch.
1959 if (TII->isFLATScratch(MI))
1960 return true;
1961
1962 // GLOBAL instructions never access scratch.
1963 if (TII->isFLATGlobal(MI))
1964 return false;
1965
1966 // If there are no memory operands then conservatively assume the flat
1967 // operation may access scratch.
1968 if (MI.memoperands_empty())
1969 return true;
1970
1971 // See if any memory operand specifies an address space that involves scratch.
1972 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
1973 unsigned AS = Memop->getAddrSpace();
1974 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
1975 });
1976}
1977
// Body of the helper that returns true when the opcode of Inst is GLOBAL_INV,
// GLOBAL_WB or GLOBAL_WBINV.
// NOTE(review): the line carrying this function's signature (embedded listing
// line 1978) is missing from this listing - confirm against the full source.
1979 auto Opc = Inst.getOpcode();
1980 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
1981 Opc == AMDGPU::GLOBAL_WBINV;
1982}
1983
// Record in \p ScoreBrackets the counter events produced by \p Inst. The
// instruction is classified by the chain of predicates below (DS, FLAT, VMEM,
// SMRD, call, LDSDIR, VINTERP, EXP, then a switch over a few scalar opcodes)
// and the matching updateByEvent / applyWaitcnt call is made. Instructions that
// match none of the cases leave the brackets untouched.
// NOTE(review): embedded listing lines 2028 and 2055 are missing, so the tail
// of the VMEM condition and the first test on the EXP target are not visible
// here - confirm against the full source.
1984void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1985 WaitcntBrackets *ScoreBrackets) {
1986 // Now look at the instruction opcode. If it is a memory access
1987 // instruction, update the upper-bound of the appropriate counter's
1988 // bracket and the destination operand scores.
1989 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
1990
1991 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
// DS instructions are split into GDS (always-GDS opcode or gds modifier set)
// and LDS accesses; the GDS case records two events.
1992 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1993 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1994 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1995 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1996 } else {
1997 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1998 }
1999 } else if (TII->isFLAT(Inst)) {
2000 // TODO: Track this properly.
2001 if (isCacheInvOrWBInst(Inst))
2002 return;
2003
2004 assert(Inst.mayLoadOrStore());
2005
// Counts how many of the two classes (VMEM, LDS) this flat access may touch.
2006 int FlatASCount = 0;
2007
2008 if (mayAccessVMEMThroughFlat(Inst)) {
2009 ++FlatASCount;
2010 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2011 Inst);
2012 }
2013
2014 if (mayAccessLDSThroughFlat(Inst)) {
2015 ++FlatASCount;
2016 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2017 }
2018
2019 // A Flat memory operation must access at least one address space.
2020 assert(FlatASCount);
2021
2022 // This is a flat memory operation that access both VMEM and LDS, so note it
2023 // - it will require that both the VM and LGKM be flushed to zero if it is
2024 // pending when a VM or LGKM dependency occurs.
2025 if (FlatASCount > 1)
2026 ScoreBrackets->setPendingFlat();
2027 } else if (SIInstrInfo::isVMEM(Inst) &&
2029 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2030 Inst);
2031
// Stores and returning atomics additionally record a VMW_GPR_LOCK event on
// subtargets where ST->vmemWriteNeedsExpWaitcnt() holds.
2032 if (ST->vmemWriteNeedsExpWaitcnt() &&
2033 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2034 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2035 }
2036 } else if (TII->isSMRD(Inst)) {
2037 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2038 } else if (Inst.isCall()) {
2039 if (callWaitsOnFunctionReturn(Inst)) {
2040 // Act as a wait on everything
2041 ScoreBrackets->applyWaitcnt(
2042 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2043 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2044 } else {
2045 // May need to wait for anything.
2046 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2047 }
2048 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2049 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2050 } else if (TII->isVINTERP(Inst)) {
// The waitexp immediate on the instruction is applied as an EXP_CNT wait.
2051 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2052 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2053 } else if (SIInstrInfo::isEXP(Inst)) {
// The export target immediate selects which EXP_* event is recorded.
2054 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2056 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2057 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2058 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2059 else
2060 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2061 } else {
2062 switch (Inst.getOpcode()) {
2063 case AMDGPU::S_SENDMSG:
2064 case AMDGPU::S_SENDMSG_RTN_B32:
2065 case AMDGPU::S_SENDMSG_RTN_B64:
2066 case AMDGPU::S_SENDMSGHALT:
2067 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2068 break;
// The opcodes below are recorded with the same event as scalar memory reads.
2069 case AMDGPU::S_MEMTIME:
2070 case AMDGPU::S_MEMREALTIME:
2071 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2072 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2073 case AMDGPU::S_BARRIER_LEAVE:
2074 case AMDGPU::S_GET_BARRIER_STATE_M0:
2075 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2076 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2077 break;
2078 }
2079 }
2080}
2081
/// Merge a single score from another bracket set into \p Score.
///
/// Each side is first rebased: a score at or below its lower bound (M.OldLB
/// for \p Score, M.OtherLB for \p OtherScore) becomes 0, otherwise the
/// matching shift (M.MyShift / M.OtherShift) is added. \p Score is then set to
/// the larger of the two rebased values.
///
/// \returns true if the rebased other score is strictly greater than the
/// rebased own score.
2082bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2083 unsigned OtherScore) {
2084 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2085 unsigned OtherShifted =
2086 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2087 Score = std::max(MyShifted, OtherShifted);
2088 return OtherShifted > MyShifted;
2089}
2090
2091/// Merge the pending events and associated score brackets of \p Other into
2092/// this brackets status.
2093///
2094/// Returns whether the merge resulted in a change that requires tighter waits
2095/// (i.e. the merged brackets strictly dominate the original brackets).
2096bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2097 bool StrictDom = false;
2098
// The merged state must cover the registers tracked by either side.
2099 VgprUB = std::max(VgprUB, Other.VgprUB);
2100 SgprUB = std::max(SgprUB, Other.SgprUB);
2101
2102 for (auto T : inst_counter_types(MaxCounter)) {
2103 // Merge event flags for this counter
2104 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2105 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
// An event pending in Other but not here makes the result strictly stronger.
2106 if (OtherEvents & ~OldEvents)
2107 StrictDom = true;
2108 PendingEvents |= OtherEvents;
2109
2110 // Merge scores for this counter
// Keep this lower bound and extend the upper bound to fit the larger of the
// two pending ranges.
2111 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2112 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2113 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
// Unsigned wrap-around check on the addition above.
2114 if (NewUB < ScoreLBs[T])
2115 report_fatal_error("waitcnt score overflow");
2116
// Shifts that move each side's scores so both upper bounds land on NewUB.
2117 MergeInfo M;
2118 M.OldLB = ScoreLBs[T];
2119 M.OtherLB = Other.ScoreLBs[T];
2120 M.MyShift = NewUB - ScoreUBs[T];
2121 M.OtherShift = NewUB - Other.ScoreUBs[T];
2122
2123 ScoreUBs[T] = NewUB;
2124
2125 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2126
2127 for (int J = 0; J <= VgprUB; J++)
2128 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2129
// SGPR scores are merged only for the counter stored in SmemAccessCounter.
2130 if (T == SmemAccessCounter) {
2131 for (int J = 0; J <= SgprUB; J++)
2132 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2133 }
2134 }
2135
// Per-VGPR vmem type bits are unioned; any newly set bit counts as a change.
2136 for (int J = 0; J <= VgprUB; J++) {
2137 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2138 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2139 VgprVmemTypes[J] = NewVmemTypes;
2140 }
2141
2142 return StrictDom;
2143}
2144
// Returns true if \p Inst is one of the wait instructions this pass tracks.
// The opcode is first mapped through SIInstrInfo::getNonSoftWaitcntOpcode and
// then matched against: S_WAITCNT; S_WAITCNT_VSCNT whose operand 0 is the
// register SGPR_NULL; S_WAIT_LOADCNT_DSCNT; S_WAIT_STORECNT_DSCNT; or any
// opcode for which counterTypeForInstr returns a value.
2145static bool isWaitInstr(MachineInstr &Inst) {
2146 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2147 return Opcode == AMDGPU::S_WAITCNT ||
2148 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2149 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2150 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2151 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2152 counterTypeForInstr(Opcode).has_value();
2153}
2154
2155// Generate s_waitcnt instructions where needed.
// Walks the instructions of the block in order. Pre-existing wait instructions
// are skipped, remembering the first of each run in OldWaitcntInstr. For every
// other instruction the pass calls generateWaitcntInstBefore, updates the vccz
// tracking state, records SMRD load addresses, and calls
// updateEventWaitcntAfter. After the walk a final generateWaitcnt call is made
// at the end of the block. Returns true if anything was modified.
// NOTE(review): embedded listing lines 2157 and 2284 are missing, so the
// declarations of `Block` (presumably a parameter) and of the trailing `Wait`
// are not visible here - confirm against the full source.
2156bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2158 WaitcntBrackets &ScoreBrackets) {
2159 bool Modified = false;
2160
2161 LLVM_DEBUG({
2162 dbgs() << "*** Block" << Block.getNumber() << " ***";
2163 ScoreBrackets.dump();
2164 });
2165
2166 // Track the correctness of vccz through this basic block. There are two
2167 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2168 // ST->partialVCCWritesUpdateVCCZ().
2169 bool VCCZCorrect = true;
2170 if (ST->hasReadVCCZBug()) {
2171 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2172 // to vcc and then issued an smem load.
2173 VCCZCorrect = false;
2174 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2175 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2176 // to vcc_lo or vcc_hi.
2177 VCCZCorrect = false;
2178 }
2179
2180 // Walk over the instructions.
2181 MachineInstr *OldWaitcntInstr = nullptr;
2182
// The iterator is advanced explicitly inside the body (no increment clause).
2183 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2184 E = Block.instr_end();
2185 Iter != E;) {
2186 MachineInstr &Inst = *Iter;
2187
2188 // Track pre-existing waitcnts that were added in earlier iterations or by
2189 // the memory legalizer.
2190 if (isWaitInstr(Inst)) {
2191 if (!OldWaitcntInstr)
2192 OldWaitcntInstr = &Inst;
2193 ++Iter;
2194 continue;
2195 }
2196
// Request a vmcnt flush only at the first terminator of a block that
// isPreheaderToFlush accepts.
2197 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2198 isPreheaderToFlush(Block, ScoreBrackets);
2199
2200 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2201 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2202 FlushVmCnt);
2203 OldWaitcntInstr = nullptr;
2204
2205 // Restore vccz if it's not known to be correct already.
// Computed before the updates below so it reflects the state on entry to Inst.
2206 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2207
2208 // Don't examine operands unless we need to track vccz correctness.
2209 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2210 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2211 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2212 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2213 if (!ST->partialVCCWritesUpdateVCCZ())
2214 VCCZCorrect = false;
2215 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2216 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2217 // vccz bit, so when we detect that an instruction may read from a
2218 // corrupt vccz bit, we need to:
2219 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2220 // operations to complete.
2221 // 2. Restore the correct value of vccz by writing the current value
2222 // of vcc back to vcc.
2223 if (ST->hasReadVCCZBug() &&
2224 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2225 // Writes to vcc while there's an outstanding smem read may get
2226 // clobbered as soon as any read completes.
2227 VCCZCorrect = false;
2228 } else {
2229 // Writes to vcc will fix any incorrect value in vccz.
2230 VCCZCorrect = true;
2231 }
2232 }
2233 }
2234
2235 if (TII->isSMRD(Inst)) {
2236 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2237 // No need to handle invariant loads when avoiding WAR conflicts, as
2238 // there cannot be a vector store to the same memory location.
2239 if (!Memop->isInvariant()) {
2240 const Value *Ptr = Memop->getValue();
2241 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2242 }
2243 }
2244 if (ST->hasReadVCCZBug()) {
2245 // This smem read could complete and clobber vccz at any time.
2246 VCCZCorrect = false;
2247 }
2248 }
2249
2250 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2251
// With precise memory enabled, every load/store is followed by an all-zero
// wait (simplified against the brackets) placed at the next iterator position.
// The boolean passed to getAllZeroWaitcnt is true for stores that are not
// returning atomics.
2252 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2253 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2254 Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2255 ScoreBrackets.simplifyWaitcnt(Wait);
2256 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2257 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2258 }
2259
2260 LLVM_DEBUG({
2261 Inst.print(dbgs());
2262 ScoreBrackets.dump();
2263 });
2264
2265 // TODO: Remove this work-around after fixing the scheduler and enable the
2266 // assert above.
2267 if (RestoreVCCZ) {
2268 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2269 // bit is updated, so we can restore the bit by reading the value of
2270 // vcc and then writing it back to the register.
// The move opcode is chosen by wave size: S_MOV_B32 for wave32, else S_MOV_B64.
2271 BuildMI(Block, Inst, Inst.getDebugLoc(),
2272 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2273 TRI->getVCC())
2274 .addReg(TRI->getVCC());
2275 VCCZCorrect = true;
2276 Modified = true;
2277 }
2278
2279 ++Iter;
2280 }
2281
2282 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2283 // needed.
// This path applies only when the block has no terminator; blocks with a
// terminator are handled through FlushVmCnt inside the loop above.
2285 if (Block.getFirstTerminator() == Block.end() &&
2286 isPreheaderToFlush(Block, ScoreBrackets)) {
2287 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2288 Wait.LoadCnt = 0;
2289 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2290 Wait.SampleCnt = 0;
2291 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2292 Wait.BvhCnt = 0;
2293 }
2294
2295 // Combine or remove any redundant waitcnts at the end of the block.
2296 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2297 OldWaitcntInstr);
2298
2299 return Modified;
2300}
2301
2302// Return true if the given machine basic block is a preheader of a loop in
2303// which we want to flush the vmcnt counter, and false otherwise.
// The answer is cached per block in PreheadersToFlush: the entry is created
// with the value false on first query and only overwritten when the result is
// true, so later queries for the same block return the cached value.
// NOTE(review): embedded listing line 2310, which presumably declares `Succ`,
// is missing from this listing - confirm against the full source.
2304bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2305 WaitcntBrackets &ScoreBrackets) {
2306 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2307 if (!IsInserted)
2308 return Iterator->second;
2309
2311 if (!Succ)
2312 return false;
2313
2314 MachineLoop *Loop = MLI->getLoopFor(Succ);
2315 if (!Loop)
2316 return false;
2317
// MBB must be the preheader of the loop containing Succ, and the loop must
// pass the shouldFlushVmCnt heuristic.
2318 if (Loop->getLoopPreheader() == &MBB &&
2319 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2320 Iterator->second = true;
2321 return true;
2322 }
2323
2324 return false;
2325}
2326
// Returns true if \p MI is a VMEM instruction, or a FLAT instruction for which
// mayAccessVMEMThroughFlat returns true.
2327bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2328 return SIInstrInfo::isVMEM(MI) ||
2329 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2330}
2331
2332// Return true if it is better to flush the vmcnt counter in the preheader of
2333// the given loop. We currently decide to flush in two situations:
2334// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2335// vgpr containing a value that is loaded outside of the loop. (Only on
2336// targets with no vscnt counter).
2337// 2. The loop contains vmem load(s), but the loaded values are not used in the
2338// loop, and at least one use of a vgpr containing a value that is loaded
2339// outside of the loop.
// The loop body is scanned once, instruction by instruction. \p Brackets is
// only queried (register intervals, register scores, lower bounds) to decide
// whether a used vgpr still has a pending load score.
2340bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2341 WaitcntBrackets &Brackets) {
2342 bool HasVMemLoad = false;
2343 bool HasVMemStore = false;
2344 bool UsesVgprLoadedOutside = false;
// Register numbers seen so far as vgpr uses / as defs of vmem loads.
2345 DenseSet<Register> VgprUse;
2346 DenseSet<Register> VgprDef;
2347
2348 for (MachineBasicBlock *MBB : ML->blocks()) {
2349 for (MachineInstr &MI : *MBB) {
2350 if (isVMEMOrFlatVMEM(MI)) {
2351 if (MI.mayLoad())
2352 HasVMemLoad = true;
2353 if (MI.mayStore())
2354 HasVMemStore = true;
2355 }
2356 for (const MachineOperand &Op : MI.all_uses()) {
2357 if (!TRI->isVectorRegister(*MRI, Op.getReg()))
2358 continue;
2359 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2360 // Vgpr use
2361 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2362 // If we find a register that is loaded inside the loop, 1. and 2.
2363 // are invalidated and we can exit.
2364 if (VgprDef.contains(RegNo))
2365 return false;
2366 VgprUse.insert(RegNo);
2367 // If at least one of Op's registers is in the score brackets, the
2368 // value is likely loaded outside of the loop.
2369 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2370 Brackets.getScoreLB(LOAD_CNT) ||
2371 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2372 Brackets.getScoreLB(SAMPLE_CNT) ||
2373 Brackets.getRegScore(RegNo, BVH_CNT) >
2374 Brackets.getScoreLB(BVH_CNT)) {
2375 UsesVgprLoadedOutside = true;
// Leaves only the per-register loop; remaining registers of this operand are
// not examined or added to VgprUse.
2376 break;
2377 }
2378 }
2379 }
2380
2381 // VMem load vgpr def
2382 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2383 for (const MachineOperand &Op : MI.all_defs()) {
2384 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op);
2385 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2386 // If we find a register that is loaded inside the loop, 1. and 2.
2387 // are invalidated and we can exit.
2388 if (VgprUse.contains(RegNo))
2389 return false;
2390 VgprDef.insert(RegNo);
2391 }
2392 }
2393 }
2394 }
2395 }
// Situation 1 from the header comment.
2396 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2397 return true;
// Situation 2, additionally gated on ST->hasVmemWriteVgprInOrder().
2398 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2399}
2400
// Pass entry point. Caches subtarget and analysis pointers, selects the waitcnt
// generator according to ST->hasExtendedWaitCounts(), computes the counter
// limits and register encoding bounds, and then repeatedly processes blocks
// with insertWaitcntInBlock until no further sweep is requested. Afterwards it
// inserts S_DCACHE_WB before program-end instructions when scalar stores are
// present, and emits the instructions for ReleaseVGPRInsts. Returns true if
// the function was modified.
// NOTE(review): embedded listing lines 2406, 2412, 2462, 2501, 2561, 2614 and
// 2623 are missing. Several names used below (e.g. IV, I, EndPgmBlocks) have no
// visible declaration, and the block-list loop header, the end of the VGPR
// dealloc condition and the S_SENDMSG operand are not shown - confirm against
// the full source.
2401bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2402 ST = &MF.getSubtarget<GCNSubtarget>();
2403 TII = ST->getInstrInfo();
2404 TRI = &TII->getRegisterInfo();
2405 MRI = &MF.getRegInfo();
2407 MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2408 PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
// Alias analysis is optional; AA is only assigned when the wrapper pass exists.
2409 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2410 AA = &AAR->getAAResults();
2411
2413
// Pick the generator and the number of counters for this subtarget.
2414 if (ST->hasExtendedWaitCounts()) {
2415 MaxCounter = NUM_EXTENDED_INST_CNTS;
2416 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2417 WCG = &WCGGFX12Plus;
2418 } else {
2419 MaxCounter = NUM_NORMAL_INST_CNTS;
2420 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
2421 WCG = &WCGPreGFX12;
2422 }
2423
2424 for (auto T : inst_counter_types())
2425 ForceEmitWaitcnt[T] = false;
2426
2427 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2428
2429 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2430
// Per-counter maximum values, taken from the bit masks for IV.
2431 HardwareLimits Limits = {};
2432 if (ST->hasExtendedWaitCounts()) {
2433 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2434 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2435 } else {
2436 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2437 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2438 }
2439 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2440 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2441 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2442 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2443 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2444
2445 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2446 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2447 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2448 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2449
// First and last encoding values of the addressable VGPR and SGPR ranges.
2450 RegisterEncoding Encoding = {};
2451 Encoding.VGPR0 =
2452 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2453 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2454 Encoding.SGPR0 =
2455 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2456 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2457
2458 BlockInfos.clear();
2459 bool Modified = false;
2460
2461 MachineBasicBlock &EntryBB = MF.front();
2463
2464 if (!MFI->isEntryFunction()) {
2465 // Wait for any outstanding memory operations that the input registers may
2466 // depend on. We can't track them and it's better to do the wait after the
2467 // costly call sequence.
2468
2469 // TODO: Could insert earlier and schedule more liberally with operations
2470 // that only use caller preserved registers.
// Advance I past leading PHIs and meta instructions in the entry block.
2471 for (MachineBasicBlock::iterator E = EntryBB.end();
2472 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2473 ;
2474
2475 if (ST->hasExtendedWaitCounts()) {
// One combined LOADCNT/DSCNT wait, then one wait per remaining extended
// counter; STORE_CNT is skipped as well.
2476 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2477 .addImm(0);
2478 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2479 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2480 continue;
2481
2482 BuildMI(EntryBB, I, DebugLoc(),
2483 TII->get(instrsForExtendedCounterTypes[CT]))
2484 .addImm(0);
2485 }
2486 } else {
2487 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2488 }
2489
// Seed the entry block with the function-entry bracket state.
2490 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2491 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2492 SmemAccessCounter);
2493 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2494 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2495
2496 Modified = true;
2497 }
2498
2499 // Keep iterating over the blocks in reverse post order, inserting and
2500 // updating s_waitcnt where needed, until a fix point is reached.
2502 BlockInfos.insert({MBB, BlockInfo()});
2503
// Scratch bracket state reused across blocks; it can be moved into a successor
// below, which is why each use re-checks it for null.
2504 std::unique_ptr<WaitcntBrackets> Brackets;
2505 bool Repeat;
2506 do {
2507 Repeat = false;
2508
2509 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2510 ++BII) {
2511 MachineBasicBlock *MBB = BII->first;
2512 BlockInfo &BI = BII->second;
2513 if (!BI.Dirty)
2514 continue;
2515
// Start from the block's incoming state if it has one, else from a fresh one.
2516 if (BI.Incoming) {
2517 if (!Brackets)
2518 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2519 else
2520 *Brackets = *BI.Incoming;
2521 } else {
2522 if (!Brackets)
2523 Brackets = std::make_unique<WaitcntBrackets>(
2524 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2525 SmemAccessCounter);
2526 else
2527 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2528 WaitEventMaskForInst, SmemAccessCounter);
2529 }
2530
2531 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2532 BI.Dirty = false;
2533
// Propagate the outgoing state to successors only if something is pending.
2534 if (Brackets->hasPendingEvent()) {
2535 BlockInfo *MoveBracketsToSucc = nullptr;
2536 for (MachineBasicBlock *Succ : MBB->successors()) {
2537 auto *SuccBII = BlockInfos.find(Succ);
2538 BlockInfo &SuccBI = SuccBII->second;
2539 if (!SuccBI.Incoming) {
2540 SuccBI.Dirty = true;
// A successor at or before the current position will not be reached again in
// this sweep, so another sweep is requested.
2541 if (SuccBII <= BII)
2542 Repeat = true;
// The first successor without incoming state receives Brackets by move (after
// the loop); any further such successors receive copies.
2543 if (!MoveBracketsToSucc) {
2544 MoveBracketsToSucc = &SuccBI;
2545 } else {
2546 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2547 }
2548 } else if (SuccBI.Incoming->merge(*Brackets)) {
2549 SuccBI.Dirty = true;
2550 if (SuccBII <= BII)
2551 Repeat = true;
2552 }
2553 }
2554 if (MoveBracketsToSucc)
2555 MoveBracketsToSucc->Incoming = std::move(Brackets);
2556 }
2557 }
2558 } while (Repeat);
2559
2560 if (ST->hasScalarStores()) {
2562 bool HaveScalarStores = false;
2563
// Collect blocks containing S_ENDPGM / SI_RETURN_TO_EPILOG and note whether
// the function has any scalar store.
2564 for (MachineBasicBlock &MBB : MF) {
2565 for (MachineInstr &MI : MBB) {
2566 if (!HaveScalarStores && TII->isScalarStore(MI))
2567 HaveScalarStores = true;
2568
2569 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2570 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2571 EndPgmBlocks.push_back(&MBB);
2572 }
2573 }
2574
2575 if (HaveScalarStores) {
2576 // If scalar writes are used, the cache must be flushed or else the next
2577 // wave to reuse the same scratch memory can be clobbered.
2578 //
2579 // Insert s_dcache_wb at wave termination points if there were any scalar
2580 // stores, and only if the cache hasn't already been flushed. This could
2581 // be improved by looking across blocks for flushes in postdominating
2582 // blocks from the stores but an explicitly requested flush is probably
2583 // very rare.
2584 for (MachineBasicBlock *MBB : EndPgmBlocks) {
// True while the most recent of {S_DCACHE_WB, scalar store} seen in this block
// is an S_DCACHE_WB.
2585 bool SeenDCacheWB = false;
2586
2587 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2588 I != E; ++I) {
2589 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2590 SeenDCacheWB = true;
2591 else if (TII->isScalarStore(*I))
2592 SeenDCacheWB = false;
2593
2594 // FIXME: It would be better to insert this before a waitcnt if any.
2595 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2596 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2597 !SeenDCacheWB) {
2598 Modified = true;
2599 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2600 }
2601 }
2602 }
2603 }
2604 }
2605
2606 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2607 // instructions.
2608 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
2609 // waveslot limited kernel runs slower with the deallocation.
2610 if (!ReleaseVGPRInsts.empty() &&
2611 (MF.getFrameInfo().hasCalls() ||
2612 ST->getOccupancyWithNumVGPRs(
2613 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2615 for (MachineInstr *MI : ReleaseVGPRInsts) {
// Some subtargets need an S_NOP ahead of the message.
2616 if (ST->requiresNopBeforeDeallocVGPRs()) {
2617 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2618 TII->get(AMDGPU::S_NOP))
2619 .addImm(0);
2620 }
2621 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2622 TII->get(AMDGPU::S_SENDMSG))
2624 Modified = true;
2625 }
2626 }
// Reset per-function state.
2627 ReleaseVGPRInsts.clear();
2628 PreheadersToFlush.clear();
2629 SLoadAddresses.clear();
2630
2631 return Modified;
2632}
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1315
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:235
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:57
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
static bool isCacheInvOrWBInst(MachineInstr &Inst)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
SI Insert Waitcnts
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:256
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
Definition: DebugCounter.h:96
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:310
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:575
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:347
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:956
iterator_range< filtered_mop_iterator > all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
Definition: MachineInstr.h:772
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:728
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:788
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:499
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:585
iterator_range< filtered_mop_iterator > all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
Definition: MachineInstr.h:762
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
iterator begin()
Definition: MapVector.h:69
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
void clear()
Definition: MapVector.h:88
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:438
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:642
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:655
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:697
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:594
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:839
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:576
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:634
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:602
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:679
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:971
static bool isVINTERP(const MachineInstr &MI)
Definition: SIInstrInfo.h:847
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:586
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
static bool isAtomicNoRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:671
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
self_iterator getIterator()
Definition: ilist_node.h:132
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
MCRegister getMCReg(MCRegister Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
unsigned getStorecntBitMask(const IsaVersion &Version)
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition: Sequence.h:337
@ Wait
Definition: Threading.h:61
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
char & SIInsertWaitcntsID
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
@ None
Definition: CodeGenData.h:106
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Definition: TargetParser.h:130
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable
Definition: Sequence.h:100