LLVM 19.0.0git
SIInsertWaitcnts.cpp
Go to the documentation of this file.
1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
40using namespace llvm;
41
42#define DEBUG_TYPE "si-insert-waitcnts"
43
44DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
45 "Force emit s_waitcnt expcnt(0) instrs");
46DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
47 "Force emit s_waitcnt lgkmcnt(0) instrs");
48DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
49 "Force emit s_waitcnt vmcnt(0) instrs");
50
52 "amdgpu-waitcnt-forcezero",
53 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
54 cl::init(false), cl::Hidden);
55
56namespace {
57// Class of object that encapsulates latest instruction counter score
58// associated with the operand. Used for determining whether
59// s_waitcnt instruction needs to be emitted.
60
// Wait counters tracked by this pass. The enumerators are used directly as
// array indices (score bounds, per-counter event masks), so their order and
// values matter. The first NUM_NORMAL_INST_CNTS entries are the "normal"
// counters; the extended counters that follow are gfx12+ only.
enum InstCounterType {
  // Normal counters.
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,      // Expcnt.
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,

  // Extended counters (gfx12+ only).
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS,
  BVH_CNT,
  KM_CNT,
  NUM_EXTENDED_INST_CNTS,

  // Total number of counters; used to size per-counter tables.
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};
73} // namespace
74
75namespace llvm {
76template <> struct enum_iteration_traits<InstCounterType> {
77 static constexpr bool is_iterable = true;
78};
79} // namespace llvm
80
81namespace {
82// Return an iterator over all counters between LOAD_CNT (the first counter)
83// and \c MaxCounter (exclusive, default value yields an enumeration over
84// all counters).
85auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
86 return enum_seq(LOAD_CNT, MaxCounter);
87}
88
// Range of scoring-table slot indices occupied by a register operand. Users
// in this file iterate it as [first, second); {-1, -1} denotes an operand
// that is not tracked.
using RegInterval = std::pair<int, int>;
90
// Per-counter maximum values, as returned by
// WaitcntBrackets::getWaitCountMax(). Member order is significant for
// aggregate initialization.
struct HardwareLimits {
  // Corresponds to VMcnt prior to gfx12.
  unsigned LoadcntMax;
  unsigned ExpcntMax;
  // Corresponds to LGKMcnt prior to gfx12.
  unsigned DscntMax;
  // Corresponds to VScnt in gfx10/gfx11.
  unsigned StorecntMax;
  // The remaining limits are gfx12+ only.
  unsigned SamplecntMax;
  unsigned BvhcntMax;
  unsigned KmcntMax;
};
100
// Hardware encoding values bounding the VGPR and SGPR ranges. getRegInterval()
// subtracts VGPR0 / SGPR0 from an operand's encoding to obtain a zero-based
// slot index, and asserts that VGPR encodings lie in [VGPR0, VGPRL].
// NOTE(review): SGPRL is presumably the last SGPR encoding; it is not used in
// the visible code.
struct RegisterEncoding {
  unsigned VGPR0;
  unsigned VGPRL;
  unsigned SGPR0;
  unsigned SGPRL;
};
107
// Kinds of events that can bump a wait counter. Each enumerator is used as a
// bit position (1 << E) in the pending-event and per-counter event masks, so
// the values must stay small and contiguous.
enum WaitEventType {
  // Vector-memory events.
  VMEM_ACCESS,              // vector-memory read & write
  VMEM_READ_ACCESS,         // vector-memory read
  VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
  VMEM_BVH_READ_ACCESS,     // vector-memory BVH read (gfx12+ only)
  VMEM_WRITE_ACCESS,        // vector-memory write that is not scratch
  SCRATCH_WRITE_ACCESS,     // vector-memory write that may be scratch

  // LDS / GDS / message / scalar-memory events.
  LDS_ACCESS,  // lds read & write
  GDS_ACCESS,  // gds read & write
  SQ_MESSAGE,  // send message
  SMEM_ACCESS, // scalar-memory read & write

  // Export-counter events.
  EXP_GPR_LOCK,     // export holding on its data src
  GDS_GPR_LOCK,     // GDS holding on its data and addr src
  EXP_POS_ACCESS,   // write to export position
  EXP_PARAM_ACCESS, // write to export parameter
  VMW_GPR_LOCK,     // vector-memory write holding on its data src
  EXP_LDS_ACCESS,   // read by ldsdir counting as export

  NUM_WAIT_EVENTS,
};
127
// Layout of the scoring-table slot indices:
//   0 .. SQ_MAX_PGM_VGPRS-1                            real VGPRs
//   SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                extra VGPR-like slots
//   NUM_ALL_VGPRS .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1  real SGPRs
// A fixed number of VGPR-like slots is reserved in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_VGPRS = 512,
  // Maximum programmable ArchVGPRs across all targets; AGPR slots start here.
  AGPR_OFFSET = 256,
  // Maximum programmable SGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256,
  // Reserved slots for DS.
  NUM_EXTRA_VGPRS = 9,
  // Artificial register slots track LDS writes into specific LDS locations
  // when a location is known. If the slots are exhausted or the location is
  // unknown, the first slot is used. The first slot is also always updated in
  // addition to the known location's slot, so that waits are generated
  // correctly when a dependent instruction's location is unknown.
  EXTRA_VGPR_LDS = 0,
  // First slot index used for SGPRs.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS,
};
147
// Categories of result-returning VMEM operations. s_waitcnt orders all of
// them with a single vmcnt counter, but without an s_waitcnt only
// instructions of the same VmemType are guaranteed to write their results in
// order. Consequently no s_waitcnt is needed between two instructions of the
// same type that write the same vgpr. Values are used as bit positions in the
// per-VGPR type mask.
enum VmemType {
  VMEM_NOSAMPLER, // BUF instructions and MIMG instructions without a sampler.
  VMEM_SAMPLER,   // MIMG instructions with a sampler.
  VMEM_BVH,       // BVH instructions.
  NUM_VMEM_TYPES
};
162
163// Maps values of InstCounterType to the instruction that waits on that
164// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
165// returns true.
166static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
167 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
168 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
169 AMDGPU::S_WAIT_KMCNT};
170
171static bool updateVMCntOnly(const MachineInstr &Inst) {
172 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
174}
175
176#ifndef NDEBUG
177static bool isNormalMode(InstCounterType MaxCounter) {
178 return MaxCounter == NUM_NORMAL_INST_CNTS;
179}
180#endif // NDEBUG
181
182VmemType getVmemType(const MachineInstr &Inst) {
183 assert(updateVMCntOnly(Inst));
184 if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
186 return VMEM_NOSAMPLER;
188 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
190 return BaseInfo->BVH ? VMEM_BVH
191 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
192}
193
194unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
195 switch (T) {
196 case LOAD_CNT:
197 return Wait.LoadCnt;
198 case EXP_CNT:
199 return Wait.ExpCnt;
200 case DS_CNT:
201 return Wait.DsCnt;
202 case STORE_CNT:
203 return Wait.StoreCnt;
204 case SAMPLE_CNT:
205 return Wait.SampleCnt;
206 case BVH_CNT:
207 return Wait.BvhCnt;
208 case KM_CNT:
209 return Wait.KmCnt;
210 default:
211 llvm_unreachable("bad InstCounterType");
212 }
213}
214
215void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
216 unsigned &WC = getCounterRef(Wait, T);
217 WC = std::min(WC, Count);
218}
219
220void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
221 getCounterRef(Wait, T) = ~0u;
222}
223
224unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
225 return getCounterRef(Wait, T);
226}
227
228// Mapping from event to counter according to the table masks.
229InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
230 for (auto T : inst_counter_types()) {
231 if (masks[T] & (1 << E))
232 return T;
233 }
234 llvm_unreachable("event type has no associated counter");
235}
236
237// This object maintains the current score brackets of each wait counter, and
238// a per-register scoreboard for each wait counter.
239//
240// We also maintain the latest score for every event type that can change the
241// waitcnt in order to know if there are multiple types of events within
242// the brackets. When multiple types of event happen in the bracket,
243// wait count may get decreased out of order, therefore we need to put in
244// "s_waitcnt 0" before use.
245class WaitcntBrackets {
246public:
247 WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
248 HardwareLimits Limits, RegisterEncoding Encoding,
249 const unsigned *WaitEventMaskForInst,
250 InstCounterType SmemAccessCounter)
251 : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
252 Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
253 SmemAccessCounter(SmemAccessCounter) {}
254
255 unsigned getWaitCountMax(InstCounterType T) const {
256 switch (T) {
257 case LOAD_CNT:
258 return Limits.LoadcntMax;
259 case DS_CNT:
260 return Limits.DscntMax;
261 case EXP_CNT:
262 return Limits.ExpcntMax;
263 case STORE_CNT:
264 return Limits.StorecntMax;
265 case SAMPLE_CNT:
266 return Limits.SamplecntMax;
267 case BVH_CNT:
268 return Limits.BvhcntMax;
269 case KM_CNT:
270 return Limits.KmcntMax;
271 default:
272 break;
273 }
274 return 0;
275 }
276
277 unsigned getScoreLB(InstCounterType T) const {
278 assert(T < NUM_INST_CNTS);
279 return ScoreLBs[T];
280 }
281
282 unsigned getScoreUB(InstCounterType T) const {
283 assert(T < NUM_INST_CNTS);
284 return ScoreUBs[T];
285 }
286
287 unsigned getScoreRange(InstCounterType T) const {
288 return getScoreUB(T) - getScoreLB(T);
289 }
290
291 unsigned getRegScore(int GprNo, InstCounterType T) const {
292 if (GprNo < NUM_ALL_VGPRS) {
293 return VgprScores[T][GprNo];
294 }
295 assert(T == SmemAccessCounter);
296 return SgprScores[GprNo - NUM_ALL_VGPRS];
297 }
298
299 bool merge(const WaitcntBrackets &Other);
300
301 RegInterval getRegInterval(const MachineInstr *MI,
303 const SIRegisterInfo *TRI, unsigned OpNo) const;
304
305 bool counterOutOfOrder(InstCounterType T) const;
306 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
307 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
308 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
309 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
310 void applyWaitcnt(InstCounterType T, unsigned Count);
311 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
312 const MachineRegisterInfo *MRI, WaitEventType E,
314
315 unsigned hasPendingEvent() const { return PendingEvents; }
316 unsigned hasPendingEvent(WaitEventType E) const {
317 return PendingEvents & (1 << E);
318 }
319 unsigned hasPendingEvent(InstCounterType T) const {
320 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
321 assert((HasPending != 0) == (getScoreRange(T) != 0));
322 return HasPending;
323 }
324
325 bool hasMixedPendingEvents(InstCounterType T) const {
326 unsigned Events = hasPendingEvent(T);
327 // Return true if more than one bit is set in Events.
328 return Events & (Events - 1);
329 }
330
331 bool hasPendingFlat() const {
332 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
333 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
334 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
335 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
336 }
337
338 void setPendingFlat() {
339 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
340 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
341 }
342
343 // Return true if there might be pending writes to the specified vgpr by VMEM
344 // instructions with types different from V.
345 bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
346 assert(GprNo < NUM_ALL_VGPRS);
347 return VgprVmemTypes[GprNo] & ~(1 << V);
348 }
349
350 void clearVgprVmemTypes(int GprNo) {
351 assert(GprNo < NUM_ALL_VGPRS);
352 VgprVmemTypes[GprNo] = 0;
353 }
354
355 void setStateOnFunctionEntryOrReturn() {
356 setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
357 PendingEvents |= WaitEventMaskForInst[STORE_CNT];
358 }
359
360 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
361 return LDSDMAStores;
362 }
363
364 void print(raw_ostream &);
365 void dump() { print(dbgs()); }
366
367private:
368 struct MergeInfo {
369 unsigned OldLB;
370 unsigned OtherLB;
371 unsigned MyShift;
372 unsigned OtherShift;
373 };
374 static bool mergeScore(const MergeInfo &M, unsigned &Score,
375 unsigned OtherScore);
376
377 void setScoreLB(InstCounterType T, unsigned Val) {
378 assert(T < NUM_INST_CNTS);
379 ScoreLBs[T] = Val;
380 }
381
382 void setScoreUB(InstCounterType T, unsigned Val) {
383 assert(T < NUM_INST_CNTS);
384 ScoreUBs[T] = Val;
385
386 if (T != EXP_CNT)
387 return;
388
389 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
390 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
391 }
392
393 void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
394 if (GprNo < NUM_ALL_VGPRS) {
395 VgprUB = std::max(VgprUB, GprNo);
396 VgprScores[T][GprNo] = Val;
397 } else {
398 assert(T == SmemAccessCounter);
399 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
400 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
401 }
402 }
403
404 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
406 unsigned OpNo, unsigned Val);
407
408 const GCNSubtarget *ST = nullptr;
409 InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
410 HardwareLimits Limits = {};
411 RegisterEncoding Encoding = {};
412 const unsigned *WaitEventMaskForInst;
413 InstCounterType SmemAccessCounter;
414 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
415 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
416 unsigned PendingEvents = 0;
417 // Remember the last flat memory operation.
418 unsigned LastFlat[NUM_INST_CNTS] = {0};
419 // wait_cnt scores for every vgpr.
420 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
421 int VgprUB = -1;
422 int SgprUB = -1;
423 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
424 // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
425 // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
426 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
427 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
428 // write to each vgpr.
429 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
430 // Store representative LDS DMA operations. The only useful info here is
431 // alias info. One store is kept per unique AAInfo.
432 SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
433};
434
435// This abstracts the logic for generating and updating S_WAIT* instructions
436// away from the analysis that determines where they are needed. This was
437// done because the set of counters and instructions for waiting on them
438// underwent a major shift with gfx12, sufficiently so that having this
439// abstraction allows the main analysis logic to be simpler than it would
440// otherwise have had to become.
441class WaitcntGenerator {
442protected:
443 const GCNSubtarget *ST = nullptr;
444 const SIInstrInfo *TII = nullptr;
446 InstCounterType MaxCounter;
447
448public:
449 WaitcntGenerator() {}
450 WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
451 : ST(ST), TII(ST->getInstrInfo()),
452 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
453
454 // Edits an existing sequence of wait count instructions according
455 // to an incoming Waitcnt value, which is itself updated to reflect
456 // any new wait count instructions which may need to be generated by
457 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
458 // were made.
459 //
460 // This editing will usually be merely updated operands, but it may also
461 // delete instructions if the incoming Wait value indicates they are not
462 // needed. It may also remove existing instructions for which a wait
463 // is needed if it can be determined that it is better to generate new
464 // instructions later, as can happen on gfx12.
465 virtual bool
466 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
467 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
469
470 // Transform a soft waitcnt into a normal one.
471 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
472
473 // Generates new wait count instructions according to the value of
474 // Wait, returning true if any new instructions were created.
475 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
478
479 // Returns an array of bit masks which can be used to map values in
480 // WaitEventType to corresponding counter values in InstCounterType.
481 virtual const unsigned *getWaitEventMask() const = 0;
482
483 // Returns a new waitcnt with all counters except VScnt set to 0. If
484 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
485 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
486
487 virtual ~WaitcntGenerator() = default;
488
489 // Create a mask value from the initializer list of wait event types.
490 static constexpr unsigned
491 eventMask(std::initializer_list<WaitEventType> Events) {
492 unsigned Mask = 0;
493 for (auto &E : Events)
494 Mask |= 1 << E;
495
496 return Mask;
497 }
498};
499
500class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
501public:
502 WaitcntGeneratorPreGFX12() {}
503 WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
504 : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
505
506 bool
507 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
508 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
509 MachineBasicBlock::instr_iterator It) const override;
510
511 bool createNewWaitcnt(MachineBasicBlock &Block,
513 AMDGPU::Waitcnt Wait) override;
514
515 const unsigned *getWaitEventMask() const override {
516 assert(ST);
517
518 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
519 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
520 VMEM_BVH_READ_ACCESS}),
521 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
522 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
523 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
524 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
525 0,
526 0,
527 0};
528
529 return WaitEventMaskForInstPreGFX12;
530 }
531
532 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
533};
534
535class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
536public:
537 WaitcntGeneratorGFX12Plus() {}
538 WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
539 : WaitcntGenerator(ST, MaxCounter) {}
540
541 bool
542 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
543 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
544 MachineBasicBlock::instr_iterator It) const override;
545
546 bool createNewWaitcnt(MachineBasicBlock &Block,
548 AMDGPU::Waitcnt Wait) override;
549
550 const unsigned *getWaitEventMask() const override {
551 assert(ST);
552
553 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
554 eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
555 eventMask({LDS_ACCESS, GDS_ACCESS}),
556 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
557 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
558 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
559 eventMask({VMEM_SAMPLER_READ_ACCESS}),
560 eventMask({VMEM_BVH_READ_ACCESS}),
561 eventMask({SMEM_ACCESS, SQ_MESSAGE})};
562
563 return WaitEventMaskForInstGFX12Plus;
564 }
565
566 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
567};
568
569class SIInsertWaitcnts : public MachineFunctionPass {
570private:
571 const GCNSubtarget *ST = nullptr;
572 const SIInstrInfo *TII = nullptr;
573 const SIRegisterInfo *TRI = nullptr;
574 const MachineRegisterInfo *MRI = nullptr;
575
577 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
578 MachineLoopInfo *MLI;
580 AliasAnalysis *AA = nullptr;
581
582 struct BlockInfo {
583 std::unique_ptr<WaitcntBrackets> Incoming;
584 bool Dirty = true;
585 };
586
587 InstCounterType SmemAccessCounter;
588
590
591 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
592 // because of amdgpu-waitcnt-forcezero flag
593 bool ForceEmitZeroWaitcnts;
594 bool ForceEmitWaitcnt[NUM_INST_CNTS];
595
596 bool OptNone;
597
598 // In any given run of this pass, WCG will point to one of these two
599 // generator objects, which must have been re-initialised before use
600 // from a value made using a subtarget constructor.
601 WaitcntGeneratorPreGFX12 WCGPreGFX12;
602 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
603
604 WaitcntGenerator *WCG = nullptr;
605
606 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
607 // message.
608 DenseSet<MachineInstr *> ReleaseVGPRInsts;
609
610 InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
611
612public:
613 static char ID;
614
615 SIInsertWaitcnts() : MachineFunctionPass(ID) {
616 (void)ForceExpCounter;
617 (void)ForceLgkmCounter;
618 (void)ForceVMCounter;
619 }
620
621 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
622 bool isPreheaderToFlush(MachineBasicBlock &MBB,
623 WaitcntBrackets &ScoreBrackets);
624 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
625 bool runOnMachineFunction(MachineFunction &MF) override;
626
627 StringRef getPassName() const override {
628 return "SI insert wait instructions";
629 }
630
631 void getAnalysisUsage(AnalysisUsage &AU) const override {
632 AU.setPreservesCFG();
638 }
639
640 bool isForceEmitWaitcnt() const {
641 for (auto T : inst_counter_types())
642 if (ForceEmitWaitcnt[T])
643 return true;
644 return false;
645 }
646
647 void setForceEmitWaitcnt() {
648// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
649// For debug builds, get the debug counter info and adjust if need be
650#ifndef NDEBUG
651 if (DebugCounter::isCounterSet(ForceExpCounter) &&
652 DebugCounter::shouldExecute(ForceExpCounter)) {
653 ForceEmitWaitcnt[EXP_CNT] = true;
654 } else {
655 ForceEmitWaitcnt[EXP_CNT] = false;
656 }
657
658 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
659 DebugCounter::shouldExecute(ForceLgkmCounter)) {
660 ForceEmitWaitcnt[DS_CNT] = true;
661 ForceEmitWaitcnt[KM_CNT] = true;
662 } else {
663 ForceEmitWaitcnt[DS_CNT] = false;
664 ForceEmitWaitcnt[KM_CNT] = false;
665 }
666
667 if (DebugCounter::isCounterSet(ForceVMCounter) &&
668 DebugCounter::shouldExecute(ForceVMCounter)) {
669 ForceEmitWaitcnt[LOAD_CNT] = true;
670 ForceEmitWaitcnt[SAMPLE_CNT] = true;
671 ForceEmitWaitcnt[BVH_CNT] = true;
672 } else {
673 ForceEmitWaitcnt[LOAD_CNT] = false;
674 ForceEmitWaitcnt[SAMPLE_CNT] = false;
675 ForceEmitWaitcnt[BVH_CNT] = false;
676 }
677#endif // NDEBUG
678 }
679
680 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
681 // FLAT instruction.
682 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
683 // Maps VMEM access types to their corresponding WaitEventType.
684 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
685 VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
686
688 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
689 // these should use VM_CNT.
690 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
691 return VMEM_ACCESS;
692 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
693 // FLAT and SCRATCH instructions may access scratch. Other VMEM
694 // instructions do not.
695 if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
696 return SCRATCH_WRITE_ACCESS;
697 return VMEM_WRITE_ACCESS;
698 }
699 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
700 return VMEM_READ_ACCESS;
701 return VmemReadMapping[getVmemType(Inst)];
702 }
703
704 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
705 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
706 bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
707 bool generateWaitcntInstBefore(MachineInstr &MI,
708 WaitcntBrackets &ScoreBrackets,
709 MachineInstr *OldWaitcntInstr,
710 bool FlushVmCnt);
711 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
713 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
714 MachineInstr *OldWaitcntInstr);
715 void updateEventWaitcntAfter(MachineInstr &Inst,
716 WaitcntBrackets *ScoreBrackets);
717 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
718 WaitcntBrackets &ScoreBrackets);
719};
720
721} // end anonymous namespace
722
723RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
725 const SIRegisterInfo *TRI,
726 unsigned OpNo) const {
727 const MachineOperand &Op = MI->getOperand(OpNo);
728 if (!TRI->isInAllocatableClass(Op.getReg()))
729 return {-1, -1};
730
731 // A use via a PW operand does not need a waitcnt.
732 // A partial write is not a WAW.
733 assert(!Op.getSubReg() || !Op.isUndef());
734
735 RegInterval Result;
736
737 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
739
740 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
741 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
742 Result.first = Reg - Encoding.VGPR0;
743 if (TRI->isAGPR(*MRI, Op.getReg()))
744 Result.first += AGPR_OFFSET;
745 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
746 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
747 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
748 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
749 assert(Result.first >= NUM_ALL_VGPRS &&
750 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
751 }
752 // TODO: Handle TTMP
753 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
754 else
755 return {-1, -1};
756
757 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
758 unsigned Size = TRI->getRegSizeInBits(*RC);
759 Result.second = Result.first + ((Size + 16) / 32);
760
761 return Result;
762}
763
764void WaitcntBrackets::setExpScore(const MachineInstr *MI,
765 const SIInstrInfo *TII,
766 const SIRegisterInfo *TRI,
767 const MachineRegisterInfo *MRI, unsigned OpNo,
768 unsigned Val) {
769 RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
770 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
771 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
772 setRegScore(RegNo, EXP_CNT, Val);
773 }
774}
775
776void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
777 const SIRegisterInfo *TRI,
779 WaitEventType E, MachineInstr &Inst) {
780 InstCounterType T = eventCounter(WaitEventMaskForInst, E);
781
782 unsigned UB = getScoreUB(T);
783 unsigned CurrScore = UB + 1;
784 if (CurrScore == 0)
785 report_fatal_error("InsertWaitcnt score wraparound");
786 // PendingEvents and ScoreUB need to be update regardless if this event
787 // changes the score of a register or not.
788 // Examples including vm_cnt when buffer-store or lgkm_cnt when send-message.
789 PendingEvents |= 1 << E;
790 setScoreUB(T, CurrScore);
791
792 if (T == EXP_CNT) {
793 // Put score on the source vgprs. If this is a store, just use those
794 // specific register(s).
795 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
796 int AddrOpIdx =
797 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
798 // All GDS operations must protect their address register (same as
799 // export.)
800 if (AddrOpIdx != -1) {
801 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
802 }
803
804 if (Inst.mayStore()) {
805 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
806 setExpScore(
807 &Inst, TII, TRI, MRI,
808 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
809 CurrScore);
810 }
811 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
812 setExpScore(&Inst, TII, TRI, MRI,
814 AMDGPU::OpName::data1),
815 CurrScore);
816 }
817 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
818 Inst.getOpcode() != AMDGPU::DS_APPEND &&
819 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
820 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
821 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
822 const MachineOperand &Op = Inst.getOperand(I);
823 if (Op.isReg() && !Op.isDef() &&
824 TRI->isVectorRegister(*MRI, Op.getReg())) {
825 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
826 }
827 }
828 }
829 } else if (TII->isFLAT(Inst)) {
830 if (Inst.mayStore()) {
831 setExpScore(
832 &Inst, TII, TRI, MRI,
833 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
834 CurrScore);
835 } else if (SIInstrInfo::isAtomicRet(Inst)) {
836 setExpScore(
837 &Inst, TII, TRI, MRI,
838 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
839 CurrScore);
840 }
841 } else if (TII->isMIMG(Inst)) {
842 if (Inst.mayStore()) {
843 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
844 } else if (SIInstrInfo::isAtomicRet(Inst)) {
845 setExpScore(
846 &Inst, TII, TRI, MRI,
847 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
848 CurrScore);
849 }
850 } else if (TII->isMTBUF(Inst)) {
851 if (Inst.mayStore()) {
852 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
853 }
854 } else if (TII->isMUBUF(Inst)) {
855 if (Inst.mayStore()) {
856 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
857 } else if (SIInstrInfo::isAtomicRet(Inst)) {
858 setExpScore(
859 &Inst, TII, TRI, MRI,
860 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
861 CurrScore);
862 }
863 } else if (TII->isLDSDIR(Inst)) {
864 // LDSDIR instructions attach the score to the destination.
865 setExpScore(
866 &Inst, TII, TRI, MRI,
867 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
868 CurrScore);
869 } else {
870 if (TII->isEXP(Inst)) {
871 // For export the destination registers are really temps that
872 // can be used as the actual source after export patching, so
873 // we need to treat them like sources and set the EXP_CNT
874 // score.
875 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
876 MachineOperand &DefMO = Inst.getOperand(I);
877 if (DefMO.isReg() && DefMO.isDef() &&
878 TRI->isVGPR(*MRI, DefMO.getReg())) {
879 setRegScore(
880 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
881 EXP_CNT, CurrScore);
882 }
883 }
884 }
885 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
886 MachineOperand &MO = Inst.getOperand(I);
887 if (MO.isReg() && !MO.isDef() &&
888 TRI->isVectorRegister(*MRI, MO.getReg())) {
889 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
890 }
891 }
892 }
893#if 0 // TODO: check if this is handled by MUBUF code above.
894 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
895 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
896 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
897 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
898 unsigned OpNo;//TODO: find the OpNo for this operand;
899 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
900 for (int RegNo = Interval.first; RegNo < Interval.second;
901 ++RegNo) {
902 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
903 }
904#endif
905 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
906 // Match the score to the destination registers.
907 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
908 auto &Op = Inst.getOperand(I);
909 if (!Op.isReg() || !Op.isDef())
910 continue;
911 RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
912 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
913 if (Interval.first >= NUM_ALL_VGPRS)
914 continue;
915 if (updateVMCntOnly(Inst)) {
916 // updateVMCntOnly should only leave us with VGPRs
917 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
918 // defs. That's required for a sane index into `VgprMemTypes` below
919 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
920 VmemType V = getVmemType(Inst);
921 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
922 VgprVmemTypes[RegNo] |= 1 << V;
923 }
924 }
925 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
926 setRegScore(RegNo, T, CurrScore);
927 }
928 }
929 if (Inst.mayStore() &&
930 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
931 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
932 // written can be accessed. A load from LDS to VMEM does not need a wait.
933 unsigned Slot = 0;
934 for (const auto *MemOp : Inst.memoperands()) {
935 if (!MemOp->isStore() ||
936 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
937 continue;
938 // Comparing just AA info does not guarantee memoperands are equal
939 // in general, but this is so for LDS DMA in practice.
940 auto AAI = MemOp->getAAInfo();
941 // Alias scope information gives a way to definitely identify an
942 // original memory object and practically produced in the module LDS
943 // lowering pass. If there is no scope available we will not be able
944 // to disambiguate LDS aliasing as after the module lowering all LDS
945 // is squashed into a single big object. Do not attempt to use one of
946 // the limited LDSDMAStores for something we will not be able to use
947 // anyway.
948 if (!AAI || !AAI.Scope)
949 break;
950 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
951 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
952 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
953 Slot = I + 1;
954 break;
955 }
956 }
957 }
958 if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
959 break;
960 LDSDMAStores.push_back(&Inst);
961 Slot = LDSDMAStores.size();
962 break;
963 }
964 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
965 if (Slot)
966 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
967 }
968 }
969}
970
971void WaitcntBrackets::print(raw_ostream &OS) {
972 OS << '\n';
973 for (auto T : inst_counter_types(MaxCounter)) {
974 unsigned SR = getScoreRange(T);
975
976 switch (T) {
977 case LOAD_CNT:
978 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
979 << SR << "): ";
980 break;
981 case DS_CNT:
982 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
983 << SR << "): ";
984 break;
985 case EXP_CNT:
986 OS << " EXP_CNT(" << SR << "): ";
987 break;
988 case STORE_CNT:
989 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
990 << SR << "): ";
991 break;
992 case SAMPLE_CNT:
993 OS << " SAMPLE_CNT(" << SR << "): ";
994 break;
995 case BVH_CNT:
996 OS << " BVH_CNT(" << SR << "): ";
997 break;
998 case KM_CNT:
999 OS << " KM_CNT(" << SR << "): ";
1000 break;
1001 default:
1002 OS << " UNKNOWN(" << SR << "): ";
1003 break;
1004 }
1005
1006 if (SR != 0) {
1007 // Print vgpr scores.
1008 unsigned LB = getScoreLB(T);
1009
1010 for (int J = 0; J <= VgprUB; J++) {
1011 unsigned RegScore = getRegScore(J, T);
1012 if (RegScore <= LB)
1013 continue;
1014 unsigned RelScore = RegScore - LB - 1;
1015 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
1016 OS << RelScore << ":v" << J << " ";
1017 } else {
1018 OS << RelScore << ":ds ";
1019 }
1020 }
1021 // Also need to print sgpr scores for lgkm_cnt.
1022 if (T == SmemAccessCounter) {
1023 for (int J = 0; J <= SgprUB; J++) {
1024 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
1025 if (RegScore <= LB)
1026 continue;
1027 unsigned RelScore = RegScore - LB - 1;
1028 OS << RelScore << ":s" << J << " ";
1029 }
1030 }
1031 }
1032 OS << '\n';
1033 }
1034 OS << '\n';
1035}
1036
1037/// Simplify \p Wait in place, in the sense of removing redundant counts, by
1038/// running the per-counter simplifyWaitcnt() overload on each counter field.
/// Nothing is returned: callers inspect \p Wait afterwards to see which
/// counters still carry a wait.
1039void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
1040 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1041 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1042 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1043 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1044 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1045 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1046 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1047}
1048
1049void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1050 unsigned &Count) const {
1051 // The number of outstanding events for this type, T, can be calculated
1052 // as (UB - LB). If the current Count is greater than or equal to the number
1053 // of outstanding events, then the wait for this counter is redundant.
1054 if (Count >= getScoreRange(T))
1055 Count = ~0u;
1056}
1057
1058void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
1059 AMDGPU::Waitcnt &Wait) const {
1060 unsigned ScoreToWait = getRegScore(RegNo, T);
1061
1062 // If the score of src_operand falls within the bracket, we need an
1063 // s_waitcnt instruction.
1064 const unsigned LB = getScoreLB(T);
1065 const unsigned UB = getScoreUB(T);
1066 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1067 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1068 !ST->hasFlatLgkmVMemCountInOrder()) {
1069 // If there is a pending FLAT operation, and this is a VMem or LGKM
1070 // waitcnt and the target can report early completion, then we need
1071 // to force a waitcnt 0.
1072 addWait(Wait, T, 0);
1073 } else if (counterOutOfOrder(T)) {
1074 // Counter can get decremented out-of-order when there
1075 // are multiple types event in the bracket. Also emit an s_wait counter
1076 // with a conservative value of 0 for the counter.
1077 addWait(Wait, T, 0);
1078 } else {
1079 // If a counter has been maxed out avoid overflow by waiting for
1080 // MAX(CounterType) - 1 instead.
1081 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
1082 addWait(Wait, T, NeededWait);
1083 }
1084 }
1085}
1086
/// Record that the waits described by \p Wait have been emitted by forwarding
/// each counter field to the per-counter applyWaitcnt() overload.
1087void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1088 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1089 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1090 applyWaitcnt(DS_CNT, Wait.DsCnt);
1091 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1092 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1093 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1094 applyWaitcnt(KM_CNT, Wait.KmCnt);
1095}
1096
1097void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1098 const unsigned UB = getScoreUB(T);
1099 if (Count >= UB)
1100 return;
1101 if (Count != 0) {
1102 if (counterOutOfOrder(T))
1103 return;
1104 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1105 } else {
1106 setScoreLB(T, UB);
1107 PendingEvents &= ~WaitEventMaskForInst[T];
1108 }
1109}
1110
1111// Where there are multiple types of event in the bracket of a counter,
1112// the decrement may go out of order.
1113bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1114 // Scalar memory read always can go out of order.
1115 if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
1116 return true;
1117 return hasMixedPendingEvents(T);
1118}
1119
// Pass registration for SIInsertWaitcnts under the DEBUG_TYPE argument name
// and the description "SI Insert Waitcnts".
// NOTE(review): any dependency declarations between the BEGIN and END macros
// are not visible in this listing.
1120INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1121 false)
1124INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
1125 false)
1126
// Definition of the pass's static ID member.
1127char SIInsertWaitcnts::ID = 0;
1128
// llvm-namespace reference bound to the pass ID above.
1129char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
1130
1132 return new SIInsertWaitcnts();
1133}
1134
1136 unsigned NewEnc) {
1137 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1138 assert(OpIdx >= 0);
1139
1140 MachineOperand &MO = MI.getOperand(OpIdx);
1141
1142 if (NewEnc == MO.getImm())
1143 return false;
1144
1145 MO.setImm(NewEnc);
1146 return true;
1147}
1148
1149/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1150/// and if so, which counter it is waiting on.
1151static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1152 switch (Opcode) {
1153 case AMDGPU::S_WAIT_LOADCNT:
1154 return LOAD_CNT;
1155 case AMDGPU::S_WAIT_EXPCNT:
1156 return EXP_CNT;
1157 case AMDGPU::S_WAIT_STORECNT:
1158 return STORE_CNT;
1159 case AMDGPU::S_WAIT_SAMPLECNT:
1160 return SAMPLE_CNT;
1161 case AMDGPU::S_WAIT_BVHCNT:
1162 return BVH_CNT;
1163 case AMDGPU::S_WAIT_DSCNT:
1164 return DS_CNT;
1165 case AMDGPU::S_WAIT_KMCNT:
1166 return KM_CNT;
1167 default:
1168 return {};
1169 }
1170}
1171
1172bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1173 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1174 if (Opcode == Waitcnt->getOpcode())
1175 return false;
1176
1177 Waitcnt->setDesc(TII->get(Opcode));
1178 return true;
1179}
1180
1181/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1182/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1183/// from \p Wait that were added by previous passes. Currently this pass
1184/// conservatively assumes that these preexisting waits are required for
1185/// correctness.
///
/// At most one S_WAITCNT and one S_WAITCNT_VSCNT survive; each survivor is
/// rewritten to the combined wait, promoted to its non-soft opcode, applied to
/// \p ScoreBrackets, and the counters it covers are then cleared in \p Wait.
///
/// \returns true if any instruction was erased, rewritten or promoted.
///
/// NOTE(review): the remainder of the parameter list and the opening brace are
/// not visible in this listing.
1186bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1187 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1189 assert(ST);
1190 assert(isNormalMode(MaxCounter));
1191
1192 bool Modified = false;
 // The first S_WAITCNT / S_WAITCNT_VSCNT that is kept; later ones of the same
 // kind are folded into Wait and erased.
1193 MachineInstr *WaitcntInstr = nullptr;
1194 MachineInstr *WaitcntVsCntInstr = nullptr;
1195
 // make_early_inc_range allows the current instruction to be erased while
 // iterating.
1196 for (auto &II :
1197 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1198 if (II.isMetaInstruction())
1199 continue;
1200
 // A "soft" waitcnt is one whose opcode differs from its non-soft form.
1201 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1202 bool IsSoft = Opcode != II.getOpcode();
1203
1204 // Update required wait count. If this is a soft waitcnt (= it was added
1205 // by an earlier pass), it may be entirely removed.
1206 if (Opcode == AMDGPU::S_WAITCNT) {
1207 unsigned IEnc = II.getOperand(0).getImm();
1208 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1209 if (IsSoft)
1210 ScoreBrackets.simplifyWaitcnt(OldWait);
1211 Wait = Wait.combined(OldWait);
1212
1213 // Merge consecutive waitcnt of the same type by erasing multiples.
 // A soft waitcnt is also erased when nothing but a store-count wait remains.
1214 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
1215 II.eraseFromParent();
1216 Modified = true;
1217 } else
1218 WaitcntInstr = &II;
1219 } else {
 // Anything else in the range is expected to be an S_WAITCNT_VSCNT whose
 // register operand is SGPR_NULL.
1220 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1221 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1222
1223 unsigned OldVSCnt =
1224 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1225 if (IsSoft)
1226 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1227 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1228
1229 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
1230 II.eraseFromParent();
1231 Modified = true;
1232 } else
1233 WaitcntVsCntInstr = &II;
1234 }
1235 }
1236
1237 if (WaitcntInstr) {
 // NOTE(review): the continuation of this call (the new encoding argument) is
 // not visible in this listing.
1238 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1240 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1241
 // The kept S_WAITCNT now covers these three counters, so record them in the
 // brackets and clear them from Wait.
1242 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1243 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1244 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1245 Wait.LoadCnt = ~0u;
1246 Wait.ExpCnt = ~0u;
1247 Wait.DsCnt = ~0u;
1248
1249 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1250 ? dbgs()
1251 << "applyPreexistingWaitcnt\n"
1252 << "New Instr at block end: " << *WaitcntInstr << '\n'
1253 : dbgs() << "applyPreexistingWaitcnt\n"
1254 << "Old Instr: " << *It
1255 << "New Instr: " << *WaitcntInstr << '\n');
1256 }
1257
1258 if (WaitcntVsCntInstr) {
1259 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1260 AMDGPU::OpName::simm16, Wait.StoreCnt);
1261 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1262
 // Same bookkeeping for the store counter handled by S_WAITCNT_VSCNT.
1263 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1264 Wait.StoreCnt = ~0u;
1265
1266 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1267 ? dbgs() << "applyPreexistingWaitcnt\n"
1268 << "New Instr at block end: " << *WaitcntVsCntInstr
1269 << '\n'
1270 : dbgs() << "applyPreexistingWaitcnt\n"
1271 << "Old Instr: " << *It
1272 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1273 }
1274
1275 return Modified;
1276}
1277
1278/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1279/// required counters in \p Wait
///
/// New instructions are built in \p Block before \p It.
///
/// \returns true if at least one instruction was created.
///
/// NOTE(review): the parameter list and opening brace are not visible in this
/// listing.
1280bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1283 assert(ST);
1284 assert(isNormalMode(MaxCounter));
1285
1286 bool Modified = false;
1287 const DebugLoc &DL = Block.findDebugLoc(It);
1288
1289 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1290 // single instruction while VScnt has its own instruction.
1291 if (Wait.hasWaitExceptStoreCnt()) {
1292 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
 // SWaitInst is only read by LLVM_DEBUG, hence [[maybe_unused]].
1293 [[maybe_unused]] auto SWaitInst =
1294 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1295 Modified = true;
1296
1297 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1298 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1299 dbgs() << "New Instr: " << *SWaitInst << '\n');
1300 }
1301
1302 if (Wait.hasWaitStoreCnt()) {
 // A store-count wait is only expected on subtargets that have vscnt.
1303 assert(ST->hasVscnt());
1304
1305 [[maybe_unused]] auto SWaitInst =
1306 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1307 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1308 .addImm(Wait.StoreCnt);
1309 Modified = true;
1310
1311 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1312 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1313 dbgs() << "New Instr: " << *SWaitInst << '\n');
1314 }
1315
1316 return Modified;
1317}
1318
1320WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1321 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1322}
1323
1325WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1326 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0);
1327}
1328
1329/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1330/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1331/// were added by previous passes. Currently this pass conservatively
1332/// assumes that these preexisting waits are required for correctness.
///
/// \returns true if any instruction was erased, rewritten or promoted.
///
/// NOTE(review): the remainder of the parameter list and the opening brace are
/// not visible in this listing.
1333bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1334 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1336 assert(ST);
1337 assert(!isNormalMode(MaxCounter));
1338
1339 bool Modified = false;
 // First instruction seen of each kind: the two combined forms, plus one slot
 // per single-counter wait indexed by counter type.
1340 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1341 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1342 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1343
 // make_early_inc_range allows the current instruction to be erased while
 // iterating.
1344 for (auto &II :
1345 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1346 if (II.isMetaInstruction())
1347 continue;
1348
 // Points at the slot above that corresponds to II's kind.
1349 MachineInstr **UpdatableInstr;
1350
1351 // Update required wait count. If this is a soft waitcnt (= it was added
1352 // by an earlier pass), it may be entirely removed.
1353
1354 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1355 bool IsSoft = Opcode != II.getOpcode();
1356
1357 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1358 unsigned OldEnc =
1359 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
 // NOTE(review): the declaration of OldWait (presumably decoded from OldEnc)
 // is not visible in this listing.
1361 if (IsSoft)
1362 ScoreBrackets.simplifyWaitcnt(OldWait);
1363 Wait = Wait.combined(OldWait);
1364 UpdatableInstr = &CombinedLoadDsCntInstr;
1365 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1366 unsigned OldEnc =
1367 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
 // NOTE(review): as above, the declaration of OldWait is not visible here.
1369 if (IsSoft)
1370 ScoreBrackets.simplifyWaitcnt(OldWait);
1371 Wait = Wait.combined(OldWait);
1372 UpdatableInstr = &CombinedStoreDsCntInstr;
1373 } else {
 // Every remaining instruction in the range must be a single-counter wait.
1374 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1375 assert(CT.has_value());
1376 unsigned OldCnt =
1377 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1378 if (IsSoft)
1379 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1380 addWait(Wait, CT.value(), OldCnt);
1381 UpdatableInstr = &WaitInstrs[CT.value()];
1382 }
1383
1384 // Merge consecutive waitcnt of the same type by erasing multiples.
1385 if (!*UpdatableInstr) {
1386 *UpdatableInstr = &II;
1387 } else {
1388 II.eraseFromParent();
1389 Modified = true;
1390 }
1391 }
1392
1393 if (CombinedLoadDsCntInstr) {
1394 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1395 // to be waited for. Otherwise, let the instruction be deleted so
1396 // the appropriate single counter wait instruction can be inserted
1397 // instead, when new S_WAIT_*CNT instructions are inserted by
1398 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1399 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1400 // the loop below that deals with single counter instructions.
1401 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1402 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1403 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1404 AMDGPU::OpName::simm16, NewEnc);
1405 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1406 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1407 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1408 Wait.LoadCnt = ~0u;
1409 Wait.DsCnt = ~0u;
1410
1411 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1412 ? dbgs() << "applyPreexistingWaitcnt\n"
1413 << "New Instr at block end: "
1414 << *CombinedLoadDsCntInstr << '\n'
1415 : dbgs() << "applyPreexistingWaitcnt\n"
1416 << "Old Instr: " << *It << "New Instr: "
1417 << *CombinedLoadDsCntInstr << '\n');
1418 } else {
1419 CombinedLoadDsCntInstr->eraseFromParent();
1420 Modified = true;
1421 }
1422 }
1423
1424 if (CombinedStoreDsCntInstr) {
1425 // Similarly for S_WAIT_STORECNT_DSCNT.
1426 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1427 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1428 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1429 AMDGPU::OpName::simm16, NewEnc);
1430 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1431 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1432 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1433 Wait.StoreCnt = ~0u;
1434 Wait.DsCnt = ~0u;
1435
1436 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1437 ? dbgs() << "applyPreexistingWaitcnt\n"
1438 << "New Instr at block end: "
1439 << *CombinedStoreDsCntInstr << '\n'
1440 : dbgs() << "applyPreexistingWaitcnt\n"
1441 << "Old Instr: " << *It << "New Instr: "
1442 << *CombinedStoreDsCntInstr << '\n');
1443 } else {
1444 CombinedStoreDsCntInstr->eraseFromParent();
1445 Modified = true;
1446 }
1447 }
1448
1449 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1450 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1451 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1452 // instructions so that createNewWaitcnt() will create new combined
1453 // instructions to replace them.
1454
1455 if (Wait.DsCnt != ~0u) {
1456 // This is a vector of addresses in WaitInstrs pointing to instructions
1457 // that should be removed if they are present.
 // NOTE(review): the declaration of WaitsToErase is not visible in this
 // listing.
1459
1460 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1461 // both) need to be waited for, ensure that there are no existing
1462 // individual wait count instructions for these.
1463
1464 if (Wait.LoadCnt != ~0u) {
1465 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1466 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1467 } else if (Wait.StoreCnt != ~0u) {
1468 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1469 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1470 }
1471
1472 for (MachineInstr **WI : WaitsToErase) {
1473 if (!*WI)
1474 continue;
1475
 // Null the slot so the single-counter loop below skips the erased
 // instruction.
1476 (*WI)->eraseFromParent();
1477 *WI = nullptr;
1478 Modified = true;
1479 }
1480 }
1481
 // Remaining single-counter waits: rewrite the ones still needed, erase the
 // ones whose counter no longer requires a wait.
1482 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1483 if (!WaitInstrs[CT])
1484 continue;
1485
1486 unsigned NewCnt = getWait(Wait, CT);
1487 if (NewCnt != ~0u) {
1488 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1489 AMDGPU::OpName::simm16, NewCnt);
1490 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1491
1492 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1493 setNoWait(Wait, CT);
1494
1495 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1496 ? dbgs() << "applyPreexistingWaitcnt\n"
1497 << "New Instr at block end: " << *WaitInstrs[CT]
1498 << '\n'
1499 : dbgs() << "applyPreexistingWaitcnt\n"
1500 << "Old Instr: " << *It
1501 << "New Instr: " << *WaitInstrs[CT] << '\n');
1502 } else {
1503 WaitInstrs[CT]->eraseFromParent();
1504 Modified = true;
1505 }
1506 }
1507
1508 return Modified;
1509}
1510
1511/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
///
/// A combined S_WAIT_LOADCNT_DSCNT or S_WAIT_STORECNT_DSCNT is preferred when
/// DsCnt and the matching second counter both need a wait; every counter still
/// set afterwards gets its own single-counter instruction. New instructions
/// are built in \p Block before \p It.
///
/// \returns true if at least one instruction was created.
///
/// NOTE(review): the parameter list and opening brace are not visible in this
/// listing.
1512bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1515 assert(ST);
1516 assert(!isNormalMode(MaxCounter));
1517
1518 bool Modified = false;
1519 const DebugLoc &DL = Block.findDebugLoc(It);
1520
1521 // Check for opportunities to use combined wait instructions.
1522 if (Wait.DsCnt != ~0u) {
1523 MachineInstr *SWaitInst = nullptr;
1524
 // LoadCnt takes priority over StoreCnt for pairing with DsCnt.
1525 if (Wait.LoadCnt != ~0u) {
1526 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1527
1528 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1529 .addImm(Enc);
1530
 // Both counters are now covered; clear them so the loop below skips them.
1531 Wait.LoadCnt = ~0u;
1532 Wait.DsCnt = ~0u;
1533 } else if (Wait.StoreCnt != ~0u) {
1534 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1535
1536 SWaitInst =
1537 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1538 .addImm(Enc);
1539
1540 Wait.StoreCnt = ~0u;
1541 Wait.DsCnt = ~0u;
1542 }
1543
1544 if (SWaitInst) {
1545 Modified = true;
1546
1547 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1548 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1549 dbgs() << "New Instr: " << *SWaitInst << '\n');
1550 }
1551 }
1552
1553 // Generate an instruction for any remaining counter that needs
1554 // waiting for.
1555
1556 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1557 unsigned Count = getWait(Wait, CT);
 // ~0u marks a counter that needs no wait.
1558 if (Count == ~0u)
1559 continue;
1560
 // SWaitInst is only read by LLVM_DEBUG, hence [[maybe_unused]].
1561 [[maybe_unused]] auto SWaitInst =
1562 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1563 .addImm(Count);
1564
1565 Modified = true;
1566
1567 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1568 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1569 dbgs() << "New Instr: " << *SWaitInst << '\n');
1570 }
1571
1572 return Modified;
1573}
1574
1575static bool readsVCCZ(const MachineInstr &MI) {
1576 unsigned Opc = MI.getOpcode();
1577 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
1578 !MI.getOperand(1).isUndef();
1579}
1580
1581/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1583 // Currently all conventions wait, but this may not always be the case.
1584 //
1585 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1586 // sense to omit the wait and do it in the caller.
1587 return true;
1588}
1589
1590/// \returns true if the callee is expected to wait for any outstanding waits
1591/// before returning.
1593 return true;
1594}
1595
1596/// Generate s_waitcnt instruction to be placed before cur_Inst.
1597/// Instructions of a given type are returned in order,
1598/// but instructions of different types can complete out of order.
1599/// We rely on this in-order completion
1600/// and simply assign a score to the memory access instructions.
1601/// We keep track of the active "score bracket" to determine
1602/// if an access of a memory read requires an s_waitcnt
1603/// and if so what the value of each counter is.
1604/// The "score bracket" is bound by the lower bound and upper bound
1605/// scores (*_score_LB and *_score_ub respectively).
1606/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1607/// flush the vmcnt counter here.
1608bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1609 WaitcntBrackets &ScoreBrackets,
1610 MachineInstr *OldWaitcntInstr,
1611 bool FlushVmCnt) {
1612 setForceEmitWaitcnt();
1613
1614 if (MI.isMetaInstruction())
1615 return false;
1616
1618
1619 // FIXME: This should have already been handled by the memory legalizer.
1620 // Removing this currently doesn't affect any lit tests, but we need to
1621 // verify that nothing was relying on this. The number of buffer invalidates
1622 // being handled here should not be expanded.
1623 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1624 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1625 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1626 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1627 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1628 Wait.LoadCnt = 0;
1629 }
1630
1631 // All waits must be resolved at call return.
1632 // NOTE: this could be improved with knowledge of all call sites or
1633 // with knowledge of the called routines.
1634 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1635 MI.getOpcode() == AMDGPU::SI_RETURN ||
1636 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1637 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1638 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1639 }
1640 // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1641 // stores. In this case it can be useful to send a message to explicitly
1642 // release all VGPRs before the stores have completed, but it is only safe to
1643 // do this if:
1644 // * there are no outstanding scratch stores
1645 // * we are not in Dynamic VGPR mode
1646 else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1647 MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1648 if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
1649 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1650 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1651 ReleaseVGPRInsts.insert(&MI);
1652 }
1653 // Resolve vm waits before gs-done.
1654 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1655 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1656 ST->hasLegacyGeometry() &&
1657 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1659 Wait.LoadCnt = 0;
1660 }
1661#if 0 // TODO: the following blocks of logic when we have fence.
1662 else if (MI.getOpcode() == SC_FENCE) {
1663 const unsigned int group_size =
1664 context->shader_info->GetMaxThreadGroupSize();
1665 // group_size == 0 means thread group size is unknown at compile time
1666 const bool group_is_multi_wave =
1667 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1668 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1669
1670 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1671 SCRegType src_type = Inst->GetSrcType(i);
1672 switch (src_type) {
1673 case SCMEM_LDS:
1674 if (group_is_multi_wave ||
1675 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1676 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1677 ScoreBrackets->getScoreUB(DS_CNT));
1678 // LDS may have to wait for VMcnt after buffer load to LDS
1679 if (target_info->HasBufferLoadToLDS()) {
1680 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1681 ScoreBrackets->getScoreUB(LOAD_CNT));
1682 }
1683 }
1684 break;
1685
1686 case SCMEM_GDS:
1687 if (group_is_multi_wave || fence_is_global) {
1688 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1689 ScoreBrackets->getScoreUB(EXP_CNT));
1690 EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
1691 ScoreBrackets->getScoreUB(DS_CNT));
1692 }
1693 break;
1694
1695 case SCMEM_UAV:
1696 case SCMEM_TFBUF:
1697 case SCMEM_RING:
1698 case SCMEM_SCATTER:
1699 if (group_is_multi_wave || fence_is_global) {
1700 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1701 ScoreBrackets->getScoreUB(EXP_CNT));
1702 EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
1703 ScoreBrackets->getScoreUB(LOAD_CNT));
1704 }
1705 break;
1706
1707 case SCMEM_SCRATCH:
1708 default:
1709 break;
1710 }
1711 }
1712 }
1713#endif
1714
1715 // Export & GDS instructions do not read the EXEC mask until after the export
1716 // is granted (which can occur well after the instruction is issued).
1717 // The shader program must flush all EXP operations on the export-count
1718 // before overwriting the EXEC mask.
1719 else {
1720 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1721 // Export and GDS are tracked individually, either may trigger a waitcnt
1722 // for EXEC.
1723 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1724 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1725 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1726 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1727 Wait.ExpCnt = 0;
1728 }
1729 }
1730
1731 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1732 // The function is going to insert a wait on everything in its prolog.
1733 // This still needs to be careful if the call target is a load (e.g. a GOT
1734 // load). We also need to check WAW dependency with saved PC.
1736
1737 int CallAddrOpIdx =
1738 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1739
1740 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1741 RegInterval CallAddrOpInterval =
1742 ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
1743
1744 for (int RegNo = CallAddrOpInterval.first;
1745 RegNo < CallAddrOpInterval.second; ++RegNo)
1746 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1747
1748 int RtnAddrOpIdx =
1749 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1750 if (RtnAddrOpIdx != -1) {
1751 RegInterval RtnAddrOpInterval =
1752 ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
1753
1754 for (int RegNo = RtnAddrOpInterval.first;
1755 RegNo < RtnAddrOpInterval.second; ++RegNo)
1756 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1757 }
1758 }
1759 } else {
1760 // FIXME: Should not be relying on memoperands.
1761 // Look at the source operands of every instruction to see if
1762 // any of them results from a previous memory operation that affects
1763 // its current usage. If so, an s_waitcnt instruction needs to be
1764 // emitted.
1765 // If the source operand was defined by a load, add the s_waitcnt
1766 // instruction.
1767 //
1768 // Two cases are handled for destination operands:
1769 // 1) If the destination operand was defined by a load, add the s_waitcnt
1770 // instruction to guarantee the right WAW order.
1771 // 2) If a destination operand that was used by a recent export/store ins,
1772 // add s_waitcnt on exp_cnt to guarantee the WAR order.
1773
1774 for (const MachineMemOperand *Memop : MI.memoperands()) {
1775 const Value *Ptr = Memop->getValue();
1776 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1777 addWait(Wait, SmemAccessCounter, 0);
1778 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1779 SLoadAddresses.erase(Ptr);
1780 }
1781 unsigned AS = Memop->getAddrSpace();
1783 continue;
1784 // No need to wait before load from VMEM to LDS.
1785 if (TII->mayWriteLDSThroughDMA(MI))
1786 continue;
1787
1788 // LOAD_CNT is only relevant to vgpr or LDS.
1789 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1790 bool FoundAliasingStore = false;
1791 // Only objects with alias scope info were added to the LDSDMAStores array.
1792 // In the absence of the scope info we will not be able to disambiguate
1793 // aliasing here. There is no need to try searching for a corresponding
1794 // store slot. This is conservatively correct because in that case we
1795 // will produce a wait using the first (general) LDS DMA wait slot which
1796 // will wait on all of them anyway.
1797 if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
1798 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
1799 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
1800 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
1801 FoundAliasingStore = true;
1802 ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
1803 }
1804 }
1805 }
1806 if (!FoundAliasingStore)
1807 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1808 if (Memop->isStore()) {
1809 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1810 }
1811 }
1812
1813 // Loop over use and def operands.
1814 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1815 MachineOperand &Op = MI.getOperand(I);
1816 if (!Op.isReg())
1817 continue;
1818
1819 // If the instruction does not read tied source, skip the operand.
1820 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1821 continue;
1822
1823 RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
1824
1825 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1826 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1827 if (IsVGPR) {
1828 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1829 // previous write and this write are the same type of VMEM
1830 // instruction, in which case they're guaranteed to write their
1831 // results in order anyway.
1832 if (Op.isUse() || !updateVMCntOnly(MI) ||
1833 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1834 getVmemType(MI))) {
1835 ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
1836 ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
1837 ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
1838 ScoreBrackets.clearVgprVmemTypes(RegNo);
1839 }
1840 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1841 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1842 }
1843 ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
1844 } else {
1845 ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
1846 }
1847 }
1848 }
1849 }
1850 }
1851
1852 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1853 // not, we need to ensure the subtarget is capable of backing off barrier
1854 // instructions in case there are any outstanding memory operations that may
1855 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1856 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1857 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1858 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
1859 }
1860
1861 // TODO: Remove this work-around, enable the assert for Bug 457939
1862 // after fixing the scheduler. Also, the Shader Compiler code is
1863 // independent of target.
1864 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1865 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1866 Wait.DsCnt = 0;
1867 }
1868 }
1869
1870 // Verify that the wait is actually needed.
1871 ScoreBrackets.simplifyWaitcnt(Wait);
1872
1873 if (ForceEmitZeroWaitcnts)
1874 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
1875
1876 if (ForceEmitWaitcnt[LOAD_CNT])
1877 Wait.LoadCnt = 0;
1878 if (ForceEmitWaitcnt[EXP_CNT])
1879 Wait.ExpCnt = 0;
1880 if (ForceEmitWaitcnt[DS_CNT])
1881 Wait.DsCnt = 0;
1882 if (ForceEmitWaitcnt[SAMPLE_CNT])
1883 Wait.SampleCnt = 0;
1884 if (ForceEmitWaitcnt[BVH_CNT])
1885 Wait.BvhCnt = 0;
1886 if (ForceEmitWaitcnt[KM_CNT])
1887 Wait.KmCnt = 0;
1888
1889 if (FlushVmCnt) {
1890 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
1891 Wait.LoadCnt = 0;
1892 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
1893 Wait.SampleCnt = 0;
1894 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
1895 Wait.BvhCnt = 0;
1896 }
1897
1898 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1899 OldWaitcntInstr);
1900}
1901
// Apply the wait described by Wait at position It in Block.
//
// If OldWaitcntInstr is non-null, the required wait is first merged into the
// pre-existing waitcnt instruction(s) through WCG->applyPreexistingWaitcnt.
// The score brackets are then updated with the wait, an ExpCnt requirement may
// be folded into the waitexp operand of the instruction at It, and finally
// WCG->createNewWaitcnt is asked to emit whatever is still required.
//
// Returns true if applyPreexistingWaitcnt or createNewWaitcnt reported a
// change, or if a waitexp operand was lowered here.
// NOTE(review): the declarations of the It and Block parameters (original
// lines 1903-1904) are missing from this listing — confirm against upstream.
1902bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1905 WaitcntBrackets &ScoreBrackets,
1906 MachineInstr *OldWaitcntInstr) {
1907 bool Modified = false;
1908
1909 if (OldWaitcntInstr)
1910 // Try to merge the required wait with preexisting waitcnt instructions.
1911 // Also erase redundant waitcnt.
1912 Modified =
1913 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1914
1915 // Any counts that could have been applied to any existing waitcnt
1916 // instructions will have been done so, now deal with any remaining.
1917 ScoreBrackets.applyWaitcnt(Wait);
1918
1919 // ExpCnt can be merged into VINTERP.
// NOTE(review): the last clause of this condition (original line 1921) is
// missing from this listing; per the comment above it presumably checks that
// *It is a VINTERP instruction — confirm.
1920 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1922 MachineOperand *WaitExp =
1923 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
// Only ever lower the existing waitexp immediate, never raise it.
1924 if (Wait.ExpCnt < WaitExp->getImm()) {
1925 WaitExp->setImm(Wait.ExpCnt);
1926 Modified = true;
1927 }
// Reset ExpCnt to ~0u, the value the condition above treats as "nothing to
// merge", so the ExpCnt requirement is not handled a second time below.
1928 Wait.ExpCnt = ~0u;
1929
1930 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
1931 << "Update Instr: " << *It);
1932 }
1933
// Emit new wait instruction(s) for whatever is still set in Wait.
1934 if (WCG->createNewWaitcnt(Block, It, Wait))
1935 Modified = true;
1936
1937 return Modified;
1938}
1939
1940// This is a flat memory operation. Check to see if it has memory tokens other
1941// than LDS. Other address spaces supported by flat memory operations involve
1942// global memory.
//
// Returns true if MI has no memory operands (conservative answer) or if any
// memory operand is in an address space other than LOCAL_ADDRESS; returns
// false only when every memory operand is in LOCAL_ADDRESS. MI must be a FLAT
// instruction that uses the VM counter (both are asserted).
1943bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1944 assert(TII->isFLAT(MI));
1945
1946 // All flat instructions use the VMEM counter.
1947 assert(TII->usesVM_CNT(MI));
1948
1949 // If there are no memory operands then conservatively assume the flat
1950 // operation may access VMEM.
1951 if (MI.memoperands_empty())
1952 return true;
1953
1954 // See if any memory operand specifies an address space that involves VMEM.
1955 // Flat operations only support FLAT, LOCAL (LDS), or address spaces
1956 // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1957 // (GDS) address space is not supported by flat operations. Therefore, simply
1958 // return true unless only the LDS address space is found.
1959 for (const MachineMemOperand *Memop : MI.memoperands()) {
1960 unsigned AS = Memop->getAddrSpace();
// NOTE(review): original line 1961 is absent from this listing — confirm
// against the upstream file what, if anything, sits between these two lines.
1962 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1963 return true;
1964 }
1965
1966 return false;
1967}
1968
1969// This is a flat memory operation. Check to see if it has memory tokens for
1970// either LDS or FLAT.
//
// Returns false outright when MI does not use the LGKM counter or when the
// subtarget has tgsplit mode enabled. Returns true when MI carries no memory
// operands (conservative answer). Otherwise the memory operands are scanned
// and the function returns true on the first one that matches the address
// space test inside the loop, and false if none does. MI must be a FLAT
// instruction (asserted).
1971bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1972 assert(TII->isFLAT(MI));
1973
1974 // Flat instruction such as SCRATCH and GLOBAL do not use the lgkm counter.
1975 if (!TII->usesLGKM_CNT(MI))
1976 return false;
1977
1978 // If in tgsplit mode then there can be no use of LDS.
1979 if (ST->isTgSplitEnabled())
1980 return false;
1981
1982 // If there are no memory operands then conservatively assume the flat
1983 // operation may access LDS.
1984 if (MI.memoperands_empty())
1985 return true;
1986
1987 // See if any memory operand specifies an address space that involves LDS.
1988 for (const MachineMemOperand *Memop : MI.memoperands()) {
1989 unsigned AS = Memop->getAddrSpace();
// NOTE(review): the condition guarding the return below (original line 1990)
// is missing from this listing; per the header comment it presumably tests AS
// for the LDS or FLAT address space — confirm against the upstream file.
1991 return true;
1992 }
1993
1994 return false;
1995}
1996
1997// This is a flat memory operation. Check to see if it has memory tokens for
1998// either scratch or FLAT.
1999bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
2000 const MachineInstr &MI) const {
2001 assert(TII->isFLAT(MI));
2002
2003 // SCRATCH instructions always access scratch.
2004 if (TII->isFLATScratch(MI))
2005 return true;
2006
2007 // GLOBAL instructions never access scratch.
2008 if (TII->isFLATGlobal(MI))
2009 return false;
2010
2011 // If there are no memory operands then conservatively assume the flat
2012 // operation may access scratch.
2013 if (MI.memoperands_empty())
2014 return true;
2015
2016 // See if any memory operand specifies an address space that involves scratch.
2017 return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
2018 unsigned AS = Memop->getAddrSpace();
2019 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
2020 });
2021}
2022
2024 auto Opc = Inst.getOpcode();
2025 return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
2026 Opc == AMDGPU::GLOBAL_WBINV;
2027}
2028
// Update ScoreBrackets to reflect that Inst has been issued.
//
// The instruction is classified (DS, FLAT, VMEM, SMRD, call, LDSDIR, VINTERP,
// EXP, or one of a few scalar opcodes) and the matching wait event(s) are
// recorded with updateByEvent. Calls and VINTERP do not record an event;
// instead they apply a wait to the brackets. Instructions that match none of
// the cases leave the brackets unchanged.
2029void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2030 WaitcntBrackets *ScoreBrackets) {
2031 // Now look at the instruction opcode. If it is a memory access
2032 // instruction, update the upper-bound of the appropriate counter's
2033 // bracket and the destination operand scores.
2034 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2035
2036 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
// A DS instruction is treated as GDS if its opcode is always-GDS or its gds
// modifier is set; every other DS instruction is recorded as an LDS access.
2037 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2038 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2039 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
2040 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
2041 } else {
2042 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2043 }
2044 } else if (TII->isFLAT(Inst)) {
2045 // TODO: Track this properly.
2046 if (isCacheInvOrWBInst(Inst))
2047 return;
2048
2049 assert(Inst.mayLoadOrStore());
2050
// Counts how many of {VMEM, LDS} this flat access may touch.
2051 int FlatASCount = 0;
2052
2053 if (mayAccessVMEMThroughFlat(Inst)) {
2054 ++FlatASCount;
2055 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2056 Inst);
2057 }
2058
2059 if (mayAccessLDSThroughFlat(Inst)) {
2060 ++FlatASCount;
2061 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
2062 }
2063
2064 // A Flat memory operation must access at least one address space.
2065 assert(FlatASCount);
2066
2067 // This is a flat memory operation that access both VMEM and LDS, so note it
2068 // - it will require that both the VM and LGKM be flushed to zero if it is
2069 // pending when a VM or LGKM dependency occurs.
2070 if (FlatASCount > 1)
2071 ScoreBrackets->setPendingFlat();
// NOTE(review): the second half of the condition below (original line 2073)
// is missing from this listing — confirm against the upstream file.
2072 } else if (SIInstrInfo::isVMEM(Inst) &&
2074 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2075 Inst);
2076
// On subtargets where VMEM writes need an expcnt wait, stores and returning
// atomics additionally record a VMW_GPR_LOCK event.
2077 if (ST->vmemWriteNeedsExpWaitcnt() &&
2078 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2079 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
2080 }
2081 } else if (TII->isSMRD(Inst)) {
2082 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2083 } else if (Inst.isCall()) {
2084 if (callWaitsOnFunctionReturn(Inst)) {
2085 // Act as a wait on everything
2086 ScoreBrackets->applyWaitcnt(
2087 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2088 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2089 } else {
2090 // May need to wait for anything.
2091 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2092 }
2093 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2094 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
2095 } else if (TII->isVINTERP(Inst)) {
// VINTERP carries its own waitexp immediate; apply it as an EXP_CNT wait.
2096 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2097 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2098 } else if (SIInstrInfo::isEXP(Inst)) {
// The export target (tgt operand) selects which export event is recorded.
2099 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
// NOTE(review): the `if` guarding the next statement (original line 2100) is
// missing from this listing; the `else if` that follows implies one exists,
// presumably a parameter-export target range check — confirm.
2101 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
2102 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2103 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
2104 else
2105 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
2106 } else {
// Remaining scalar opcodes that record an event; anything else is ignored.
2107 switch (Inst.getOpcode()) {
2108 case AMDGPU::S_SENDMSG:
2109 case AMDGPU::S_SENDMSG_RTN_B32:
2110 case AMDGPU::S_SENDMSG_RTN_B64:
2111 case AMDGPU::S_SENDMSGHALT:
2112 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
2113 break;
2114 case AMDGPU::S_MEMTIME:
2115 case AMDGPU::S_MEMREALTIME:
2116 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
2117 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
2118 case AMDGPU::S_BARRIER_LEAVE:
2119 case AMDGPU::S_GET_BARRIER_STATE_M0:
2120 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2121 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
2122 break;
2123 }
2124 }
2125}
2126
2127bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2128 unsigned OtherScore) {
2129 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2130 unsigned OtherShifted =
2131 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2132 Score = std::max(MyShifted, OtherShifted);
2133 return OtherShifted > MyShifted;
2134}
2135
2136/// Merge the pending events and associater score brackets of \p Other into
2137/// this brackets status.
2138///
2139/// Returns whether the merge resulted in a change that requires tighter waits
2140/// (i.e. the merged brackets strictly dominate the original brackets).
2141bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2142 bool StrictDom = false;
2143
2144 VgprUB = std::max(VgprUB, Other.VgprUB);
2145 SgprUB = std::max(SgprUB, Other.SgprUB);
2146
2147 for (auto T : inst_counter_types(MaxCounter)) {
2148 // Merge event flags for this counter
2149 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2150 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2151 if (OtherEvents & ~OldEvents)
2152 StrictDom = true;
2153 PendingEvents |= OtherEvents;
2154
2155 // Merge scores for this counter
2156 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2157 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2158 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2159 if (NewUB < ScoreLBs[T])
2160 report_fatal_error("waitcnt score overflow");
2161
2162 MergeInfo M;
2163 M.OldLB = ScoreLBs[T];
2164 M.OtherLB = Other.ScoreLBs[T];
2165 M.MyShift = NewUB - ScoreUBs[T];
2166 M.OtherShift = NewUB - Other.ScoreUBs[T];
2167
2168 ScoreUBs[T] = NewUB;
2169
2170 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2171
2172 for (int J = 0; J <= VgprUB; J++)
2173 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
2174
2175 if (T == SmemAccessCounter) {
2176 for (int J = 0; J <= SgprUB; J++)
2177 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
2178 }
2179 }
2180
2181 for (int J = 0; J <= VgprUB; J++) {
2182 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
2183 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
2184 VgprVmemTypes[J] = NewVmemTypes;
2185 }
2186
2187 return StrictDom;
2188}
2189
2190static bool isWaitInstr(MachineInstr &Inst) {
2191 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2192 return Opcode == AMDGPU::S_WAITCNT ||
2193 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2194 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2195 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2196 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2197 counterTypeForInstr(Opcode).has_value();
2198}
2199
2200// Generate s_waitcnt instructions where needed.
//
// Walks the instructions of Block in order. The first wait instruction of each
// run of pre-existing waits is remembered and handed to
// generateWaitcntInstBefore together with the next non-wait instruction. After
// each non-wait instruction the score brackets are updated via
// updateEventWaitcntAfter. The function also tracks whether vccz can be
// trusted and, when an instruction reads vccz while it may be stale, inserts
// an s_mov of vcc to itself to restore it. Returns true if the block was
// modified.
// NOTE(review): the Block parameter declaration (original line 2202) is
// missing from this listing — confirm against the upstream file.
2201bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2203 WaitcntBrackets &ScoreBrackets) {
2204 bool Modified = false;
2205
2206 LLVM_DEBUG({
2207 dbgs() << "*** Block" << Block.getNumber() << " ***";
2208 ScoreBrackets.dump();
2209 });
2210
2211 // Track the correctness of vccz through this basic block. There are two
2212 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2213 // ST->partialVCCWritesUpdateVCCZ().
2214 bool VCCZCorrect = true;
2215 if (ST->hasReadVCCZBug()) {
2216 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2217 // to vcc and then issued an smem load.
2218 VCCZCorrect = false;
2219 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2220 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2221 // to vcc_lo or vcc_hi.
2222 VCCZCorrect = false;
2223 }
2224
2225 // Walk over the instructions.
2226 MachineInstr *OldWaitcntInstr = nullptr;
2227
// Iter is advanced explicitly inside the loop body.
2228 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2229 E = Block.instr_end();
2230 Iter != E;) {
2231 MachineInstr &Inst = *Iter;
2232
2233 // Track pre-existing waitcnts that were added in earlier iterations or by
2234 // the memory legalizer.
2235 if (isWaitInstr(Inst)) {
2236 if (!OldWaitcntInstr)
2237 OldWaitcntInstr = &Inst;
2238 ++Iter;
2239 continue;
2240 }
2241
// A vmcnt flush is requested only at the block's first terminator, and only
// if this block is a loop preheader selected by isPreheaderToFlush.
2242 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2243 isPreheaderToFlush(Block, ScoreBrackets);
2244
2245 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2246 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2247 FlushVmCnt);
2248 OldWaitcntInstr = nullptr;
2249
2250 // Restore vccz if it's not known to be correct already.
2251 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
2252
2253 // Don't examine operands unless we need to track vccz correctness.
2254 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2255 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
2256 Inst.definesRegister(AMDGPU::VCC_HI)) {
2257 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2258 if (!ST->partialVCCWritesUpdateVCCZ())
2259 VCCZCorrect = false;
2260 } else if (Inst.definesRegister(AMDGPU::VCC)) {
2261 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2262 // vccz bit, so when we detect that an instruction may read from a
2263 // corrupt vccz bit, we need to:
2264 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2265 // operations to complete.
2266 // 2. Restore the correct value of vccz by writing the current value
2267 // of vcc back to vcc.
2268 if (ST->hasReadVCCZBug() &&
2269 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2270 // Writes to vcc while there's an outstanding smem read may get
2271 // clobbered as soon as any read completes.
2272 VCCZCorrect = false;
2273 } else {
2274 // Writes to vcc will fix any incorrect value in vccz.
2275 VCCZCorrect = true;
2276 }
2277 }
2278 }
2279
2280 if (TII->isSMRD(Inst)) {
// Remember the address and parent block of every non-invariant scalar load
// in SLoadAddresses.
2281 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2282 // No need to handle invariant loads when avoiding WAR conflicts, as
2283 // there cannot be a vector store to the same memory location.
2284 if (!Memop->isInvariant()) {
2285 const Value *Ptr = Memop->getValue();
2286 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2287 }
2288 }
2289 if (ST->hasReadVCCZBug()) {
2290 // This smem read could complete and clobber vccz at any time.
2291 VCCZCorrect = false;
2292 }
2293 }
2294
2295 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2296
2297#if 0 // TODO: implement resource type check controlled by options with ub = LB.
2298 // If this instruction generates a S_SETVSKIP because it is an
2299 // indexed resource, and we are on Tahiti, then it will also force
2300 // an S_WAITCNT vmcnt(0)
2301 if (RequireCheckResourceType(Inst, context)) {
2302 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
2303 ScoreBrackets->setScoreLB(LOAD_CNT,
2304 ScoreBrackets->getScoreUB(LOAD_CNT));
2305 }
2306#endif
2307
// In precise-memory mode every load/store is followed by an all-zero wait
// placed right after Inst. The argument (named IncludeVSCnt at the other
// getAllZeroWaitcnt call sites) is true only for stores that are not
// returning atomics. simplifyWaitcnt runs on the wait before it is emitted.
2308 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) {
2309 AMDGPU::Waitcnt Wait = WCG->getAllZeroWaitcnt(
2310 Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst));
2311 ScoreBrackets.simplifyWaitcnt(Wait);
2312 Modified |= generateWaitcnt(Wait, std::next(Inst.getIterator()), Block,
2313 ScoreBrackets, /*OldWaitcntInstr=*/nullptr);
2314 }
2315
2316 LLVM_DEBUG({
2317 Inst.print(dbgs());
2318 ScoreBrackets.dump();
2319 });
2320
2321 // TODO: Remove this work-around after fixing the scheduler and enable the
2322 // assert above.
2323 if (RestoreVCCZ) {
2324 // Restore the vccz bit. Any time a value is written to vcc, the vccz
2325 // bit is updated, so we can restore the bit by reading the value of
2326 // vcc and then writing it back to the register.
// The move is S_MOV_B32 on wave32 and S_MOV_B64 otherwise.
2327 BuildMI(Block, Inst, Inst.getDebugLoc(),
2328 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2329 TRI->getVCC())
2330 .addReg(TRI->getVCC());
2331 VCCZCorrect = true;
2332 Modified = true;
2333 }
2334
2335 ++Iter;
2336 }
2337
2338 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2339 // needed.
// NOTE(review): the declaration of Wait (original line 2340) is missing from
// this listing — confirm against the upstream file.
// This path covers blocks without a terminator; blocks with one are handled
// through FlushVmCnt inside the loop above.
2341 if (Block.getFirstTerminator() == Block.end() &&
2342 isPreheaderToFlush(Block, ScoreBrackets)) {
2343 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2344 Wait.LoadCnt = 0;
2345 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2346 Wait.SampleCnt = 0;
2347 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2348 Wait.BvhCnt = 0;
2349 }
2350
2351 // Combine or remove any redundant waitcnts at the end of the block.
2352 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2353 OldWaitcntInstr);
2354
2355 return Modified;
2356}
2357
2358// Return true if the given machine basic block is a preheader of a loop in
2359// which we want to flush the vmcnt counter, and false otherwise.
//
// The answer is memoized per block in PreheadersToFlush: try_emplace seeds the
// entry with false, so every early `return false` below leaves a cached false,
// and only the positive case overwrites the entry with true. A later call for
// the same block returns the cached value without consulting ScoreBrackets.
2360bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
2361 WaitcntBrackets &ScoreBrackets) {
2362 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2363 if (!IsInserted)
2364 return Iterator->second;
2365
// NOTE(review): the declaration of Succ (original line 2366) is missing from
// this listing — confirm against the upstream file how it is obtained.
2367 if (!Succ)
2368 return false;
2369
// The successor must belong to a loop, and MBB must be that loop's preheader.
2370 MachineLoop *Loop = MLI->getLoopFor(Succ);
2371 if (!Loop)
2372 return false;
2373
2374 if (Loop->getLoopPreheader() == &MBB &&
2375 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2376 Iterator->second = true;
2377 return true;
2378 }
2379
2380 return false;
2381}
2382
2383bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2384 return SIInstrInfo::isVMEM(MI) ||
2385 (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
2386}
2387
2388// Return true if it is better to flush the vmcnt counter in the preheader of
2389// the given loop. We currently decide to flush in two situations:
2390// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2391// vgpr containing a value that is loaded outside of the loop. (Only on
2392// targets with no vscnt counter).
2393// 2. The loop contains vmem load(s), but the loaded values are not used in the
2394// loop, and at least one use of a vgpr containing a value that is loaded
2395// outside of the loop.
2396bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2397 WaitcntBrackets &Brackets) {
2398 bool HasVMemLoad = false;
2399 bool HasVMemStore = false;
2400 bool UsesVgprLoadedOutside = false;
2401 DenseSet<Register> VgprUse;
2402 DenseSet<Register> VgprDef;
2403
2404 for (MachineBasicBlock *MBB : ML->blocks()) {
2405 for (MachineInstr &MI : *MBB) {
2406 if (isVMEMOrFlatVMEM(MI)) {
2407 if (MI.mayLoad())
2408 HasVMemLoad = true;
2409 if (MI.mayStore())
2410 HasVMemStore = true;
2411 }
2412 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
2413 MachineOperand &Op = MI.getOperand(I);
2414 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2415 continue;
2416 RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
2417 // Vgpr use
2418 if (Op.isUse()) {
2419 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2420 // If we find a register that is loaded inside the loop, 1. and 2.
2421 // are invalidated and we can exit.
2422 if (VgprDef.contains(RegNo))
2423 return false;
2424 VgprUse.insert(RegNo);
2425 // If at least one of Op's registers is in the score brackets, the
2426 // value is likely loaded outside of the loop.
2427 if (Brackets.getRegScore(RegNo, LOAD_CNT) >
2428 Brackets.getScoreLB(LOAD_CNT) ||
2429 Brackets.getRegScore(RegNo, SAMPLE_CNT) >
2430 Brackets.getScoreLB(SAMPLE_CNT) ||
2431 Brackets.getRegScore(RegNo, BVH_CNT) >
2432 Brackets.getScoreLB(BVH_CNT)) {
2433 UsesVgprLoadedOutside = true;
2434 break;
2435 }
2436 }
2437 }
2438 // VMem load vgpr def
2439 else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
2440 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
2441 // If we find a register that is loaded inside the loop, 1. and 2.
2442 // are invalidated and we can exit.
2443 if (VgprUse.contains(RegNo))
2444 return false;
2445 VgprDef.insert(RegNo);
2446 }
2447 }
2448 }
2449 }
2450 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2451 return true;
2452 return HasVMemLoad && UsesVgprLoadedOutside;
2453}
2454
// Pass entry point: insert and update wait instructions throughout MF.
//
// Steps, in order:
//  - cache subtarget, instruction/register info and the required analyses;
//  - select the waitcnt generator and counter set according to
//    ST->hasExtendedWaitCounts();
//  - compute the hardware counter limits and the VGPR/SGPR encoding ranges;
//  - for non-entry functions, insert zero waits at the top of the entry block
//    and seed that block's incoming state;
//  - process blocks repeatedly until no already-visited block is dirty;
//  - on subtargets with scalar stores, insert S_DCACHE_WB before program end
//    where a scalar store has not been followed by one;
//  - insert the messages recorded in ReleaseVGPRInsts.
// Returns true if the function was modified.
// NOTE(review): several original lines are missing from this listing (2460,
// 2466, 2520, 2559, 2619, 2672); each gap is marked below.
2455bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
2456 ST = &MF.getSubtarget<GCNSubtarget>();
2457 TII = ST->getInstrInfo();
2458 TRI = &TII->getRegisterInfo();
2459 MRI = &MF.getRegInfo();
// NOTE(review): original line 2460 is missing; MFI is used further down, so
// it is presumably initialized here — confirm.
2461 MLI = &getAnalysis<MachineLoopInfo>();
2462 PDT = &getAnalysis<MachinePostDominatorTree>();
// Alias analysis is optional; AA is only assigned when the wrapper pass is
// available.
2463 if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2464 AA = &AAR->getAAResults();
2465
// NOTE(review): original line 2466 is missing; IV is passed to the
// get*BitMask helpers below, so it is presumably defined here — confirm.
2467
2468 if (ST->hasExtendedWaitCounts()) {
2469 MaxCounter = NUM_EXTENDED_INST_CNTS;
2470 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
2471 WCG = &WCGGFX12Plus;
2472 } else {
2473 MaxCounter = NUM_NORMAL_INST_CNTS;
2474 WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
2475 WCG = &WCGPreGFX12;
2476 }
2477
// Start from the command-line force-zero flag and clear the per-counter
// force flags.
2478 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
2479 for (auto T : inst_counter_types())
2480 ForceEmitWaitcnt[T] = false;
2481
2482 const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
2483
// The counter that tracks SMEM_ACCESS events, looked up in the event mask.
2484 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2485
2486 OptNone = MF.getFunction().hasOptNone() ||
2487 MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
2488
// Maximum value of each hardware counter. The load and DS limits come from
// different helpers depending on whether extended wait counts exist.
2489 HardwareLimits Limits = {};
2490 if (ST->hasExtendedWaitCounts()) {
2491 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2492 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2493 } else {
2494 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2495 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2496 }
2497 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2498 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2499 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2500 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2501 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2502
2503 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
2504 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
2505 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
2506 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
2507
// First and last hardware encoding index of the addressable VGPRs and SGPRs.
2508 RegisterEncoding Encoding = {};
2509 Encoding.VGPR0 =
2510 TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2511 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
2512 Encoding.SGPR0 =
2513 TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
2514 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
2515
2516 BlockInfos.clear();
2517 bool Modified = false;
2518
2519 MachineBasicBlock &EntryBB = MF.front();
// NOTE(review): original line 2520 is missing; the iterator I used below is
// presumably declared here — confirm.
2521
2522 if (!MFI->isEntryFunction()) {
2523 // Wait for any outstanding memory operations that the input registers may
2524 // depend on. We can't track them and it's better to do the wait after the
2525 // costly call sequence.
2526
2527 // TODO: Could insert earlier and schedule more liberally with operations
2528 // that only use caller preserved registers.
// Skip leading PHIs and meta instructions so the waits are inserted after
// them.
2529 for (MachineBasicBlock::iterator E = EntryBB.end();
2530 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2531 ;
2532
2533 if (ST->hasExtendedWaitCounts()) {
// One combined load/DS wait, then one single-counter wait for every
// extended counter except LOAD_CNT, DS_CNT and STORE_CNT.
2534 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2535 .addImm(0);
2536 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2537 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
2538 continue;
2539
2540 BuildMI(EntryBB, I, DebugLoc(),
2541 TII->get(instrsForExtendedCounterTypes[CT]))
2542 .addImm(0);
2543 }
2544 } else {
2545 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2546 }
2547
// Seed the entry block with the function-entry bracket state.
2548 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
2549 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2550 SmemAccessCounter);
2551 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2552 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2553
2554 Modified = true;
2555 }
2556
2557 // Keep iterating over the blocks in reverse post order, inserting and
2558 // updating s_waitcnt where needed, until a fix point is reached.
// NOTE(review): the loop header that populates BlockInfos (original line
// 2559) is missing from this listing — confirm against the upstream file.
2560 BlockInfos.insert({MBB, BlockInfo()});
2561
// A single working bracket object is reused across blocks. It is recreated
// on demand after being moved into a successor at the end of an iteration.
2562 std::unique_ptr<WaitcntBrackets> Brackets;
2563 bool Repeat;
2564 do {
2565 Repeat = false;
2566
2567 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2568 ++BII) {
2569 MachineBasicBlock *MBB = BII->first;
2570 BlockInfo &BI = BII->second;
2571 if (!BI.Dirty)
2572 continue;
2573
// Start from this block's incoming state if it has one, otherwise from a
// freshly constructed state.
2574 if (BI.Incoming) {
2575 if (!Brackets)
2576 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2577 else
2578 *Brackets = *BI.Incoming;
2579 } else {
2580 if (!Brackets)
2581 Brackets = std::make_unique<WaitcntBrackets>(
2582 ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
2583 SmemAccessCounter);
2584 else
2585 *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
2586 WaitEventMaskForInst, SmemAccessCounter);
2587 }
2588
2589 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2590 BI.Dirty = false;
2591
// Propagate the outgoing state to successors only if something is still
// pending at the end of the block.
2592 if (Brackets->hasPendingEvent()) {
2593 BlockInfo *MoveBracketsToSucc = nullptr;
2594 for (MachineBasicBlock *Succ : MBB->successors()) {
2595 auto SuccBII = BlockInfos.find(Succ);
2596 BlockInfo &SuccBI = SuccBII->second;
2597 if (!SuccBI.Incoming) {
2598 SuccBI.Dirty = true;
// A dirty successor at or before the current position will not be reached
// again in this sweep, so another sweep is requested.
2599 if (SuccBII <= BII)
2600 Repeat = true;
// The first successor without incoming state receives Brackets by move
// (after the loop); the others receive a copy.
2601 if (!MoveBracketsToSucc) {
2602 MoveBracketsToSucc = &SuccBI;
2603 } else {
2604 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2605 }
2606 } else if (SuccBI.Incoming->merge(*Brackets)) {
// merge() reported a change that requires tighter waits; revisit the
// successor.
2607 SuccBI.Dirty = true;
2608 if (SuccBII <= BII)
2609 Repeat = true;
2610 }
2611 }
2612 if (MoveBracketsToSucc)
2613 MoveBracketsToSucc->Incoming = std::move(Brackets);
2614 }
2615 }
2616 } while (Repeat);
2617
2618 if (ST->hasScalarStores()) {
// NOTE(review): the declaration of EndPgmBlocks (original line 2619) is
// missing from this listing — confirm against the upstream file.
2620 bool HaveScalarStores = false;
2621
// Record whether any scalar store exists, and collect a block once per
// S_ENDPGM or SI_RETURN_TO_EPILOG instruction found in it.
2622 for (MachineBasicBlock &MBB : MF) {
2623 for (MachineInstr &MI : MBB) {
2624 if (!HaveScalarStores && TII->isScalarStore(MI))
2625 HaveScalarStores = true;
2626
2627 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2628 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2629 EndPgmBlocks.push_back(&MBB);
2630 }
2631 }
2632
2633 if (HaveScalarStores) {
2634 // If scalar writes are used, the cache must be flushed or else the next
2635 // wave to reuse the same scratch memory can be clobbered.
2636 //
2637 // Insert s_dcache_wb at wave termination points if there were any scalar
2638 // stores, and only if the cache hasn't already been flushed. This could
2639 // be improved by looking across blocks for flushes in postdominating
2640 // blocks from the stores but an explicitly requested flush is probably
2641 // very rare.
2642 for (MachineBasicBlock *MBB : EndPgmBlocks) {
// True while the most recent S_DCACHE_WB or scalar store seen in this block
// was an S_DCACHE_WB.
2643 bool SeenDCacheWB = false;
2644
2645 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2646 I != E; ++I) {
2647 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2648 SeenDCacheWB = true;
2649 else if (TII->isScalarStore(*I))
2650 SeenDCacheWB = false;
2651
2652 // FIXME: It would be better to insert this before a waitcnt if any.
2653 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2654 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2655 !SeenDCacheWB) {
2656 Modified = true;
2657 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2658 }
2659 }
2660 }
2661 }
2662 }
2663
2664 // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2665 // instructions.
2666 for (MachineInstr *MI : ReleaseVGPRInsts) {
// Some subtargets need an S_NOP ahead of the message.
2667 if (ST->requiresNopBeforeDeallocVGPRs()) {
2668 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
2669 .addImm(0);
2670 }
// NOTE(review): the operand appended to this S_SENDMSG (original line 2672)
// is missing from this listing — confirm against the upstream file.
2671 BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
2673 Modified = true;
2674 }
2675 ReleaseVGPRInsts.clear();
2676
2677 return Modified;
2678}
unsigned const MachineRegisterInfo * MRI
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:182
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
std::optional< std::vector< StOtherPiece > > Other
Definition: ELFYAML.cpp:1291
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition: PassSupport.h:55
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:59
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:52
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
static bool isCacheInvOrWBInst(MachineInstr &Inst)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static bool readsVCCZ(const MachineInstr &MI)
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define DEBUG_TYPE
SI Insert Waitcnts
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
Provides some synthesis utilities to produce sequences of values.
static const uint32_t IV[8]
Definition: blake3_impl.h:78
A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
void setPreservesCFG()
This function should be called by the pass iff it does not:
Definition: Pass.cpp:269
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
This class represents an Operation in the Expression.
static bool isCounterSet(unsigned ID)
Definition: DebugCounter.h:100
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:72
A debug info location.
Definition: DebugLoc.h:33
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
bool erase(const KeyT &Val)
Definition: DenseMap.h:329
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:311
bool hasOptNone() const
Do not optimize this function (-O0).
Definition: Function.h:675
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:546
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:329
bool isCall(QueryType Type=AnyInBundle) const
Definition: MachineInstr.h:918
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:549
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI=nullptr) const
Return true if the MachineInstr fully defines the specified register.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:759
void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:475
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:556
MachineLoop * getLoopFor(const MachineBasicBlock *BB) const
Return the innermost loop that BB lives in.
A description of a memory reference used in the backend.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
bool dominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
iterator find(const KeyT &Key)
Definition: MapVector.h:167
iterator begin()
Definition: MapVector.h:69
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
void clear()
Definition: MapVector.h:88
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Definition: Pass.cpp:81
static bool isVMEM(const MachineInstr &MI)
Definition: SIInstrInfo.h:432
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:691
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isLDSDIR(const MachineInstr &MI)
Definition: SIInstrInfo.h:833
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
static bool isAtomicRet(const MachineInstr &MI)
Definition: SIInstrInfo.h:673
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:936
static bool isVINTERP(const MachineInstr &MI)
Definition: SIInstrInfo.h:841
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
self_iterator getIterator()
Definition: ilist_node.h:109
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
unsigned getStorecntBitMask(const IsaVersion &Version)
IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI)
If Reg is a pseudo reg, return the correct hardware register given STI otherwise return Reg.
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition: CallingConv.h:24
@ Undef
Value of the register doesn't matter.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition: Sequence.h:337
@ Wait
Definition: Threading.h:61
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
char & SIInsertWaitcntsID
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
FunctionPass * createSIInsertWaitcntsPass()
Instruction set architecture version.
Definition: TargetParser.h:125
Represents the counter values to wait for in an s_waitcnt instruction.
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
static constexpr bool is_iterable
Definition: Sequence.h:100