1//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Insert wait instructions for memory reads and writes.
11///
12/// Memory reads and writes are issued asynchronously, so we need to insert
13/// S_WAITCNT instructions when we want to access any of their results or
14/// overwrite any register that's used asynchronously.
15///
16/// TODO: This pass currently keeps one timeline per hardware counter. A more
17/// finely-grained approach that keeps one timeline per event type could
18/// sometimes get away with generating weaker s_waitcnt instructions. For
19/// example, when both SMEM and LDS are in flight and we need to wait for
20/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
21/// but the pass will currently generate a conservative lgkmcnt(0) because
22/// multiple event types are in flight.
23//
24//===----------------------------------------------------------------------===//
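// For example (hand-written illustration, not output captured from this pass):
// a value loaded from global memory arrives asynchronously, so a wait must be
// inserted before its first use:
//
//   global_load_dword v1, v[2:3], off   ; result returned asynchronously
//   s_waitcnt vmcnt(0)                  ; inserted so v1 is valid below
//   v_add_u32 v4, v1, v0                ; first use of v1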
25
26#include "AMDGPU.h"
27#include "GCNSubtarget.h"
31#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/Sequence.h"
39#include "llvm/IR/Dominators.h"
43
44using namespace llvm;
45
46#define DEBUG_TYPE "si-insert-waitcnts"
47
48DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
49 "Force emit s_waitcnt expcnt(0) instrs");
50DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
51 "Force emit s_waitcnt lgkmcnt(0) instrs");
52DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
53 "Force emit s_waitcnt vmcnt(0) instrs");
54
55static cl::opt<bool>
56 ForceEmitZeroFlag("amdgpu-waitcnt-forcezero",
57 cl::desc("Force all waitcnt instrs to be emitted as "
58 "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
59 cl::init(false), cl::Hidden);
60
62 "amdgpu-waitcnt-load-forcezero",
63 cl::desc("Force all waitcnt load counters to wait until 0"),
64 cl::init(false), cl::Hidden);
65
66namespace {
67// Class of object that encapsulates the latest instruction counter score
68// associated with the operand. Used for determining whether an s_waitcnt
69// instruction needs to be emitted.
70
71enum InstCounterType {
72 LOAD_CNT = 0, // VMcnt prior to gfx12.
73 DS_CNT, // LGKMcnt prior to gfx12.
74 EXP_CNT, //
75 STORE_CNT, // VScnt in gfx10/gfx11.
76 NUM_NORMAL_INST_CNTS,
77 SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
78 BVH_CNT, // gfx12+ only.
79 KM_CNT, // gfx12+ only.
80 X_CNT, // gfx1250.
81 NUM_EXTENDED_INST_CNTS,
82 NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
83};
84} // namespace
85
86namespace llvm {
87template <> struct enum_iteration_traits<InstCounterType> {
88 static constexpr bool is_iterable = true;
89};
90} // namespace llvm
91
92namespace {
93// Return an iterator over all counters between LOAD_CNT (the first counter)
94// and \c MaxCounter (exclusive, default value yields an enumeration over
95// all counters).
96auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
97 return enum_seq(LOAD_CNT, MaxCounter);
98}
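// Example (exposition only): restricting the range to the pre-gfx12 subset
// visits exactly the four "normal" counters, in declaration order:
//
//   for (InstCounterType T : inst_counter_types(NUM_NORMAL_INST_CNTS)) {
//     // T takes the values LOAD_CNT, DS_CNT, EXP_CNT, STORE_CNT.
//   }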
99
100/// Integer IDs used to track vector memory locations we may have to wait on.
101/// Encoded as u16 chunks:
102///
103/// [0, REGUNITS_END ): MCRegUnit
104/// [LDSDMA_BEGIN, LDSDMA_END ) : LDS DMA IDs
105///
106/// NOTE: The choice of encoding these as "u16 chunks" is arbitrary.
107/// It gives 2^16 entries per category, which is more than enough
108/// for all register units. MCPhysReg is u16 so we don't even support >u16
109/// physical register numbers at this time, let alone >u16 register units.
110/// In any case, an assertion in "WaitcntBrackets" ensures REGUNITS_END
111/// is enough for all register units.
112using VMEMID = uint32_t;
113
114enum : VMEMID {
115 TRACKINGID_RANGE_LEN = (1 << 16),
116
117 // Important: MCRegUnits must always be tracked starting from 0, as we
118 // need to be able to convert between a MCRegUnit and a VMEMID freely.
119 REGUNITS_BEGIN = 0,
120 REGUNITS_END = REGUNITS_BEGIN + TRACKINGID_RANGE_LEN,
121
122 // Note for LDSDMA: LDSDMA_BEGIN corresponds to the "common"
123 // entry, which is updated for all LDS DMA operations encountered.
124 // Specific LDS DMA IDs start at LDSDMA_BEGIN + 1.
125 NUM_LDSDMA = TRACKINGID_RANGE_LEN,
126 LDSDMA_BEGIN = REGUNITS_END,
127 LDSDMA_END = LDSDMA_BEGIN + NUM_LDSDMA,
128};
129
130/// Convert a MCRegUnit to a VMEMID.
131static constexpr VMEMID toVMEMID(MCRegUnit RU) {
132 return static_cast<unsigned>(RU);
133}
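// Exposition-only sanity checks (they restate the layout documented above):
// register units occupy [0, REGUNITS_END) and the LDS DMA IDs start
// immediately after them, beginning with the "common" entry.
static_assert(REGUNITS_BEGIN == 0, "reg units must start at 0, see toVMEMID");
static_assert(LDSDMA_BEGIN == REGUNITS_END, "LDS DMA IDs follow register units");
static_assert(LDSDMA_END - LDSDMA_BEGIN == NUM_LDSDMA, "one ID per LDS DMA slot");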
134
135struct HardwareLimits {
136 unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
137 unsigned ExpcntMax;
138 unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
139 unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
140 unsigned SamplecntMax; // gfx12+ only.
141 unsigned BvhcntMax; // gfx12+ only.
142 unsigned KmcntMax; // gfx12+ only.
143 unsigned XcntMax; // gfx1250.
144};
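// Minimal sketch (exposition only, not part of the pass): how the pre-gfx12
// maxima can be derived from the ISA version using the AMDGPUBaseInfo helpers
// this file already relies on; the real initialization happens in run(),
// outside this excerpt, and also fills the gfx10+/gfx12+ fields.
[[maybe_unused]] static HardwareLimits
exampleQueryPreGFX12Limits(const GCNSubtarget &ST) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  HardwareLimits Limits = {};
  Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);  // largest encodable vmcnt
  Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);  // largest encodable expcnt
  Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);  // largest encodable lgkmcnt
  // The storecnt/samplecnt/bvhcnt/kmcnt/xcnt maxima are omitted in this sketch.
  return Limits;
}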
145
146#define AMDGPU_DECLARE_WAIT_EVENTS(DECL) \
147 DECL(VMEM_ACCESS) /* vmem read & write (pre-gfx10), vmem read (gfx10+) */ \
148 DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */ \
149 DECL(VMEM_BVH_READ_ACCESS) /* vmem BVH read (gfx12+ only) */ \
150 DECL(VMEM_WRITE_ACCESS) /* vmem write that is not scratch */ \
151 DECL(SCRATCH_WRITE_ACCESS) /* vmem write that may be scratch */ \
152 DECL(VMEM_GROUP) /* vmem group */ \
153 DECL(LDS_ACCESS) /* lds read & write */ \
154 DECL(GDS_ACCESS) /* gds read & write */ \
155 DECL(SQ_MESSAGE) /* send message */ \
156 DECL(SCC_WRITE) /* write to SCC from barrier */ \
157 DECL(SMEM_ACCESS) /* scalar-memory read & write */ \
158 DECL(SMEM_GROUP) /* scalar-memory group */ \
159 DECL(EXP_GPR_LOCK) /* export holding on its data src */ \
160 DECL(GDS_GPR_LOCK) /* GDS holding on its data and addr src */ \
161 DECL(EXP_POS_ACCESS) /* write to export position */ \
162 DECL(EXP_PARAM_ACCESS) /* write to export parameter */ \
163 DECL(VMW_GPR_LOCK) /* vmem write holding on its data src */ \
164 DECL(EXP_LDS_ACCESS) /* read by ldsdir counting as export */
165
166// clang-format off
167#define AMDGPU_EVENT_ENUM(Name) Name,
168enum WaitEventType {
169 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
170 NUM_WAIT_EVENTS
171};
172#undef AMDGPU_EVENT_ENUM
173
174#define AMDGPU_EVENT_NAME(Name) #Name,
175static constexpr StringLiteral WaitEventTypeName[] = {
176 AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
177};
178#undef AMDGPU_EVENT_NAME
179// clang-format on
180
181// Enumerate different types of result-returning VMEM operations. Although
182// s_waitcnt orders them all with a single vmcnt counter, in the absence of
183// s_waitcnt only instructions of the same VmemType are guaranteed to write
184// their results in order -- so there is no need to insert an s_waitcnt between
185// two instructions of the same type that write the same vgpr.
186enum VmemType {
187 // BUF instructions and MIMG instructions without a sampler.
188 VMEM_NOSAMPLER,
189 // MIMG instructions with a sampler.
190 VMEM_SAMPLER,
191 // BVH instructions
192 VMEM_BVH,
193 NUM_VMEM_TYPES
194};
195
196// Maps values of InstCounterType to the instruction that waits on that
197// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
198// returns true.
199static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
200 AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
201 AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
202 AMDGPU::S_WAIT_KMCNT, AMDGPU::S_WAIT_XCNT};
203
204static bool updateVMCntOnly(const MachineInstr &Inst) {
205 return (SIInstrInfo::isVMEM(Inst) && !SIInstrInfo::isFLAT(Inst)) ||
206 SIInstrInfo::isFLATGlobal(Inst) || SIInstrInfo::isFLATScratch(Inst);
207}
208
209#ifndef NDEBUG
210static bool isNormalMode(InstCounterType MaxCounter) {
211 return MaxCounter == NUM_NORMAL_INST_CNTS;
212}
213#endif // NDEBUG
214
215VmemType getVmemType(const MachineInstr &Inst) {
216 assert(updateVMCntOnly(Inst));
217 if (!SIInstrInfo::isImage(Inst))
218 return VMEM_NOSAMPLER;
219 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
220 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
221 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
222
223 if (BaseInfo->BVH)
224 return VMEM_BVH;
225
226 // We have to make an additional check for isVSAMPLE here since some
227 // instructions don't have a sampler, but are still classified as sampler
228 // instructions for the purposes of e.g. waitcnt.
229 if (BaseInfo->Sampler || BaseInfo->MSAA || SIInstrInfo::isVSAMPLE(Inst))
230 return VMEM_SAMPLER;
231
232 return VMEM_NOSAMPLER;
233}
234
235unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
236 switch (T) {
237 case LOAD_CNT:
238 return Wait.LoadCnt;
239 case EXP_CNT:
240 return Wait.ExpCnt;
241 case DS_CNT:
242 return Wait.DsCnt;
243 case STORE_CNT:
244 return Wait.StoreCnt;
245 case SAMPLE_CNT:
246 return Wait.SampleCnt;
247 case BVH_CNT:
248 return Wait.BvhCnt;
249 case KM_CNT:
250 return Wait.KmCnt;
251 case X_CNT:
252 return Wait.XCnt;
253 default:
254 llvm_unreachable("bad InstCounterType");
255 }
256}
257
258void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
259 unsigned &WC = getCounterRef(Wait, T);
260 WC = std::min(WC, Count);
261}
262
263void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
264 getCounterRef(Wait, T) = ~0u;
265}
266
267unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
268 return getCounterRef(Wait, T);
269}
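// Worked example (exposition only): a default AMDGPU::Waitcnt holds ~0u in
// every counter, meaning "no wait requested", and addWait() keeps the
// strictest (smallest) requirement:
//
//   AMDGPU::Waitcnt W;        // all counters at ~0u
//   addWait(W, LOAD_CNT, 3);  // W.LoadCnt == 3
//   addWait(W, LOAD_CNT, 1);  // W.LoadCnt == 1, the tighter count wins
//   setNoWait(W, LOAD_CNT);   // back to ~0u, i.e. no wait on loadcnt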
270
271// Mapping from event to counter according to the table masks.
272InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
273 for (auto T : inst_counter_types()) {
274 if (masks[T] & (1 << E))
275 return T;
276 }
277 llvm_unreachable("event type has no associated counter");
278}
279
280class WaitcntBrackets;
281
282// This abstracts the logic for generating and updating S_WAIT* instructions
283// away from the analysis that determines where they are needed. This was
284// done because the set of counters and instructions for waiting on them
285// underwent a major shift with gfx12, sufficiently so that having this
286// abstraction allows the main analysis logic to be simpler than it would
287// otherwise have had to become.
288class WaitcntGenerator {
289protected:
290 const GCNSubtarget *ST = nullptr;
291 const SIInstrInfo *TII = nullptr;
292 AMDGPU::IsaVersion IV;
293 InstCounterType MaxCounter;
294 bool OptNone;
295
296public:
297 WaitcntGenerator() = default;
298 WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
299 : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
300 IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter),
301 OptNone(MF.getFunction().hasOptNone() ||
302 MF.getTarget().getOptLevel() == CodeGenOptLevel::None) {}
303
304 // Return true if the current function should be compiled with no
305 // optimization.
306 bool isOptNone() const { return OptNone; }
307
308 // Edits an existing sequence of wait count instructions according
309 // to an incoming Waitcnt value, which is itself updated to reflect
310 // any new wait count instructions which may need to be generated by
311 // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
312 // were made.
313 //
314 // This editing will usually merely update operands, but it may also
315 // delete instructions if the incoming Wait value indicates they are not
316 // needed. It may also remove existing instructions for which a wait
317 // is needed if it can be determined that it is better to generate new
318 // instructions later, as can happen on gfx12.
319 virtual bool
320 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
321 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
322 MachineBasicBlock::instr_iterator It) const = 0;
323
324 // Transform a soft waitcnt into a normal one.
325 bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
326
327 // Generates new wait count instructions according to the value of
328 // Wait, returning true if any new instructions were created.
329 virtual bool createNewWaitcnt(MachineBasicBlock &Block,
330 MachineBasicBlock::instr_iterator It,
331 AMDGPU::Waitcnt Wait) = 0;
332
333 // Returns an array of bit masks which can be used to map values in
334 // WaitEventType to corresponding counter values in InstCounterType.
335 virtual const unsigned *getWaitEventMask() const = 0;
336
337 // Returns a new waitcnt with all counters except VScnt set to 0. If
338 // IncludeVSCnt is true, VScnt is set to 0, otherwise it is set to ~0u.
339 virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;
340
341 virtual ~WaitcntGenerator() = default;
342
343 // Create a mask value from the initializer list of wait event types.
344 static constexpr unsigned
345 eventMask(std::initializer_list<WaitEventType> Events) {
346 unsigned Mask = 0;
347 for (auto &E : Events)
348 Mask |= 1 << E;
349
350 return Mask;
351 }
352};
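// Exposition-only check: eventMask() simply ORs together one bit per listed
// event, so a two-event mask has exactly those two bits set.
static_assert(WaitcntGenerator::eventMask({SMEM_ACCESS, LDS_ACCESS}) ==
                  ((1u << SMEM_ACCESS) | (1u << LDS_ACCESS)),
              "eventMask sets exactly one bit per listed event");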
353
354class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
355public:
356 using WaitcntGenerator::WaitcntGenerator;
357
358 bool
359 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
360 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
361 MachineBasicBlock::instr_iterator It) const override;
362
363 bool createNewWaitcnt(MachineBasicBlock &Block,
364 MachineBasicBlock::instr_iterator It,
365 AMDGPU::Waitcnt Wait) override;
366
367 const unsigned *getWaitEventMask() const override {
368 assert(ST);
369
370 static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
371 eventMask(
372 {VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS}),
373 eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
374 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
375 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
376 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
377 0,
378 0,
379 0,
380 0};
381
382 return WaitEventMaskForInstPreGFX12;
383 }
384
385 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
386};
387
388class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
389public:
390 using WaitcntGenerator::WaitcntGenerator;
391
392 bool
393 applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
394 MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
395 MachineBasicBlock::instr_iterator It) const override;
396
397 bool createNewWaitcnt(MachineBasicBlock &Block,
398 MachineBasicBlock::instr_iterator It,
399 AMDGPU::Waitcnt Wait) override;
400
401 const unsigned *getWaitEventMask() const override {
402 assert(ST);
403
404 static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
405 eventMask({VMEM_ACCESS}),
406 eventMask({LDS_ACCESS, GDS_ACCESS}),
407 eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
408 EXP_POS_ACCESS, EXP_LDS_ACCESS}),
409 eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
410 eventMask({VMEM_SAMPLER_READ_ACCESS}),
411 eventMask({VMEM_BVH_READ_ACCESS}),
412 eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
413 eventMask({VMEM_GROUP, SMEM_GROUP})};
414
415 return WaitEventMaskForInstGFX12Plus;
416 }
417
418 AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
419};
420
421class SIInsertWaitcnts {
422public:
423 const GCNSubtarget *ST;
424 const SIInstrInfo *TII = nullptr;
425 const SIRegisterInfo *TRI = nullptr;
426 const MachineRegisterInfo *MRI = nullptr;
427 InstCounterType SmemAccessCounter;
428 InstCounterType MaxCounter;
429 const unsigned *WaitEventMaskForInst;
430
431private:
432 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
433 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
434 MachineLoopInfo *MLI;
435 MachinePostDominatorTree *PDT;
436 AliasAnalysis *AA = nullptr;
437
438 struct BlockInfo {
439 std::unique_ptr<WaitcntBrackets> Incoming;
440 bool Dirty = true;
441 };
442
443 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
444
445 bool ForceEmitWaitcnt[NUM_INST_CNTS];
446
447 // In any given run of this pass, WCG will point to one of these two
448 // generator objects, which must have been re-initialised before use
449 // from a value made using a subtarget constructor.
450 WaitcntGeneratorPreGFX12 WCGPreGFX12;
451 WaitcntGeneratorGFX12Plus WCGGFX12Plus;
452
453 WaitcntGenerator *WCG = nullptr;
454
455 // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
456 // message.
457 DenseSet<MachineInstr *> ReleaseVGPRInsts;
458
459 HardwareLimits Limits;
460
461public:
462 SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
463 AliasAnalysis *AA)
464 : MLI(MLI), PDT(PDT), AA(AA) {
465 (void)ForceExpCounter;
466 (void)ForceLgkmCounter;
467 (void)ForceVMCounter;
468 }
469
470 unsigned getWaitCountMax(InstCounterType T) const {
471 switch (T) {
472 case LOAD_CNT:
473 return Limits.LoadcntMax;
474 case DS_CNT:
475 return Limits.DscntMax;
476 case EXP_CNT:
477 return Limits.ExpcntMax;
478 case STORE_CNT:
479 return Limits.StorecntMax;
480 case SAMPLE_CNT:
481 return Limits.SamplecntMax;
482 case BVH_CNT:
483 return Limits.BvhcntMax;
484 case KM_CNT:
485 return Limits.KmcntMax;
486 case X_CNT:
487 return Limits.XcntMax;
488 default:
489 break;
490 }
491 return 0;
492 }
493
494 bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
495 bool isPreheaderToFlush(MachineBasicBlock &MBB,
496 const WaitcntBrackets &ScoreBrackets);
497 bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
498 bool run(MachineFunction &MF);
499
500 void setForceEmitWaitcnt() {
501// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
502// For debug builds, get the debug counter info and adjust if need be
503#ifndef NDEBUG
504 if (DebugCounter::isCounterSet(ForceExpCounter) &&
505 DebugCounter::shouldExecute(ForceExpCounter)) {
506 ForceEmitWaitcnt[EXP_CNT] = true;
507 } else {
508 ForceEmitWaitcnt[EXP_CNT] = false;
509 }
510
511 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
512 DebugCounter::shouldExecute(ForceLgkmCounter)) {
513 ForceEmitWaitcnt[DS_CNT] = true;
514 ForceEmitWaitcnt[KM_CNT] = true;
515 } else {
516 ForceEmitWaitcnt[DS_CNT] = false;
517 ForceEmitWaitcnt[KM_CNT] = false;
518 }
519
520 if (DebugCounter::isCounterSet(ForceVMCounter) &&
521 DebugCounter::shouldExecute(ForceVMCounter)) {
522 ForceEmitWaitcnt[LOAD_CNT] = true;
523 ForceEmitWaitcnt[SAMPLE_CNT] = true;
524 ForceEmitWaitcnt[BVH_CNT] = true;
525 } else {
526 ForceEmitWaitcnt[LOAD_CNT] = false;
527 ForceEmitWaitcnt[SAMPLE_CNT] = false;
528 ForceEmitWaitcnt[BVH_CNT] = false;
529 }
530#endif // NDEBUG
531 }
532
533 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
534 // instruction.
535 WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
536 switch (Inst.getOpcode()) {
537 // FIXME: GLOBAL_INV needs to be tracked with xcnt too.
538 case AMDGPU::GLOBAL_INV:
539 return VMEM_ACCESS; // tracked using loadcnt
540 case AMDGPU::GLOBAL_WB:
541 case AMDGPU::GLOBAL_WBINV:
542 return VMEM_WRITE_ACCESS; // tracked using storecnt
543 default:
544 break;
545 }
546
547 // Maps VMEM access types to their corresponding WaitEventType.
548 static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
549 VMEM_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
550
552 // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
553 // these should use VM_CNT.
554 if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
555 return VMEM_ACCESS;
556 if (Inst.mayStore() &&
557 (!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
558 if (TII->mayAccessScratch(Inst))
559 return SCRATCH_WRITE_ACCESS;
560 return VMEM_WRITE_ACCESS;
561 }
562 if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
563 return VMEM_ACCESS;
564 return VmemReadMapping[getVmemType(Inst)];
565 }
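  // Illustrative outcomes of the mapping above, assuming a gfx12 subtarget
  // with extended wait counts and VSCNT available:
  //   image sample load          -> VMEM_SAMPLER_READ_ACCESS (samplecnt)
  //   BVH intersect-ray          -> VMEM_BVH_READ_ACCESS     (bvhcnt)
  //   plain buffer/global load   -> VMEM_ACCESS              (loadcnt)
  //   global store               -> VMEM_WRITE_ACCESS        (storecnt)
  //   store that may hit scratch -> SCRATCH_WRITE_ACCESS     (storecnt)
  //   FLAT load                  -> VMEM_ACCESS              (loadcnt)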
566
567 bool isVmemAccess(const MachineInstr &MI) const;
568 bool generateWaitcntInstBefore(MachineInstr &MI,
569 WaitcntBrackets &ScoreBrackets,
570 MachineInstr *OldWaitcntInstr,
571 bool FlushVmCnt);
572 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
573 MachineBasicBlock::instr_iterator It,
574 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
575 MachineInstr *OldWaitcntInstr);
576 void updateEventWaitcntAfter(MachineInstr &Inst,
577 WaitcntBrackets *ScoreBrackets);
578 bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
579 MachineBasicBlock *Block) const;
580 bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
581 WaitcntBrackets &ScoreBrackets);
582 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
583 WaitcntBrackets &ScoreBrackets);
584};
585
586// This object maintains the current score brackets of each wait counter, and
587// a per-register scoreboard for each wait counter.
588//
589// We also maintain the latest score for every event type that can change the
590// waitcnt in order to know if there are multiple types of events within
591// the brackets. When multiple event types are pending in a bracket, the
592// wait count may get decremented out of order, so we need to put in an
593// "s_waitcnt 0" before use.
594class WaitcntBrackets {
595public:
596 WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
597 assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
598 }
599
600#ifndef NDEBUG
601 ~WaitcntBrackets() {
602 unsigned NumUnusedVmem = 0, NumUnusedSGPRs = 0;
603 for (auto &[ID, Val] : VMem) {
604 if (Val.empty())
605 ++NumUnusedVmem;
606 }
607 for (auto &[ID, Val] : SGPRs) {
608 if (Val.empty())
609 ++NumUnusedSGPRs;
610 }
611
612 if (NumUnusedVmem || NumUnusedSGPRs) {
613 errs() << "WaitcntBracket had unused entries at destruction time: "
614 << NumUnusedVmem << " VMem and " << NumUnusedSGPRs
615 << " SGPR unused entries\n";
616 std::abort();
617 }
618 }
619#endif
620
621 bool isSmemCounter(InstCounterType T) const {
622 return T == Context->SmemAccessCounter || T == X_CNT;
623 }
624
625 unsigned getSgprScoresIdx(InstCounterType T) const {
626 assert(isSmemCounter(T) && "Invalid SMEM counter");
627 return T == X_CNT ? 1 : 0;
628 }
629
630 unsigned getScoreLB(InstCounterType T) const {
631 assert(T < NUM_INST_CNTS);
632 return ScoreLBs[T];
633 }
634
635 unsigned getScoreUB(InstCounterType T) const {
636 assert(T < NUM_INST_CNTS);
637 return ScoreUBs[T];
638 }
639
640 unsigned getScoreRange(InstCounterType T) const {
641 return getScoreUB(T) - getScoreLB(T);
642 }
643
644 unsigned getSGPRScore(MCRegUnit RU, InstCounterType T) const {
645 auto It = SGPRs.find(RU);
646 return It != SGPRs.end() ? It->second.Scores[getSgprScoresIdx(T)] : 0;
647 }
648
649 unsigned getVMemScore(VMEMID TID, InstCounterType T) const {
650 auto It = VMem.find(TID);
651 return It != VMem.end() ? It->second.Scores[T] : 0;
652 }
653
654 bool merge(const WaitcntBrackets &Other);
655
656 bool counterOutOfOrder(InstCounterType T) const;
657 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait);
658 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
659 bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait);
660 bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait);
661 void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait);
662
663 void determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
664 AMDGPU::Waitcnt &Wait) const;
665 void determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
666 AMDGPU::Waitcnt &Wait) const;
667 void tryClearSCCWriteEvent(MachineInstr *Inst);
668
669 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
670 void applyWaitcnt(InstCounterType T, unsigned Count);
671 void updateByEvent(WaitEventType E, MachineInstr &MI);
672
673 unsigned hasPendingEvent() const { return PendingEvents; }
674 unsigned hasPendingEvent(WaitEventType E) const {
675 return PendingEvents & (1 << E);
676 }
677 unsigned hasPendingEvent(InstCounterType T) const {
678 unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
679 assert((HasPending != 0) == (getScoreRange(T) != 0));
680 return HasPending;
681 }
682
683 bool hasMixedPendingEvents(InstCounterType T) const {
684 unsigned Events = hasPendingEvent(T);
685 // Return true if more than one bit is set in Events.
686 return Events & (Events - 1);
687 }
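  // Worked example (exposition only): if Events == 0b0101, two different
  // event kinds are pending and Events & (Events - 1) == 0b0100 != 0, so the
  // events are mixed; for Events == 0b0100 the expression is 0 (single kind).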
688
689 bool hasPendingFlat() const {
690 return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
691 LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
692 (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
693 LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
694 }
695
696 void setPendingFlat() {
697 LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
698 LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
699 }
700
701 bool hasPendingGDS() const {
702 return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
703 }
704
705 unsigned getPendingGDSWait() const {
706 return std::min(getScoreUB(DS_CNT) - LastGDS,
707 Context->getWaitCountMax(DS_CNT) - 1);
708 }
709
710 void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }
711
712 // Return true if there might be pending writes to the vgpr-interval by VMEM
713 // instructions with types different from V.
714 bool hasOtherPendingVmemTypes(MCPhysReg Reg, VmemType V) const {
715 for (MCRegUnit RU : regunits(Reg)) {
716 auto It = VMem.find(toVMEMID(RU));
717 if (It != VMem.end() && (It->second.VMEMTypes & ~(1 << V)))
718 return true;
719 }
720 return false;
721 }
722
723 void clearVgprVmemTypes(MCPhysReg Reg) {
724 for (MCRegUnit RU : regunits(Reg)) {
725 if (auto It = VMem.find(toVMEMID(RU)); It != VMem.end()) {
726 It->second.VMEMTypes = 0;
727 if (It->second.empty())
728 VMem.erase(It);
729 }
730 }
731 }
732
733 void setStateOnFunctionEntryOrReturn() {
734 setScoreUB(STORE_CNT,
735 getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
736 PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
737 }
738
739 ArrayRef<const MachineInstr *> getLDSDMAStores() const {
740 return LDSDMAStores;
741 }
742
743 bool hasPointSampleAccel(const MachineInstr &MI) const;
744 bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
745 MCPhysReg RU) const;
746
747 void print(raw_ostream &) const;
748 void dump() const { print(dbgs()); }
749
750 // Free up memory by removing empty entries from the DenseMaps that track
751 // event scores.
752 void purgeEmptyTrackingData();
753
754private:
755 struct MergeInfo {
756 unsigned OldLB;
757 unsigned OtherLB;
758 unsigned MyShift;
759 unsigned OtherShift;
760 };
761
762 void determineWaitForScore(InstCounterType T, unsigned Score,
763 AMDGPU::Waitcnt &Wait) const;
764
765 static bool mergeScore(const MergeInfo &M, unsigned &Score,
766 unsigned OtherScore);
767
768 iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
769 assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
770 if (!Context->TRI->isInAllocatableClass(Reg))
771 return {{}, {}};
772 const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
773 unsigned Size = Context->TRI->getRegSizeInBits(*RC);
774 if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
775 Reg = Context->TRI->get32BitRegister(Reg);
776 return Context->TRI->regunits(Reg);
777 }
778
779 void setScoreLB(InstCounterType T, unsigned Val) {
780 assert(T < NUM_INST_CNTS);
781 ScoreLBs[T] = Val;
782 }
783
784 void setScoreUB(InstCounterType T, unsigned Val) {
785 assert(T < NUM_INST_CNTS);
786 ScoreUBs[T] = Val;
787
788 if (T != EXP_CNT)
789 return;
790
791 if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
792 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
793 }
794
795 void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
796 const SIRegisterInfo *TRI = Context->TRI;
797 if (Reg == AMDGPU::SCC) {
798 SCCScore = Val;
799 } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
800 for (MCRegUnit RU : regunits(Reg))
801 VMem[toVMEMID(RU)].Scores[T] = Val;
802 } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
803 auto STy = getSgprScoresIdx(T);
804 for (MCRegUnit RU : regunits(Reg))
805 SGPRs[RU].Scores[STy] = Val;
806 } else {
807 llvm_unreachable("Register cannot be tracked/unknown register!");
808 }
809 }
810
811 void setVMemScore(VMEMID TID, InstCounterType T, unsigned Val) {
812 VMem[TID].Scores[T] = Val;
813 }
814
815 void setScoreByOperand(const MachineOperand &Op, InstCounterType CntTy,
816 unsigned Val);
817
818 const SIInsertWaitcnts *Context;
819
820 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
821 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
822 unsigned PendingEvents = 0;
823 // Remember the last flat memory operation.
824 unsigned LastFlat[NUM_INST_CNTS] = {0};
825 // Remember the last GDS operation.
826 unsigned LastGDS = 0;
827
828 // The score tracking logic is fragmented as follows:
829 // - VMem: VGPR RegUnits and LDS DMA IDs, see the VMEMID encoding.
830 // - SGPRs: SGPR RegUnits
831 // - SCC: Non-allocatable and not general purpose: not a SGPR.
832 //
833 // For the VMem case, if the key is within the range of LDS DMA IDs,
834 // then the corresponding index into the `LDSDMAStores` vector below is:
835 // Key - LDSDMA_BEGIN - 1
836 // This is because LDSDMA_BEGIN is a generic entry and does not have an
837 // associated MachineInstr.
838 //
839 // TODO: Could we track SCC alongside SGPRs so it's no longer a special case?
840
841 struct VMEMInfo {
842 // Scores for all instruction counters.
843 std::array<unsigned, NUM_INST_CNTS> Scores = {0};
844 // Bitmask of the VmemTypes of VMEM instructions for this VGPR.
845 unsigned VMEMTypes = 0;
846
847 bool empty() const {
848 return all_of(Scores, [](unsigned K) { return K == 0; }) && !VMEMTypes;
849 }
850 };
851
852 struct SGPRInfo {
853 // Wait cnt scores for every SGPR; only DS_CNT (corresponding to LGKMcnt
854 // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant.
855 // Row 0 holds the score for either DS_CNT or KM_CNT and row 1 keeps
856 // the X_CNT score.
857 std::array<unsigned, 2> Scores = {0};
858
859 bool empty() const { return !Scores[0] && !Scores[1]; }
860 };
861
862 DenseMap<VMEMID, VMEMInfo> VMem; // VGPR + LDS DMA
863 DenseMap<MCRegUnit, SGPRInfo> SGPRs;
864
865 // Reg score for SCC.
866 unsigned SCCScore = 0;
867 // The unique instruction that has an SCC write pending, if there is one.
868 const MachineInstr *PendingSCCWrite = nullptr;
869
870 // Store representative LDS DMA operations. The only useful info here is
871 // alias info. One store is kept per unique AAInfo.
872 SmallVector<const MachineInstr *> LDSDMAStores;
873};
874
875class SIInsertWaitcntsLegacy : public MachineFunctionPass {
876public:
877 static char ID;
878 SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}
879
880 bool runOnMachineFunction(MachineFunction &MF) override;
881
882 StringRef getPassName() const override {
883 return "SI insert wait instructions";
884 }
885
886 void getAnalysisUsage(AnalysisUsage &AU) const override {
887 AU.setPreservesCFG();
888 AU.addRequired<MachineLoopInfoWrapperPass>();
889 AU.addRequired<MachinePostDominatorTreeWrapperPass>();
890 AU.addUsedIfAvailable<AAResultsWrapperPass>();
891 AU.addPreserved<AAResultsWrapperPass>();
892 MachineFunctionPass::getAnalysisUsage(AU);
893 }
894};
895
896} // end anonymous namespace
897
898void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
899 InstCounterType CntTy, unsigned Score) {
900 setRegScore(Op.getReg().asMCReg(), CntTy, Score);
901}
902
903// Return true if the subtarget is one that enables Point Sample Acceleration
904// and the MachineInstr passed in is one to which it might be applied (the
905// hardware makes this decision based on several factors, but we can't determine
906// this at compile time, so we have to assume it might be applied if the
907// instruction supports it).
908bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
909 if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
910 return false;
911
912 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
913 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
914 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
915 return BaseInfo->PointSampleAccel;
916}
917
918// Return true if the subtarget enables Point Sample Acceleration, the supplied
919// MachineInstr is one to which it might be applied, and the supplied register
920// has outstanding writes of vmem-types different from VMEM_NOSAMPLER
921// (this is the type that a point sample accelerated instruction effectively
922// becomes)
923bool WaitcntBrackets::hasPointSamplePendingVmemTypes(const MachineInstr &MI,
924 MCPhysReg Reg) const {
925 if (!hasPointSampleAccel(MI))
926 return false;
927
928 return hasOtherPendingVmemTypes(Reg, VMEM_NOSAMPLER);
929}
930
931void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
932 InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);
933 assert(T < Context->MaxCounter);
934
935 unsigned UB = getScoreUB(T);
936 unsigned CurrScore = UB + 1;
937 if (CurrScore == 0)
938 report_fatal_error("InsertWaitcnt score wraparound");
939 // PendingEvents and ScoreUB need to be updated regardless of whether this
940 // event changes the score of a register or not.
941 // Examples include vm_cnt for a buffer store or lgkm_cnt for a send-message.
942 PendingEvents |= 1 << E;
943 setScoreUB(T, CurrScore);
944
945 const SIRegisterInfo *TRI = Context->TRI;
946 const MachineRegisterInfo *MRI = Context->MRI;
947 const SIInstrInfo *TII = Context->TII;
948
949 if (T == EXP_CNT) {
950 // Put score on the source vgprs. If this is a store, just use those
951 // specific register(s).
952 if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
953 // All GDS operations must protect their address register (same as
954 // export.)
955 if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
956 setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
957
958 if (Inst.mayStore()) {
959 if (const auto *Data0 =
960 TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
961 setScoreByOperand(*Data0, EXP_CNT, CurrScore);
962 if (const auto *Data1 =
963 TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
964 setScoreByOperand(*Data1, EXP_CNT, CurrScore);
965 } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
966 Inst.getOpcode() != AMDGPU::DS_APPEND &&
967 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
968 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
969 for (const MachineOperand &Op : Inst.all_uses()) {
970 if (TRI->isVectorRegister(*MRI, Op.getReg()))
971 setScoreByOperand(Op, EXP_CNT, CurrScore);
972 }
973 }
974 } else if (TII->isFLAT(Inst)) {
975 if (Inst.mayStore()) {
976 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
977 EXP_CNT, CurrScore);
978 } else if (SIInstrInfo::isAtomicRet(Inst)) {
979 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
980 EXP_CNT, CurrScore);
981 }
982 } else if (TII->isMIMG(Inst)) {
983 if (Inst.mayStore()) {
984 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
985 } else if (SIInstrInfo::isAtomicRet(Inst)) {
986 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
987 EXP_CNT, CurrScore);
988 }
989 } else if (TII->isMTBUF(Inst)) {
990 if (Inst.mayStore())
991 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
992 } else if (TII->isMUBUF(Inst)) {
993 if (Inst.mayStore()) {
994 setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
995 } else if (SIInstrInfo::isAtomicRet(Inst)) {
996 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
997 EXP_CNT, CurrScore);
998 }
999 } else if (TII->isLDSDIR(Inst)) {
1000 // LDSDIR instructions attach the score to the destination.
1001 setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
1002 EXP_CNT, CurrScore);
1003 } else {
1004 if (TII->isEXP(Inst)) {
1005 // For export the destination registers are really temps that
1006 // can be used as the actual source after export patching, so
1007 // we need to treat them like sources and set the EXP_CNT
1008 // score.
1009 for (MachineOperand &DefMO : Inst.all_defs()) {
1010 if (TRI->isVGPR(*MRI, DefMO.getReg())) {
1011 setScoreByOperand(DefMO, EXP_CNT, CurrScore);
1012 }
1013 }
1014 }
1015 for (const MachineOperand &Op : Inst.all_uses()) {
1016 if (TRI->isVectorRegister(*MRI, Op.getReg()))
1017 setScoreByOperand(Op, EXP_CNT, CurrScore);
1018 }
1019 }
1020 } else if (T == X_CNT) {
1021 WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
1022 if (PendingEvents & (1 << OtherEvent)) {
1023 // Hardware inserts an implicit xcnt between interleaved
1024 // SMEM and VMEM operations. So there will never be
1025 // outstanding address translations for both SMEM and
1026 // VMEM at the same time.
1027 setScoreLB(T, getScoreUB(T) - 1);
1028 PendingEvents &= ~(1 << OtherEvent);
1029 }
1030 for (const MachineOperand &Op : Inst.all_uses())
1031 setScoreByOperand(Op, T, CurrScore);
1032 } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
1033 // Match the score to the destination registers.
1034 //
1035 // Check only explicit operands. Stores, especially spill stores, include
1036 // implicit uses and defs of their super registers which would create an
1037 // artificial dependency, while these are there only for register liveness
1038 // accounting purposes.
1039 //
1040 // Special cases where implicit register defs exists, such as M0 or VCC,
1041 // but none with memory instructions.
1042 for (const MachineOperand &Op : Inst.defs()) {
1043 if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
1044 if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
1045 continue;
1046 if (updateVMCntOnly(Inst)) {
1047 // updateVMCntOnly should only leave us with VGPRs:
1048 // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
1049 // defs. That's required for a sane update of the VMEM types tracked below.
1050 assert(TRI->isVectorRegister(*MRI, Op.getReg()));
1051 VmemType V = getVmemType(Inst);
1052 unsigned char TypesMask = 1 << V;
1053 // If instruction can have Point Sample Accel applied, we have to flag
1054 // this with another potential dependency
1055 if (hasPointSampleAccel(Inst))
1056 TypesMask |= 1 << VMEM_NOSAMPLER;
1057 for (MCRegUnit RU : regunits(Op.getReg().asMCReg()))
1058 VMem[toVMEMID(RU)].VMEMTypes |= TypesMask;
1059 }
1060 }
1061 setScoreByOperand(Op, T, CurrScore);
1062 }
1063 if (Inst.mayStore() &&
1064 (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
1065 // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
1066 // written can be accessed. A load from LDS to VMEM does not need a wait.
1067 //
1068 // The "Slot" is the offset from LDSDMA_BEGIN. If it's non-zero, then
1069 // there is a MachineInstr in LDSDMAStores used to track this LDSDMA
1070 // store. The "Slot" is the index into LDSDMAStores + 1.
1071 unsigned Slot = 0;
1072 for (const auto *MemOp : Inst.memoperands()) {
1073 if (!MemOp->isStore() ||
1074 MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
1075 continue;
1076 // Comparing just AA info does not guarantee memoperands are equal
1077 // in general, but this is so for LDS DMA in practice.
1078 auto AAI = MemOp->getAAInfo();
1079 // Alias scope information gives a way to definitively identify an
1080 // original memory object; in practice it is produced by the module LDS
1081 // lowering pass. If there is no scope available we will not be able
1082 // to disambiguate LDS aliasing, since after the module lowering all LDS
1083 // is squashed into a single big object.
1084 if (!AAI || !AAI.Scope)
1085 break;
1086 for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
1087 for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
1088 if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
1089 Slot = I + 1;
1090 break;
1091 }
1092 }
1093 }
1094 if (Slot)
1095 break;
1096 // The slot may not be valid because it can be >= NUM_LDSDMA which
1097 // means the scoreboard cannot track it. We still want to preserve the
1098 // MI in order to check alias information, though.
1099 LDSDMAStores.push_back(&Inst);
1100 Slot = LDSDMAStores.size();
1101 break;
1102 }
1103 setVMemScore(LDSDMA_BEGIN, T, CurrScore);
1104 if (Slot && Slot < NUM_LDSDMA)
1105 setVMemScore(LDSDMA_BEGIN + Slot, T, CurrScore);
1106 }
1107
1108 if (E == SCC_WRITE) {
1109 setRegScore(AMDGPU::SCC, T, CurrScore);
1110 PendingSCCWrite = &Inst;
1111 }
1112 }
1113}
1114
1115void WaitcntBrackets::print(raw_ostream &OS) const {
1116 const GCNSubtarget *ST = Context->ST;
1117
1118 OS << '\n';
1119 for (auto T : inst_counter_types(Context->MaxCounter)) {
1120 unsigned SR = getScoreRange(T);
1121
1122 switch (T) {
1123 case LOAD_CNT:
1124 OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
1125 << SR << "):";
1126 break;
1127 case DS_CNT:
1128 OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
1129 << SR << "):";
1130 break;
1131 case EXP_CNT:
1132 OS << " EXP_CNT(" << SR << "):";
1133 break;
1134 case STORE_CNT:
1135 OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
1136 << SR << "):";
1137 break;
1138 case SAMPLE_CNT:
1139 OS << " SAMPLE_CNT(" << SR << "):";
1140 break;
1141 case BVH_CNT:
1142 OS << " BVH_CNT(" << SR << "):";
1143 break;
1144 case KM_CNT:
1145 OS << " KM_CNT(" << SR << "):";
1146 break;
1147 case X_CNT:
1148 OS << " X_CNT(" << SR << "):";
1149 break;
1150 default:
1151 OS << " UNKNOWN(" << SR << "):";
1152 break;
1153 }
1154
1155 if (SR != 0) {
1156 // Print vgpr scores.
1157 unsigned LB = getScoreLB(T);
1158
1159 SmallVector<VMEMID> SortedVMEMIDs(VMem.keys());
1160 sort(SortedVMEMIDs);
1161
1162 for (auto ID : SortedVMEMIDs) {
1163 unsigned RegScore = VMem.at(ID).Scores[T];
1164 if (RegScore <= LB)
1165 continue;
1166 unsigned RelScore = RegScore - LB - 1;
1167 if (ID < REGUNITS_END) {
1168 OS << ' ' << RelScore << ":vRU" << ID;
1169 } else {
1170 assert(ID >= LDSDMA_BEGIN && ID < LDSDMA_END &&
1171 "Unhandled/unexpected ID value!");
1172 OS << ' ' << RelScore << ":LDSDMA" << ID;
1173 }
1174 }
1175
1176 // Also need to print sgpr scores for lgkm_cnt or xcnt.
1177 if (isSmemCounter(T)) {
1178 SmallVector<MCRegUnit> SortedSMEMIDs(SGPRs.keys());
1179 sort(SortedSMEMIDs);
1180 for (auto ID : SortedSMEMIDs) {
1181 unsigned RegScore = SGPRs.at(ID).Scores[getSgprScoresIdx(T)];
1182 if (RegScore <= LB)
1183 continue;
1184 unsigned RelScore = RegScore - LB - 1;
1185 OS << ' ' << RelScore << ":sRU" << static_cast<unsigned>(ID);
1186 }
1187 }
1188
1189 if (T == KM_CNT && SCCScore > 0)
1190 OS << ' ' << SCCScore << ":scc";
1191 }
1192 OS << '\n';
1193 }
1194
1195 OS << "Pending Events: ";
1196 if (hasPendingEvent()) {
1197 ListSeparator LS;
1198 for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
1199 if (hasPendingEvent((WaitEventType)I)) {
1200 OS << LS << WaitEventTypeName[I];
1201 }
1202 }
1203 } else {
1204 OS << "none";
1205 }
1206 OS << '\n';
1207
1208 OS << '\n';
1209}
1210
1211/// Simplify the waitcnt, in the sense of removing redundant counts: any count
1212/// that is already satisfied by the current brackets is reset to "no wait".
1213void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) {
1214 simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1215 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
1216 simplifyWaitcnt(DS_CNT, Wait.DsCnt);
1217 simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
1218 simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1219 simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
1220 simplifyWaitcnt(KM_CNT, Wait.KmCnt);
1221 simplifyXcnt(Wait, Wait);
1222}
1223
1224void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
1225 unsigned &Count) const {
1226 // The number of outstanding events for this type, T, can be calculated
1227 // as (UB - LB). If the current Count is greater than or equal to the number
1228 // of outstanding events, then the wait for this counter is redundant.
1229 if (Count >= getScoreRange(T))
1230 Count = ~0u;
1231}
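// Worked example (exposition only): with getScoreRange(LOAD_CNT) == 3, i.e.
// three loads still outstanding, a requested loadcnt(3) is already satisfied
// and is dropped (Count becomes ~0u), while loadcnt(2) is kept because it
// still forces at least one of the outstanding loads to retire.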
1232
1233void WaitcntBrackets::purgeEmptyTrackingData() {
1234 for (auto &[K, V] : make_early_inc_range(VMem)) {
1235 if (V.empty())
1236 VMem.erase(K);
1237 }
1238 for (auto &[K, V] : make_early_inc_range(SGPRs)) {
1239 if (V.empty())
1240 SGPRs.erase(K);
1241 }
1242}
1243
1244void WaitcntBrackets::determineWaitForScore(InstCounterType T,
1245 unsigned ScoreToWait,
1246 AMDGPU::Waitcnt &Wait) const {
1247 const unsigned LB = getScoreLB(T);
1248 const unsigned UB = getScoreUB(T);
1249
1250 // If the score falls within the bracket, we need a waitcnt.
1251 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
1252 if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
1253 !Context->ST->hasFlatLgkmVMemCountInOrder()) {
1254 // If there is a pending FLAT operation, and this is a VMem or LGKM
1255 // waitcnt and the target can report early completion, then we need
1256 // to force a waitcnt 0.
1257 addWait(Wait, T, 0);
1258 } else if (counterOutOfOrder(T)) {
1259 // The counter can get decremented out-of-order when there are
1260 // multiple event types in the bracket, so emit an s_wait with a
1261 // conservative value of 0 for this counter.
1262 addWait(Wait, T, 0);
1263 } else {
1264 // If a counter has been maxed out avoid overflow by waiting for
1265 // MAX(CounterType) - 1 instead.
1266 unsigned NeededWait =
1267 std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
1268 addWait(Wait, T, NeededWait);
1269 }
1270 }
1271}
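// Worked example (exposition only): with LB == 4 and UB == 10, a score of 7
// lies inside the bracket, so a wait is required; in the in-order case the
// emitted count is UB - 7 == 3, meaning "at most the 3 younger operations may
// still be outstanding", clamped to getWaitCountMax(T) - 1 to avoid overflow.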
1272
1273void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
1274 AMDGPU::Waitcnt &Wait) const {
1275 if (Reg == AMDGPU::SCC) {
1276 determineWaitForScore(T, SCCScore, Wait);
1277 } else {
1278 bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
1279 for (MCRegUnit RU : regunits(Reg))
1280 determineWaitForScore(
1281 T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
1282 Wait);
1283 }
1284}
1285
1286void WaitcntBrackets::determineWaitForLDSDMA(InstCounterType T, VMEMID TID,
1287 AMDGPU::Waitcnt &Wait) const {
1288 assert(TID >= LDSDMA_BEGIN && TID < LDSDMA_END);
1289 determineWaitForScore(T, getVMemScore(TID, T), Wait);
1290}
1291
1292void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
1293 // S_BARRIER_WAIT on the same barrier guarantees that the pending write to
1294 // SCC has landed
1295 if (PendingSCCWrite &&
1296 PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM &&
1297 PendingSCCWrite->getOperand(0).getImm() == Inst->getOperand(0).getImm()) {
1298 unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
1299 // If this SCC_WRITE is the only pending KM_CNT event, clear counter.
1300 if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
1301 SCC_WRITE_PendingEvent) {
1302 setScoreLB(KM_CNT, getScoreUB(KM_CNT));
1303 }
1304
1305 PendingEvents &= ~SCC_WRITE_PendingEvent;
1306 PendingSCCWrite = nullptr;
1307 }
1308}
1309
1310void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
1311 applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1312 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1313 applyWaitcnt(DS_CNT, Wait.DsCnt);
1314 applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1315 applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
1316 applyWaitcnt(BVH_CNT, Wait.BvhCnt);
1317 applyWaitcnt(KM_CNT, Wait.KmCnt);
1318 applyWaitcnt(X_CNT, Wait.XCnt);
1319}
1320
1321void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
1322 const unsigned UB = getScoreUB(T);
1323 if (Count >= UB)
1324 return;
1325 if (Count != 0) {
1326 if (counterOutOfOrder(T))
1327 return;
1328 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
1329 } else {
1330 setScoreLB(T, UB);
1331 PendingEvents &= ~Context->WaitEventMaskForInst[T];
1332 }
1333}
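// Worked example (exposition only): with LB == 2 and UB == 10, applying a
// wait of 1 raises LB to 9 (at most one operation still in flight); applying
// a wait of 0 sets LB == UB and clears the pending events for T; a count of
// UB or more cannot tighten the bracket and is ignored.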
1334
1335bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) {
1336 // Wait on XCNT is redundant if we are already waiting for a load to complete.
1337 // SMEM can return out of order, so only omit XCNT wait if we are waiting till
1338 // zero.
1339 return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP);
1340}
1341
1342bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) {
1343 // If we have a pending store we cannot optimize XCnt because we do not wait
1344 // for stores. VMEM loads return in order, so if we only have loads XCnt is
1345 // decremented to the same number as LOADCnt.
1346 return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1347 !hasPendingEvent(STORE_CNT);
1348}
1349
1350void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait,
1351 AMDGPU::Waitcnt &UpdateWait) {
1352 // Try to simplify xcnt further by checking for joint kmcnt and loadcnt
1353 // optimizations. On entry to a block with multiple predecessors, there may
1354 // be pending SMEM and VMEM events active at the same time.
1355 // In such cases, only clear one active event at a time.
1356 // TODO: Revisit xcnt optimizations for gfx1250.
1357 if (hasRedundantXCntWithKmCnt(CheckWait)) {
1358 if (!hasMixedPendingEvents(X_CNT)) {
1359 applyWaitcnt(X_CNT, 0);
1360 } else {
1361 PendingEvents &= ~(1 << SMEM_GROUP);
1362 }
1363 } else if (canOptimizeXCntWithLoadCnt(CheckWait)) {
1364 if (!hasMixedPendingEvents(X_CNT)) {
1365 applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt));
1366 } else if (CheckWait.LoadCnt == 0) {
1367 PendingEvents &= ~(1 << VMEM_GROUP);
1368 }
1369 }
1370 simplifyWaitcnt(X_CNT, UpdateWait.XCnt);
1371}
1372
1373// Where there are multiple types of event in the bracket of a counter,
1374// the decrement may go out of order.
1375bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
1376 // Scalar memory reads can always complete out of order.
1377 if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
1378 (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
1379 return true;
1380 return hasMixedPendingEvents(T);
1381}
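// Example (exposition only, matching the TODO in the file header): if both an
// SMEM load and an LDS access are pending on the lgkm/DS counter, the counter
// may tick down in either order, so counterOutOfOrder() returns true and the
// pass falls back to a conservative wait of 0 rather than a precise count.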
1382
1383INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1384 false, false)
1385INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
1386INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
1387INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts",
1388 false, false)
1389
1390char SIInsertWaitcntsLegacy::ID = 0;
1391
1392char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;
1393
1395 return new SIInsertWaitcntsLegacy();
1396}
1397
1398static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
1399 unsigned NewEnc) {
1400 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
1401 assert(OpIdx >= 0);
1402
1403 MachineOperand &MO = MI.getOperand(OpIdx);
1404
1405 if (NewEnc == MO.getImm())
1406 return false;
1407
1408 MO.setImm(NewEnc);
1409 return true;
1410}
1411
1412/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
1413/// and if so, which counter it is waiting on.
1414static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
1415 switch (Opcode) {
1416 case AMDGPU::S_WAIT_LOADCNT:
1417 return LOAD_CNT;
1418 case AMDGPU::S_WAIT_EXPCNT:
1419 return EXP_CNT;
1420 case AMDGPU::S_WAIT_STORECNT:
1421 return STORE_CNT;
1422 case AMDGPU::S_WAIT_SAMPLECNT:
1423 return SAMPLE_CNT;
1424 case AMDGPU::S_WAIT_BVHCNT:
1425 return BVH_CNT;
1426 case AMDGPU::S_WAIT_DSCNT:
1427 return DS_CNT;
1428 case AMDGPU::S_WAIT_KMCNT:
1429 return KM_CNT;
1430 case AMDGPU::S_WAIT_XCNT:
1431 return X_CNT;
1432 default:
1433 return {};
1434 }
1435}
1436
1437bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
1438 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
1439 if (Opcode == Waitcnt->getOpcode())
1440 return false;
1441
1442 Waitcnt->setDesc(TII->get(Opcode));
1443 return true;
1444}
1445
1446/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
1447/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
1448/// from \p Wait that were added by previous passes. Currently this pass
1449/// conservatively assumes that these preexisting waits are required for
1450/// correctness.
1451bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
1452 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1453 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1454 assert(ST);
1455 assert(isNormalMode(MaxCounter));
1456
1457 bool Modified = false;
1458 MachineInstr *WaitcntInstr = nullptr;
1459 MachineInstr *WaitcntVsCntInstr = nullptr;
1460
1461 LLVM_DEBUG({
1462 dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
1463 if (It == OldWaitcntInstr.getParent()->instr_end())
1464 dbgs() << "end of block\n";
1465 else
1466 dbgs() << *It;
1467 });
1468
1469 for (auto &II :
1470 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1471 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1472 if (II.isMetaInstruction()) {
1473 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1474 continue;
1475 }
1476
1477 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1478 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1479
1480 // Update required wait count. If this is a soft waitcnt (= it was added
1481 // by an earlier pass), it may be entirely removed.
1482 if (Opcode == AMDGPU::S_WAITCNT) {
1483 unsigned IEnc = II.getOperand(0).getImm();
1484 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
1485 if (TrySimplify)
1486 ScoreBrackets.simplifyWaitcnt(OldWait);
1487 Wait = Wait.combined(OldWait);
1488
1489 // Merge consecutive waitcnt of the same type by erasing multiples.
1490 if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
1491 II.eraseFromParent();
1492 Modified = true;
1493 } else
1494 WaitcntInstr = &II;
1495 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1496 assert(ST->hasVMemToLDSLoad());
1497 LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
1498 << "Before: " << Wait << '\n';);
1499 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, LDSDMA_BEGIN, Wait);
1500 LLVM_DEBUG(dbgs() << "After: " << Wait << '\n';);
1501
1502 // It is possible (but unlikely) that this is the only wait instruction,
1503 // in which case, we exit this loop without a WaitcntInstr to consume
1504 // `Wait`. But that works because `Wait` was passed in by reference, and
1505 // the callee eventually calls createNewWaitcnt on it. We test this
1506 // possibility in an artificial MIR test since such a situation cannot be
1507 // recreated by running the memory legalizer.
1508 II.eraseFromParent();
1509 } else {
1510 assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
1511 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1512
1513 unsigned OldVSCnt =
1514 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1515 if (TrySimplify)
1516 ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
1517 Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
1518
1519 if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
1520 II.eraseFromParent();
1521 Modified = true;
1522 } else
1523 WaitcntVsCntInstr = &II;
1524 }
1525 }
1526
1527 if (WaitcntInstr) {
1528 Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
1529 AMDGPU::encodeWaitcnt(IV, Wait));
1530 Modified |= promoteSoftWaitCnt(WaitcntInstr);
1531
1532 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1533 ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
1534 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1535 Wait.LoadCnt = ~0u;
1536 Wait.ExpCnt = ~0u;
1537 Wait.DsCnt = ~0u;
1538
1539 LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
1540 ? dbgs()
1541 << "applied pre-existing waitcnt\n"
1542 << "New Instr at block end: " << *WaitcntInstr << '\n'
1543 : dbgs() << "applied pre-existing waitcnt\n"
1544 << "Old Instr: " << *It
1545 << "New Instr: " << *WaitcntInstr << '\n');
1546 }
1547
1548 if (WaitcntVsCntInstr) {
1549 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
1550 AMDGPU::OpName::simm16, Wait.StoreCnt);
1551 Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
1552
1553 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1554 Wait.StoreCnt = ~0u;
1555
1556 LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
1557 ? dbgs() << "applied pre-existing waitcnt\n"
1558 << "New Instr at block end: " << *WaitcntVsCntInstr
1559 << '\n'
1560 : dbgs() << "applied pre-existing waitcnt\n"
1561 << "Old Instr: " << *It
1562 << "New Instr: " << *WaitcntVsCntInstr << '\n');
1563 }
1564
1565 return Modified;
1566}
1567
1568/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
1569/// required counters in \p Wait
1570bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
1571 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1572 AMDGPU::Waitcnt Wait) {
1573 assert(ST);
1574 assert(isNormalMode(MaxCounter));
1575
1576 bool Modified = false;
1577 const DebugLoc &DL = Block.findDebugLoc(It);
1578
1579 // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
1580 // single instruction while VScnt has its own instruction.
1581 if (Wait.hasWaitExceptStoreCnt()) {
1582 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1583 [[maybe_unused]] auto SWaitInst =
1584 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1585 Modified = true;
1586
1587 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1588 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1589 dbgs() << "New Instr: " << *SWaitInst << '\n');
1590 }
1591
1592 if (Wait.hasWaitStoreCnt()) {
1593 assert(ST->hasVscnt());
1594
1595 [[maybe_unused]] auto SWaitInst =
1596 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1597 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1598 .addImm(Wait.StoreCnt);
1599 Modified = true;
1600
1601 LLVM_DEBUG(dbgs() << "PreGFX12::createNewWaitcnt\n";
1602 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1603 dbgs() << "New Instr: " << *SWaitInst << '\n');
1604 }
1605
1606 return Modified;
1607}
1608
1609AMDGPU::Waitcnt
1610WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1611 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
1612}
1613
1614AMDGPU::Waitcnt
1615WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
1616 return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
1617 ~0u /* XCNT */);
1618}
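// Note: throughout this pass a per-counter value of ~0u means "no wait
// required" for that counter, while 0 means "wait until no operations of
// that kind are still outstanding".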
1619
1620/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
1621/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
1622/// were added by previous passes. Currently this pass conservatively
1623/// assumes that these preexisting waits are required for correctness.
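/// For illustration: combining keeps the strictest (smallest) count per
/// counter, so a pre-existing soft wait of loadcnt(2) merged with a newly
/// required loadcnt(1) ends up as a single wait of loadcnt(1).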
1624bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
1625 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
1626 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
1627 assert(ST);
1628 assert(!isNormalMode(MaxCounter));
1629
1630 bool Modified = false;
1631 MachineInstr *CombinedLoadDsCntInstr = nullptr;
1632 MachineInstr *CombinedStoreDsCntInstr = nullptr;
1633 MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
1634
1635 LLVM_DEBUG({
1636 dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
1637 if (It == OldWaitcntInstr.getParent()->instr_end())
1638 dbgs() << "end of block\n";
1639 else
1640 dbgs() << *It;
1641 });
1642
1643 for (auto &II :
1644 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
1645 LLVM_DEBUG(dbgs() << "pre-existing iter: " << II);
1646 if (II.isMetaInstruction()) {
1647 LLVM_DEBUG(dbgs() << "skipped meta instruction\n");
1648 continue;
1649 }
1650
1651 MachineInstr **UpdatableInstr;
1652
1653 // Update required wait count. If this is a soft waitcnt (= it was added
1654 // by an earlier pass), it may be entirely removed.
1655
1656 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
1657 bool TrySimplify = Opcode != II.getOpcode() && !OptNone;
1658
1659 // Don't crash if the programmer used legacy waitcnt intrinsics, but don't
1660 // attempt to do more than that either.
1661 if (Opcode == AMDGPU::S_WAITCNT)
1662 continue;
1663
1664 if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
1665 unsigned OldEnc =
1666 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1667 AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
1668 if (TrySimplify)
1669 ScoreBrackets.simplifyWaitcnt(OldWait);
1670 Wait = Wait.combined(OldWait);
1671 UpdatableInstr = &CombinedLoadDsCntInstr;
1672 } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
1673 unsigned OldEnc =
1674 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1675 AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
1676 if (TrySimplify)
1677 ScoreBrackets.simplifyWaitcnt(OldWait);
1678 Wait = Wait.combined(OldWait);
1679 UpdatableInstr = &CombinedStoreDsCntInstr;
1680 } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
1681 // Architectures higher than GFX10 do not have direct loads to
1682 // LDS, so no work required here yet.
1683 II.eraseFromParent();
1684 continue;
1685 } else {
1686 std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
1687 assert(CT.has_value());
1688 unsigned OldCnt =
1689 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
1690 if (TrySimplify)
1691 ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
1692 addWait(Wait, CT.value(), OldCnt);
1693 UpdatableInstr = &WaitInstrs[CT.value()];
1694 }
1695
1696 // Merge consecutive waitcnt of the same type by erasing multiples.
1697 if (!*UpdatableInstr) {
1698 *UpdatableInstr = &II;
1699 } else {
1700 II.eraseFromParent();
1701 Modified = true;
1702 }
1703 }
1704
1705 // Save the pre-combine wait counts so that the XCnt checks below can use them.
1706 AMDGPU::Waitcnt PreCombine = Wait;
1707 if (CombinedLoadDsCntInstr) {
1708 // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
1709 // to be waited for. Otherwise, let the instruction be deleted so
1710 // the appropriate single counter wait instruction can be inserted
1711 // instead, when new S_WAIT_*CNT instructions are inserted by
1712 // createNewWaitcnt(). As a side effect, resetting the wait counts will
1713 // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
1714 // the loop below that deals with single counter instructions.
1715 if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
1716 unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1717 Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
1718 AMDGPU::OpName::simm16, NewEnc);
1719 Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
1720 ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
1721 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1722 Wait.LoadCnt = ~0u;
1723 Wait.DsCnt = ~0u;
1724
1725 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1726 ? dbgs() << "applied pre-existing waitcnt\n"
1727 << "New Instr at block end: "
1728 << *CombinedLoadDsCntInstr << '\n'
1729 : dbgs() << "applied pre-existing waitcnt\n"
1730 << "Old Instr: " << *It << "New Instr: "
1731 << *CombinedLoadDsCntInstr << '\n');
1732 } else {
1733 CombinedLoadDsCntInstr->eraseFromParent();
1734 Modified = true;
1735 }
1736 }
1737
1738 if (CombinedStoreDsCntInstr) {
1739 // Similarly for S_WAIT_STORECNT_DSCNT.
1740 if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
1741 unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1742 Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
1743 AMDGPU::OpName::simm16, NewEnc);
1744 Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
1745 ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
1746 ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
1747 Wait.StoreCnt = ~0u;
1748 Wait.DsCnt = ~0u;
1749
1750 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1751 ? dbgs() << "applied pre-existing waitcnt\n"
1752 << "New Instr at block end: "
1753 << *CombinedStoreDsCntInstr << '\n'
1754 : dbgs() << "applied pre-existing waitcnt\n"
1755 << "Old Instr: " << *It << "New Instr: "
1756 << *CombinedStoreDsCntInstr << '\n');
1757 } else {
1758 CombinedStoreDsCntInstr->eraseFromParent();
1759 Modified = true;
1760 }
1761 }
1762
1763 // Look for an opportunity to convert existing S_WAIT_LOADCNT,
1764 // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
1765 // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
1766 // instructions so that createNewWaitcnt() will create new combined
1767 // instructions to replace them.
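// For example, if both LOADcnt and DScnt still need to be waited for at this
// point, any remaining individual S_WAIT_LOADCNT and S_WAIT_DSCNT are erased
// here so that createNewWaitcnt() emits one S_WAIT_LOADCNT_DSCNT covering
// both counts.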
1768
1769 if (Wait.DsCnt != ~0u) {
1770 // This is a vector of addresses in WaitInstrs pointing to instructions
1771 // that should be removed if they are present.
1772 SmallVector<MachineInstr **, 2> WaitsToErase;
1773
1774 // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
1775 // both) need to be waited for, ensure that there are no existing
1776 // individual wait count instructions for these.
1777
1778 if (Wait.LoadCnt != ~0u) {
1779 WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
1780 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1781 } else if (Wait.StoreCnt != ~0u) {
1782 WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
1783 WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
1784 }
1785
1786 for (MachineInstr **WI : WaitsToErase) {
1787 if (!*WI)
1788 continue;
1789
1790 (*WI)->eraseFromParent();
1791 *WI = nullptr;
1792 Modified = true;
1793 }
1794 }
1795
1796 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1797 if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) ||
1798 (CT == LOAD_CNT &&
1799 ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) {
1800 // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT
1801 // due to taking the backedge of a block.
1802 ScoreBrackets.simplifyXcnt(PreCombine, Wait);
1803 }
1804 if (!WaitInstrs[CT])
1805 continue;
1806
1807 unsigned NewCnt = getWait(Wait, CT);
1808 if (NewCnt != ~0u) {
1809 Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
1810 AMDGPU::OpName::simm16, NewCnt);
1811 Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
1812
1813 ScoreBrackets.applyWaitcnt(CT, NewCnt);
1814 setNoWait(Wait, CT);
1815
1816 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
1817 ? dbgs() << "applied pre-existing waitcnt\n"
1818 << "New Instr at block end: " << *WaitInstrs[CT]
1819 << '\n'
1820 : dbgs() << "applied pre-existing waitcnt\n"
1821 << "Old Instr: " << *It
1822 << "New Instr: " << *WaitInstrs[CT] << '\n');
1823 } else {
1824 WaitInstrs[CT]->eraseFromParent();
1825 Modified = true;
1826 }
1827 }
1828
1829 return Modified;
1830}
1831
1832/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
1833bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
1834 MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
1835 AMDGPU::Waitcnt Wait) {
1836 assert(ST);
1837 assert(!isNormalMode(MaxCounter));
1838
1839 bool Modified = false;
1840 const DebugLoc &DL = Block.findDebugLoc(It);
1841
1842 // Check for opportunities to use combined wait instructions.
1843 if (Wait.DsCnt != ~0u) {
1844 MachineInstr *SWaitInst = nullptr;
1845
1846 if (Wait.LoadCnt != ~0u) {
1847 unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
1848
1849 SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
1850 .addImm(Enc);
1851
1852 Wait.LoadCnt = ~0u;
1853 Wait.DsCnt = ~0u;
1854 } else if (Wait.StoreCnt != ~0u) {
1855 unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
1856
1857 SWaitInst =
1858 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
1859 .addImm(Enc);
1860
1861 Wait.StoreCnt = ~0u;
1862 Wait.DsCnt = ~0u;
1863 }
1864
1865 if (SWaitInst) {
1866 Modified = true;
1867
1868 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1869 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1870 dbgs() << "New Instr: " << *SWaitInst << '\n');
1871 }
1872 }
1873
1874 // Generate an instruction for any remaining counter that needs
1875 // waiting for.
1876
1877 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
1878 unsigned Count = getWait(Wait, CT);
1879 if (Count == ~0u)
1880 continue;
1881
1882 [[maybe_unused]] auto SWaitInst =
1883 BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
1884 .addImm(Count);
1885
1886 Modified = true;
1887
1888 LLVM_DEBUG(dbgs() << "GFX12Plus::createNewWaitcnt\n";
1889 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1890 dbgs() << "New Instr: " << *SWaitInst << '\n');
1891 }
1892
1893 return Modified;
1894}
1895
1896/// \returns true if the callee inserts an s_waitcnt 0 on function entry.
1897 static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
1898 // Currently all conventions wait, but this may not always be the case.
1899 //
1900 // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
1901 // sense to omit the wait and do it in the caller.
1902 return true;
1903}
1904
1905/// \returns true if the callee is expected to wait for any outstanding waits
1906/// before returning.
1907static bool callWaitsOnFunctionReturn(const MachineInstr &MI) { return true; }
1908
1909/// Generate s_waitcnt instruction to be placed before cur_Inst.
1910/// Instructions of a given type are returned in order,
1911/// but instructions of different types can complete out of order.
1912/// We rely on this in-order completion
1913/// and simply assign a score to the memory access instructions.
1914/// We keep track of the active "score bracket" to determine
1915/// if an access of a memory read requires an s_waitcnt
1916/// and if so what the value of each counter is.
1917/// The "score bracket" is bound by the lower bound and upper bound
1918/// scores (*_score_LB and *_score_ub respectively).
1919/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
1920/// flush the vmcnt counter here.
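/// For illustration: if the access a use depends on was assigned score 5 and
/// the LOAD_CNT upper bound is currently 7, then at most the two newer
/// accesses can still be outstanding once it completes, so waiting for
/// loadcnt(2) (the difference) is sufficient rather than loadcnt(0).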
1921bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1922 WaitcntBrackets &ScoreBrackets,
1923 MachineInstr *OldWaitcntInstr,
1924 bool FlushVmCnt) {
1925 setForceEmitWaitcnt();
1926
1927 assert(!MI.isMetaInstruction());
1928
1929 AMDGPU::Waitcnt Wait;
1930 const unsigned Opc = MI.getOpcode();
1931
1932 // FIXME: This should have already been handled by the memory legalizer.
1933 // Removing this currently doesn't affect any lit tests, but we need to
1934 // verify that nothing was relying on this. The number of buffer invalidates
1935 // being handled here should not be expanded.
1936 if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
1937 Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
1938 Opc == AMDGPU::BUFFER_GL1_INV) {
1939 Wait.LoadCnt = 0;
1940 }
1941
1942 // All waits must be resolved at call return.
1943 // NOTE: this could be improved with knowledge of all call sites or
1944 // with knowledge of the called routines.
1945 if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
1946 Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1947 Opc == AMDGPU::S_SETPC_B64_return ||
1948 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1949 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
1950 }
1951 // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1952 // Technically the hardware will do this on its own if we don't, but that
1953 // might cost extra cycles compared to doing it explicitly.
1954 // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1955 // have to wait for outstanding VMEM stores. In this case it can be useful to
1956 // send a message to explicitly release all VGPRs before the stores have
1957 // completed, but it is only safe to do this if there are no outstanding
1958 // scratch stores.
1959 else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
1960 if (!WCG->isOptNone() &&
1961 (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
1962 (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1963 ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1964 !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
1965 ReleaseVGPRInsts.insert(&MI);
1966 }
1967 // Resolve vm waits before gs-done.
1968 else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
1969 ST->hasLegacyGeometry() &&
1970 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1971 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1972 Wait.LoadCnt = 0;
1973 }
1974
1975 // Export & GDS instructions do not read the EXEC mask until after the export
1976 // is granted (which can occur well after the instruction is issued).
1977 // The shader program must flush all EXP operations on the export-count
1978 // before overwriting the EXEC mask.
1979 else {
1980 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1981 // Export and GDS are tracked individually, either may trigger a waitcnt
1982 // for EXEC.
1983 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1984 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1985 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1986 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1987 Wait.ExpCnt = 0;
1988 }
1989 }
1990
1991 // Wait for any pending GDS instruction to complete before any
1992 // "Always GDS" instruction.
1993 if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
1994 addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
1995
1996 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1997 // The function is going to insert a wait on everything in its prolog.
1998 // This still needs to be careful if the call target is a load (e.g. a GOT
1999 // load). We also need to check WAW dependency with saved PC.
2000 Wait = AMDGPU::Waitcnt();
2001
2002 const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
2003 if (CallAddrOp.isReg()) {
2004 ScoreBrackets.determineWaitForPhysReg(
2005 SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
2006
2007 if (const auto *RtnAddrOp =
2008 TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
2009 ScoreBrackets.determineWaitForPhysReg(
2010 SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
2011 }
2012 }
2013 } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
2014 ScoreBrackets.tryClearSCCWriteEvent(&MI);
2015 } else {
2016 // FIXME: Should not be relying on memoperands.
2017 // Look at the source operands of every instruction to see if
2018 // any of them results from a previous memory operation that affects
2019 // its current usage. If so, an s_waitcnt instruction needs to be
2020 // emitted.
2021 // If the source operand was defined by a load, add the s_waitcnt
2022 // instruction.
2023 //
2024 // Two cases are handled for destination operands:
2025 // 1) If the destination operand was defined by a load, add the s_waitcnt
2026 // instruction to guarantee the right WAW order.
2027 // 2) If a destination operand was used by a recent export/store instruction,
2028 // add s_waitcnt on exp_cnt to guarantee the WAR order.
2029
2030 for (const MachineMemOperand *Memop : MI.memoperands()) {
2031 const Value *Ptr = Memop->getValue();
2032 if (Memop->isStore()) {
2033 if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
2034 addWait(Wait, SmemAccessCounter, 0);
2035 if (PDT->dominates(MI.getParent(), It->second))
2036 SLoadAddresses.erase(It);
2037 }
2038 }
2039 unsigned AS = Memop->getAddrSpace();
2040 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
2041 continue;
2042 // No need to wait before load from VMEM to LDS.
2043 if (TII->mayWriteLDSThroughDMA(MI))
2044 continue;
2045
2046 // LOAD_CNT is only relevant to vgpr or LDS.
2047 unsigned TID = LDSDMA_BEGIN;
2048 if (Ptr && Memop->getAAInfo()) {
2049 const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
2050 for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
2051 if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
2052 if ((I + 1) >= NUM_LDSDMA) {
2053 // We didn't have enough slots to track this LDS DMA store, it
2054 // has been tracked using the common RegNo (FIRST_LDS_VGPR).
2055 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2056 break;
2057 }
2058
2059 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID + I + 1, Wait);
2060 }
2061 }
2062 } else {
2063 ScoreBrackets.determineWaitForLDSDMA(LOAD_CNT, TID, Wait);
2064 }
2065 if (Memop->isStore()) {
2066 ScoreBrackets.determineWaitForLDSDMA(EXP_CNT, TID, Wait);
2067 }
2068 }
2069
2070 // Loop over use and def operands.
2071 for (const MachineOperand &Op : MI.operands()) {
2072 if (!Op.isReg())
2073 continue;
2074
2075 // If the instruction does not read tied source, skip the operand.
2076 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
2077 continue;
2078
2079 MCPhysReg Reg = Op.getReg().asMCReg();
2080
2081 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
2082 if (IsVGPR) {
2083 // Implicit VGPR defs and uses are never part of the memory
2084 // instruction's description and are usually present to account for
2085 // super-register liveness.
2086 // TODO: Most of the other instructions also have implicit uses
2087 // for the liveness accounting only.
2088 if (Op.isImplicit() && MI.mayLoadOrStore())
2089 continue;
2090
2091 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
2092 // previous write and this write are the same type of VMEM
2093 // instruction, in which case they are (in some architectures)
2094 // guaranteed to write their results in order anyway.
2095 // Additionally check instructions where Point Sample Acceleration
2096 // might be applied.
2097 if (Op.isUse() || !updateVMCntOnly(MI) ||
2098 ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
2099 ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
2100 !ST->hasVmemWriteVgprInOrder()) {
2101 ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
2102 ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
2103 ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
2104 ScoreBrackets.clearVgprVmemTypes(Reg);
2105 }
2106
2107 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
2108 ScoreBrackets.determineWaitForPhysReg(EXP_CNT, Reg, Wait);
2109 }
2110 ScoreBrackets.determineWaitForPhysReg(DS_CNT, Reg, Wait);
2111 } else if (Op.getReg() == AMDGPU::SCC) {
2112 ScoreBrackets.determineWaitForPhysReg(KM_CNT, Reg, Wait);
2113 } else {
2114 ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
2115 }
2116
2117 if (ST->hasWaitXCnt() && Op.isDef())
2118 ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
2119 }
2120 }
2121 }
2122
2123 // Ensure safety against exceptions from outstanding memory operations while
2124 // waiting for a barrier:
2125 //
2126 // * Some subtargets safely handle backing off the barrier in hardware
2127 // when an exception occurs.
2128 // * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2129 // there can be no outstanding memory operations during the wait.
2130 // * Subtargets with split barriers don't need to back off the barrier; it
2131 // is up to the trap handler to preserve the user barrier state correctly.
2132 //
2133 // In all other cases, ensure safety by ensuring that there are no outstanding
2134 // memory operations.
2135 if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
2136 !ST->supportsBackOffBarrier()) {
2137 Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
2138 }
2139
2140 // TODO: Remove this work-around, enable the assert for Bug 457939
2141 // after fixing the scheduler. Also, the Shader Compiler code is
2142 // independent of target.
2143 if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
2144 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2145 Wait.DsCnt = 0;
2146 }
2147
2148 // Verify that the wait is actually needed.
2149 ScoreBrackets.simplifyWaitcnt(Wait);
2150
2151 // Since the translation for VMEM addresses occurs in order, we can apply the
2152 // XCnt if the current instruction is of VMEM type and has a memory
2153 // dependency with another VMEM instruction in flight.
2154 if (Wait.XCnt != ~0u && isVmemAccess(MI)) {
2155 ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt);
2156 Wait.XCnt = ~0u;
2157 }
2158
2159 // When forcing emit, skip terminators: emitting a waitcnt between the
2160 // terminators of an MBB would break its terminator sequence.
2161 if (ForceEmitZeroFlag && !MI.isTerminator())
2162 Wait = WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false);
2163
2164 if (ForceEmitWaitcnt[LOAD_CNT])
2165 Wait.LoadCnt = 0;
2166 if (ForceEmitWaitcnt[EXP_CNT])
2167 Wait.ExpCnt = 0;
2168 if (ForceEmitWaitcnt[DS_CNT])
2169 Wait.DsCnt = 0;
2170 if (ForceEmitWaitcnt[SAMPLE_CNT])
2171 Wait.SampleCnt = 0;
2172 if (ForceEmitWaitcnt[BVH_CNT])
2173 Wait.BvhCnt = 0;
2174 if (ForceEmitWaitcnt[KM_CNT])
2175 Wait.KmCnt = 0;
2176 if (ForceEmitWaitcnt[X_CNT])
2177 Wait.XCnt = 0;
2178
2179 if (FlushVmCnt) {
2180 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2181 Wait.LoadCnt = 0;
2182 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2183 Wait.SampleCnt = 0;
2184 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2185 Wait.BvhCnt = 0;
2186 }
2187
2188 if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
2189 Wait.LoadCnt = 0;
2190
2191 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
2192 OldWaitcntInstr);
2193}
2194
2195 bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
2196 MachineBasicBlock::instr_iterator It,
2197 MachineBasicBlock &Block,
2198 WaitcntBrackets &ScoreBrackets,
2199 MachineInstr *OldWaitcntInstr) {
2200 bool Modified = false;
2201
2202 if (OldWaitcntInstr)
2203 // Try to merge the required wait with preexisting waitcnt instructions.
2204 // Also erase redundant waitcnt.
2205 Modified =
2206 WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
2207
2208 // Any counts that could have been applied to any existing waitcnt
2209 // instructions will have been done so, now deal with any remaining.
2210 ScoreBrackets.applyWaitcnt(Wait);
2211
2212 // ExpCnt can be merged into VINTERP.
2213 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
2214 SIInstrInfo::isVINTERP(*It)) {
2215 MachineOperand *WaitExp =
2216 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
2217 if (Wait.ExpCnt < WaitExp->getImm()) {
2218 WaitExp->setImm(Wait.ExpCnt);
2219 Modified = true;
2220 }
2221 Wait.ExpCnt = ~0u;
2222
2223 LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
2224 << "Update Instr: " << *It);
2225 }
2226
2227 if (WCG->createNewWaitcnt(Block, It, Wait))
2228 Modified = true;
2229
2230 return Modified;
2231}
2232
2233bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
2234 return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
2235 (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
2236}
2237
2238// Return true if the next instruction is S_ENDPGM, following fallthrough
2239// blocks if necessary.
2240bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
2241 MachineBasicBlock *Block) const {
2242 auto BlockEnd = Block->getParent()->end();
2243 auto BlockIter = Block->getIterator();
2244
2245 while (true) {
2246 if (It.isEnd()) {
2247 if (++BlockIter != BlockEnd) {
2248 It = BlockIter->instr_begin();
2249 continue;
2250 }
2251
2252 return false;
2253 }
2254
2255 if (!It->isMetaInstruction())
2256 break;
2257
2258 It++;
2259 }
2260
2261 assert(!It.isEnd());
2262
2263 return It->getOpcode() == AMDGPU::S_ENDPGM;
2264}
2265
2266// Add a wait after an instruction if architecture requirements mandate one.
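// Two situations are handled below: precise-memory mode inserts a wait on the
// outstanding counters after any memory access, and "always GDS" instructions
// get a dscnt(0) wait plus an S_NOP if the next instruction is S_ENDPGM.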
2267bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
2268 MachineBasicBlock &Block,
2269 WaitcntBrackets &ScoreBrackets) {
2270 AMDGPU::Waitcnt Wait;
2271 bool NeedsEndPGMCheck = false;
2272
2273 if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
2274 Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
2275 !SIInstrInfo::isAtomicRet(Inst));
2276
2277 if (TII->isAlwaysGDS(Inst.getOpcode())) {
2278 Wait.DsCnt = 0;
2279 NeedsEndPGMCheck = true;
2280 }
2281
2282 ScoreBrackets.simplifyWaitcnt(Wait);
2283
2284 auto SuccessorIt = std::next(Inst.getIterator());
2285 bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
2286 /*OldWaitcntInstr=*/nullptr);
2287
2288 if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
2289 BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
2290 .addImm(0);
2291 }
2292
2293 return Result;
2294}
2295
2296void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
2297 WaitcntBrackets *ScoreBrackets) {
2298 // Now look at the instruction opcode. If it is a memory access
2299 // instruction, update the upper-bound of the appropriate counter's
2300 // bracket and the destination operand scores.
2301 // For architectures with X_CNT, mark the source address operands
2302 // with the appropriate counter values.
2303 // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
2304
2305 bool IsVMEMAccess = false;
2306 bool IsSMEMAccess = false;
2307 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
2308 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
2309 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
2310 ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
2311 ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
2312 ScoreBrackets->setPendingGDS();
2313 } else {
2314 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2315 }
2316 } else if (TII->isFLAT(Inst)) {
2317 if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
2318 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2319 return;
2320 }
2321
2322 assert(Inst.mayLoadOrStore());
2323
2324 int FlatASCount = 0;
2325
2326 if (TII->mayAccessVMEMThroughFlat(Inst)) {
2327 ++FlatASCount;
2328 IsVMEMAccess = true;
2329 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2330 }
2331
2332 if (TII->mayAccessLDSThroughFlat(Inst)) {
2333 ++FlatASCount;
2334 ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
2335 }
2336
2337 // Async/LDSDMA operations have FLAT encoding but do not actually use flat
2338 // pointers. They do have two operands that each access global and LDS, thus
2339 // making it appear at this point that they are using a flat pointer. Filter
2340 // them out, and for the rest, generate a dependency on flat pointers so
2341 // that both VM and LGKM counters are flushed.
2342 if (!SIInstrInfo::isLDSDMA(Inst) && FlatASCount > 1)
2343 ScoreBrackets->setPendingFlat();
2344 } else if (SIInstrInfo::isVMEM(Inst) &&
2345 !AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
2346 IsVMEMAccess = true;
2347 ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
2348
2349 if (ST->vmemWriteNeedsExpWaitcnt() &&
2350 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
2351 ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
2352 }
2353 } else if (TII->isSMRD(Inst)) {
2354 IsSMEMAccess = true;
2355 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2356 } else if (Inst.isCall()) {
2357 if (callWaitsOnFunctionReturn(Inst)) {
2358 // Act as a wait on everything
2359 ScoreBrackets->applyWaitcnt(
2360 WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
2361 ScoreBrackets->setStateOnFunctionEntryOrReturn();
2362 } else {
2363 // May need to wait for anything.
2364 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
2365 }
2366 } else if (SIInstrInfo::isLDSDIR(Inst)) {
2367 ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
2368 } else if (TII->isVINTERP(Inst)) {
2369 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
2370 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
2371 } else if (SIInstrInfo::isEXP(Inst)) {
2372 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
2373 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
2374 ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
2375 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
2376 ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
2377 else
2378 ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
2379 } else if (SIInstrInfo::isSBarrierSCCWrite(Inst.getOpcode())) {
2380 ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
2381 } else {
2382 switch (Inst.getOpcode()) {
2383 case AMDGPU::S_SENDMSG:
2384 case AMDGPU::S_SENDMSG_RTN_B32:
2385 case AMDGPU::S_SENDMSG_RTN_B64:
2386 case AMDGPU::S_SENDMSGHALT:
2387 ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
2388 break;
2389 case AMDGPU::S_MEMTIME:
2390 case AMDGPU::S_MEMREALTIME:
2391 case AMDGPU::S_GET_BARRIER_STATE_M0:
2392 case AMDGPU::S_GET_BARRIER_STATE_IMM:
2393 ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
2394 break;
2395 }
2396 }
2397
2398 if (!ST->hasWaitXCnt())
2399 return;
2400
2401 if (IsVMEMAccess)
2402 ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);
2403
2404 if (IsSMEMAccess)
2405 ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
2406}
2407
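// mergeScore rebases the two scores onto a shared timeline: a score at or
// below its bracket's old lower bound has no pending dependency and maps to
// 0, while a still-pending score is shifted by the amount its upper bound
// moved in the merge. The merged score is the maximum of the two, and the
// return value reports whether Other contributed the strictly newer score.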
2408bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
2409 unsigned OtherScore) {
2410 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
2411 unsigned OtherShifted =
2412 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
2413 Score = std::max(MyShifted, OtherShifted);
2414 return OtherShifted > MyShifted;
2415}
2416
2417 /// Merge the pending events and associated score brackets of \p Other into
2418/// this brackets status.
2419///
2420/// Returns whether the merge resulted in a change that requires tighter waits
2421/// (i.e. the merged brackets strictly dominate the original brackets).
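/// In run(), a true result marks the successor block dirty so it is
/// re-processed with the tighter incoming state, possibly repeating the
/// traversal when that successor lies behind a backedge.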
2422bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
2423 bool StrictDom = false;
2424
2425 // Check if "other" has keys we don't have, and create default entries for
2426 // those. If they remain empty after merging, we will clean it up after.
2427 for (auto K : Other.VMem.keys())
2428 VMem.try_emplace(K);
2429 for (auto K : Other.SGPRs.keys())
2430 SGPRs.try_emplace(K);
2431
2432 for (auto T : inst_counter_types(Context->MaxCounter)) {
2433 // Merge event flags for this counter
2434 const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
2435 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
2436 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
2437 if (OtherEvents & ~OldEvents)
2438 StrictDom = true;
2439 PendingEvents |= OtherEvents;
2440
2441 // Merge scores for this counter
2442 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
2443 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
2444 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
2445 if (NewUB < ScoreLBs[T])
2446 report_fatal_error("waitcnt score overflow");
2447
2448 MergeInfo M;
2449 M.OldLB = ScoreLBs[T];
2450 M.OtherLB = Other.ScoreLBs[T];
2451 M.MyShift = NewUB - ScoreUBs[T];
2452 M.OtherShift = NewUB - Other.ScoreUBs[T];
2453
2454 ScoreUBs[T] = NewUB;
2455
2456 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
2457
2458 if (T == DS_CNT)
2459 StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);
2460
2461 if (T == KM_CNT) {
2462 StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
2463 if (Other.hasPendingEvent(SCC_WRITE)) {
2464 unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
2465 if (!OldEventsHasSCCWrite) {
2466 PendingSCCWrite = Other.PendingSCCWrite;
2467 } else if (PendingSCCWrite != Other.PendingSCCWrite) {
2468 PendingSCCWrite = nullptr;
2469 }
2470 }
2471 }
2472
2473 for (auto &[RegID, Info] : VMem)
2474 StrictDom |= mergeScore(M, Info.Scores[T], Other.getVMemScore(RegID, T));
2475
2476 if (isSmemCounter(T)) {
2477 unsigned Idx = getSgprScoresIdx(T);
2478 for (auto &[RegID, Info] : SGPRs) {
2479 auto It = Other.SGPRs.find(RegID);
2480 unsigned OtherScore =
2481 (It != Other.SGPRs.end()) ? It->second.Scores[Idx] : 0;
2482 StrictDom |= mergeScore(M, Info.Scores[Idx], OtherScore);
2483 }
2484 }
2485 }
2486
2487 for (auto &[TID, Info] : VMem) {
2488 if (auto It = Other.VMem.find(TID); It != Other.VMem.end()) {
2489 unsigned char NewVmemTypes = Info.VMEMTypes | It->second.VMEMTypes;
2490 StrictDom |= NewVmemTypes != Info.VMEMTypes;
2491 Info.VMEMTypes = NewVmemTypes;
2492 }
2493 }
2494
2495 purgeEmptyTrackingData();
2496 return StrictDom;
2497}
2498
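// Recognize every kind of wait instruction this pass may merge, strengthen or
// delete, including the "soft" variants typically inserted by the memory
// legalizer (mapped back via getNonSoftWaitcntOpcode).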
2499static bool isWaitInstr(MachineInstr &Inst) {
2500 unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
2501 return Opcode == AMDGPU::S_WAITCNT ||
2502 (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
2503 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
2504 Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
2505 Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
2506 Opcode == AMDGPU::S_WAITCNT_lds_direct ||
2507 counterTypeForInstr(Opcode).has_value();
2508}
2509
2510// Generate s_waitcnt instructions where needed.
2511bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
2512 MachineBasicBlock &Block,
2513 WaitcntBrackets &ScoreBrackets) {
2514 bool Modified = false;
2515
2516 LLVM_DEBUG({
2517 dbgs() << "*** Begin Block: ";
2518 Block.printName(dbgs());
2519 ScoreBrackets.dump();
2520 });
2521
2522 // Track the correctness of vccz through this basic block. There are two
2523 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
2524 // ST->partialVCCWritesUpdateVCCZ().
2525 bool VCCZCorrect = true;
2526 if (ST->hasReadVCCZBug()) {
2527 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2528 // to vcc and then issued an smem load.
2529 VCCZCorrect = false;
2530 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
2531 // vccz could be incorrect at a basic block boundary if a predecessor wrote
2532 // to vcc_lo or vcc_hi.
2533 VCCZCorrect = false;
2534 }
2535
2536 // Walk over the instructions.
2537 MachineInstr *OldWaitcntInstr = nullptr;
2538
2539 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
2540 E = Block.instr_end();
2541 Iter != E;) {
2542 MachineInstr &Inst = *Iter;
2543 if (Inst.isMetaInstruction()) {
2544 ++Iter;
2545 continue;
2546 }
2547
2548 // Track pre-existing waitcnts that were added in earlier iterations or by
2549 // the memory legalizer.
2550 if (isWaitInstr(Inst)) {
2551 if (!OldWaitcntInstr)
2552 OldWaitcntInstr = &Inst;
2553 ++Iter;
2554 continue;
2555 }
2556
2557 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
2558 isPreheaderToFlush(Block, ScoreBrackets);
2559
2560 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
2561 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
2562 FlushVmCnt);
2563 OldWaitcntInstr = nullptr;
2564
2565 // Restore vccz if it's not known to be correct already.
2566 bool RestoreVCCZ = !VCCZCorrect && SIInstrInfo::isCBranchVCCZRead(Inst);
2567
2568 // Don't examine operands unless we need to track vccz correctness.
2569 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
2570 if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
2571 Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
2572 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
2573 if (!ST->partialVCCWritesUpdateVCCZ())
2574 VCCZCorrect = false;
2575 } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
2576 // There is a hardware bug on CI/SI where SMRD instruction may corrupt
2577 // vccz bit, so when we detect that an instruction may read from a
2578 // corrupt vccz bit, we need to:
2579 // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
2580 // operations to complete.
2581 // 2. Restore the correct value of vccz by writing the current value
2582 // of vcc back to vcc.
2583 if (ST->hasReadVCCZBug() &&
2584 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
2585 // Writes to vcc while there's an outstanding smem read may get
2586 // clobbered as soon as any read completes.
2587 VCCZCorrect = false;
2588 } else {
2589 // Writes to vcc will fix any incorrect value in vccz.
2590 VCCZCorrect = true;
2591 }
2592 }
2593 }
2594
2595 if (TII->isSMRD(Inst)) {
2596 for (const MachineMemOperand *Memop : Inst.memoperands()) {
2597 // No need to handle invariant loads when avoiding WAR conflicts, as
2598 // there cannot be a vector store to the same memory location.
2599 if (!Memop->isInvariant()) {
2600 const Value *Ptr = Memop->getValue();
2601 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
2602 }
2603 }
2604 if (ST->hasReadVCCZBug()) {
2605 // This smem read could complete and clobber vccz at any time.
2606 VCCZCorrect = false;
2607 }
2608 }
2609
2610 updateEventWaitcntAfter(Inst, &ScoreBrackets);
2611
2612 Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);
2613
2614 LLVM_DEBUG({
2615 Inst.print(dbgs());
2616 ScoreBrackets.dump();
2617 });
2618
2619 // TODO: Remove this work-around after fixing the scheduler and enable the
2620 // assert above.
2621 if (RestoreVCCZ) {
2622 // Restore the vccz bit. Any time a value is written to vcc, the vcc
2623 // bit is updated, so we can restore the bit by reading the value of
2624 // vcc and then writing it back to the register.
2625 BuildMI(Block, Inst, Inst.getDebugLoc(),
2626 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
2627 TRI->getVCC())
2628 .addReg(TRI->getVCC());
2629 VCCZCorrect = true;
2630 Modified = true;
2631 }
2632
2633 ++Iter;
2634 }
2635
2636 // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2637 // needed.
2638 AMDGPU::Waitcnt Wait;
2639 if (Block.getFirstTerminator() == Block.end() &&
2640 isPreheaderToFlush(Block, ScoreBrackets)) {
2641 if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2642 Wait.LoadCnt = 0;
2643 if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2644 Wait.SampleCnt = 0;
2645 if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2646 Wait.BvhCnt = 0;
2647 }
2648
2649 // Combine or remove any redundant waitcnts at the end of the block.
2650 Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2651 OldWaitcntInstr);
2652
2653 LLVM_DEBUG({
2654 dbgs() << "*** End Block: ";
2655 Block.printName(dbgs());
2656 ScoreBrackets.dump();
2657 });
2658
2659 return Modified;
2660}
2661
2662// Return true if the given machine basic block is a preheader of a loop in
2663// which we want to flush the vmcnt counter, and false otherwise.
2664bool SIInsertWaitcnts::isPreheaderToFlush(
2665 MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
2666 auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
2667 if (!IsInserted)
2668 return Iterator->second;
2669
2670 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
2671 if (!Succ)
2672 return false;
2673
2674 MachineLoop *Loop = MLI->getLoopFor(Succ);
2675 if (!Loop)
2676 return false;
2677
2678 if (Loop->getLoopPreheader() == &MBB &&
2679 shouldFlushVmCnt(Loop, ScoreBrackets)) {
2680 Iterator->second = true;
2681 return true;
2682 }
2683
2684 return false;
2685}
2686
2687 bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
2688 if (SIInstrInfo::isFLAT(MI))
2689 return TII->mayAccessVMEMThroughFlat(MI);
2690 return SIInstrInfo::isVMEM(MI);
2691}
2692
2693// Return true if it is better to flush the vmcnt counter in the preheader of
2694// the given loop. We currently decide to flush in two situations:
2695// 1. The loop contains vmem store(s), no vmem load and at least one use of a
2696// vgpr containing a value that is loaded outside of the loop. (Only on
2697// targets with no vscnt counter).
2698// 2. The loop contains vmem load(s), but the loaded values are not used in the
2699// loop, and at least one use of a vgpr containing a value that is loaded
2700// outside of the loop.
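// The intent is that a single flush in the preheader clears those pending
// loads from the score brackets, so the loop body does not have to re-wait on
// them on every iteration.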
2701bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
2702 const WaitcntBrackets &Brackets) {
2703 bool HasVMemLoad = false;
2704 bool HasVMemStore = false;
2705 bool UsesVgprLoadedOutside = false;
2706 DenseSet<MCRegUnit> VgprUse;
2707 DenseSet<MCRegUnit> VgprDef;
2708
2709 for (MachineBasicBlock *MBB : ML->blocks()) {
2710 for (MachineInstr &MI : *MBB) {
2711 if (isVMEMOrFlatVMEM(MI)) {
2712 HasVMemLoad |= MI.mayLoad();
2713 HasVMemStore |= MI.mayStore();
2714 }
2715
2716 for (const MachineOperand &Op : MI.all_uses()) {
2717 if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
2718 continue;
2719 // Vgpr use
2720 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2721 // If we find a register that is loaded inside the loop, 1. and 2.
2722 // are invalidated and we can exit.
2723 if (VgprDef.contains(RU))
2724 return false;
2725 VgprUse.insert(RU);
2726 // If at least one of Op's registers is in the score brackets, the
2727 // value is likely loaded outside of the loop.
2728 VMEMID ID = toVMEMID(RU);
2729 if (Brackets.getVMemScore(ID, LOAD_CNT) >
2730 Brackets.getScoreLB(LOAD_CNT) ||
2731 Brackets.getVMemScore(ID, SAMPLE_CNT) >
2732 Brackets.getScoreLB(SAMPLE_CNT) ||
2733 Brackets.getVMemScore(ID, BVH_CNT) >
2734 Brackets.getScoreLB(BVH_CNT)) {
2735 UsesVgprLoadedOutside = true;
2736 break;
2737 }
2738 }
2739 }
2740
2741 // VMem load vgpr def
2742 if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
2743 for (const MachineOperand &Op : MI.all_defs()) {
2744 for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
2745 // If we find a register that is loaded inside the loop, 1. and 2.
2746 // are invalidated and we can exit.
2747 if (VgprUse.contains(RU))
2748 return false;
2749 VgprDef.insert(RU);
2750 }
2751 }
2752 }
2753 }
2754 }
2755 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
2756 return true;
2757 return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
2758}
2759
2760bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
2761 auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
2762 auto *PDT =
2763 &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
2764 AliasAnalysis *AA = nullptr;
2765 if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
2766 AA = &AAR->getAAResults();
2767
2768 return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
2769}
2770
2771 PreservedAnalyses
2772 SIInsertWaitcntsPass::run(MachineFunction &MF,
2773 MachineFunctionAnalysisManager &MFAM) {
2774 auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF);
2775 auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
2776 auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2777 .getManager()
2778 .getCachedResult<AAManager>(MF.getFunction());
2779
2780 if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
2781 return PreservedAnalyses::all();
2782
2783 return getMachineFunctionPassPreservedAnalyses()
2784 .preserveSet<CFGAnalyses>()
2785 .preserve<AAManager>();
2786}
2787
2788bool SIInsertWaitcnts::run(MachineFunction &MF) {
2789 ST = &MF.getSubtarget<GCNSubtarget>();
2790 TII = ST->getInstrInfo();
2791 TRI = &TII->getRegisterInfo();
2792 MRI = &MF.getRegInfo();
2793 IV = AMDGPU::getIsaVersion(ST->getCPU());
2794
2795 MFI = MF.getInfo<SIMachineFunctionInfo>();
2796
2797 if (ST->hasExtendedWaitCounts()) {
2798 MaxCounter = NUM_EXTENDED_INST_CNTS;
2799 WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
2800 WCG = &WCGGFX12Plus;
2801 } else {
2802 MaxCounter = NUM_NORMAL_INST_CNTS;
2803 WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF, MaxCounter);
2804 WCG = &WCGPreGFX12;
2805 }
2806
2807 for (auto T : inst_counter_types())
2808 ForceEmitWaitcnt[T] = false;
2809
2810 WaitEventMaskForInst = WCG->getWaitEventMask();
2811
2812 SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
2813
2814 if (ST->hasExtendedWaitCounts()) {
2815 Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
2816 Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
2817 } else {
2818 Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
2819 Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
2820 }
2821 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
2822 Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
2823 Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
2824 Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
2825 Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
2826 Limits.XcntMax = AMDGPU::getXcntBitMask(IV);
2827
2828 BlockInfos.clear();
2829 bool Modified = false;
2830
2831 MachineBasicBlock &EntryBB = MF.front();
2832 MachineBasicBlock::iterator I = EntryBB.begin();
2833
2834 if (!MFI->isEntryFunction()) {
2835 // Wait for any outstanding memory operations that the input registers may
2836 // depend on. We can't track them and it's better to do the wait after the
2837 // costly call sequence.
2838
2839 // TODO: Could insert earlier and schedule more liberally with operations
2840 // that only use caller preserved registers.
2841 for (MachineBasicBlock::iterator E = EntryBB.end();
2842 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
2843 ;
2844
2845 if (ST->hasExtendedWaitCounts()) {
2846 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
2847 .addImm(0);
2848 for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
2849 if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
2850 continue;
2851
2852 if (!ST->hasImageInsts() &&
2853 (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
2854 continue;
2855
2856 BuildMI(EntryBB, I, DebugLoc(),
2857 TII->get(instrsForExtendedCounterTypes[CT]))
2858 .addImm(0);
2859 }
2860 } else {
2861 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
2862 }
2863
2864 auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
2865 NonKernelInitialState->setStateOnFunctionEntryOrReturn();
2866 BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
2867
2868 Modified = true;
2869 }
2870
2871 // Keep iterating over the blocks in reverse post order, inserting and
2872 // updating s_waitcnt where needed, until a fix point is reached.
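// Each BlockInfo carries an optional incoming bracket state and a Dirty flag.
// When merging a predecessor's outgoing state into a successor tightens that
// state (or the successor had none yet), the successor is marked dirty again,
// and if it appears earlier in the traversal order the whole sweep repeats.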
2873 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
2874 BlockInfos.try_emplace(MBB);
2875
2876 std::unique_ptr<WaitcntBrackets> Brackets;
2877 bool Repeat;
2878 do {
2879 Repeat = false;
2880
2881 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
2882 ++BII) {
2883 MachineBasicBlock *MBB = BII->first;
2884 BlockInfo &BI = BII->second;
2885 if (!BI.Dirty)
2886 continue;
2887
2888 if (BI.Incoming) {
2889 if (!Brackets)
2890 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
2891 else
2892 *Brackets = *BI.Incoming;
2893 } else {
2894 if (!Brackets) {
2895 Brackets = std::make_unique<WaitcntBrackets>(this);
2896 } else {
2897 // Reinitialize in-place. N.B. do not do this by assigning from a
2898 // temporary because the WaitcntBrackets class is large and it could
2899 // cause this function to use an unreasonable amount of stack space.
2900 Brackets->~WaitcntBrackets();
2901 new (Brackets.get()) WaitcntBrackets(this);
2902 }
2903 }
2904
2905 Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
2906 BI.Dirty = false;
2907
2908 if (Brackets->hasPendingEvent()) {
2909 BlockInfo *MoveBracketsToSucc = nullptr;
2910 for (MachineBasicBlock *Succ : MBB->successors()) {
2911 auto *SuccBII = BlockInfos.find(Succ);
2912 BlockInfo &SuccBI = SuccBII->second;
2913 if (!SuccBI.Incoming) {
2914 SuccBI.Dirty = true;
2915 if (SuccBII <= BII) {
2916 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2917 Repeat = true;
2918 }
2919 if (!MoveBracketsToSucc) {
2920 MoveBracketsToSucc = &SuccBI;
2921 } else {
2922 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
2923 }
2924 } else if (SuccBI.Incoming->merge(*Brackets)) {
2925 SuccBI.Dirty = true;
2926 if (SuccBII <= BII) {
2927 LLVM_DEBUG(dbgs() << "repeat on backedge\n");
2928 Repeat = true;
2929 }
2930 }
2931 }
2932 if (MoveBracketsToSucc)
2933 MoveBracketsToSucc->Incoming = std::move(Brackets);
2934 }
2935 }
2936 } while (Repeat);
2937
2938 if (ST->hasScalarStores()) {
2939 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
2940 bool HaveScalarStores = false;
2941
2942 for (MachineBasicBlock &MBB : MF) {
2943 for (MachineInstr &MI : MBB) {
2944 if (!HaveScalarStores && TII->isScalarStore(MI))
2945 HaveScalarStores = true;
2946
2947 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
2948 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
2949 EndPgmBlocks.push_back(&MBB);
2950 }
2951 }
2952
2953 if (HaveScalarStores) {
2954 // If scalar writes are used, the cache must be flushed or else the next
2955 // wave to reuse the same scratch memory can be clobbered.
2956 //
2957 // Insert s_dcache_wb at wave termination points if there were any scalar
2958 // stores, and only if the cache hasn't already been flushed. This could
2959 // be improved by looking across blocks for flushes in postdominating
2960 // blocks from the stores but an explicitly requested flush is probably
2961 // very rare.
2962 for (MachineBasicBlock *MBB : EndPgmBlocks) {
2963 bool SeenDCacheWB = false;
2964
2965 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
2966 I != E; ++I) {
2967 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
2968 SeenDCacheWB = true;
2969 else if (TII->isScalarStore(*I))
2970 SeenDCacheWB = false;
2971
2972 // FIXME: It would be better to insert this before a waitcnt if any.
2973 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
2974 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
2975 !SeenDCacheWB) {
2976 Modified = true;
2977 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
2978 }
2979 }
2980 }
2981 }
2982 }
2983
2984 // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2985 // This is done in different ways depending on how the VGPRs were allocated
2986 // (i.e. whether we're in dynamic VGPR mode or not).
2987 // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
2988 // waveslot limited kernel runs slower with the deallocation.
2989 if (MFI->isDynamicVGPREnabled()) {
2990 for (MachineInstr *MI : ReleaseVGPRInsts) {
2991 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2992 TII->get(AMDGPU::S_ALLOC_VGPR))
2993 .addImm(0);
2994 Modified = true;
2995 }
2996 } else {
2997 if (!ReleaseVGPRInsts.empty() &&
2998 (MF.getFrameInfo().hasCalls() ||
2999 ST->getOccupancyWithNumVGPRs(
3000 TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
3001 /*IsDynamicVGPR=*/false) <
3003 for (MachineInstr *MI : ReleaseVGPRInsts) {
3004 if (ST->requiresNopBeforeDeallocVGPRs()) {
3005 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3006 TII->get(AMDGPU::S_NOP))
3007 .addImm(0);
3008 }
3009 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3010 TII->get(AMDGPU::S_SENDMSG))
3011 .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
3012 Modified = true;
3013 }
3014 }
3015 }
3016 ReleaseVGPRInsts.clear();
3017 PreheadersToFlush.clear();
3018 SLoadAddresses.clear();
3019
3020 return Modified;
3021}
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
static bool isOptNone(const MachineFunction &MF)
IRTranslator LLVM IR MI
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
#define T
static bool isReg(const MCInst &MI, unsigned OpNo)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static cl::opt< bool > ForceEmitZeroLoadFlag("amdgpu-waitcnt-load-forcezero", cl::desc("Force all waitcnt load counters to wait until 0"), cl::init(false), cl::Hidden)
static bool callWaitsOnFunctionReturn(const MachineInstr &MI)
#define AMDGPU_EVENT_NAME(Name)
static bool callWaitsOnFunctionEntry(const MachineInstr &MI)
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, unsigned NewEnc)
static bool isWaitInstr(MachineInstr &Inst)
static std::optional< InstCounterType > counterTypeForInstr(unsigned Opcode)
Determine if MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and if so, which counter it is wa...
static cl::opt< bool > ForceEmitZeroFlag("amdgpu-waitcnt-forcezero", cl::desc("Force all waitcnt instrs to be emitted as " "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden)
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)
#define AMDGPU_EVENT_ENUM(Name)
Provides some synthesis utilities to produce sequences of values.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static const uint32_t IV[8]
Definition blake3_impl.h:83
A manager for alias analyses.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
static bool shouldExecute(CounterInfo &Counter)
static bool isCounterSet(CounterInfo &Info)
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool erase(const KeyT &Val)
Definition DenseMap.h:330
iterator end()
Definition DenseMap.h:81
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition DenseMap.h:241
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
LLVM_ABI const MachineBasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
mop_range defs()
Returns all explicit operands that are register definitions.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
const MachineBasicBlock * getParent() const
filtered_mop_range all_defs()
Returns an iterator range over all operands that are (explicit or implicit) register defs.
bool isCall(QueryType Type=AnyInBundle) const
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
LLVM_ABI void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
LLVM_ABI void print(raw_ostream &OS, bool IsStandalone=true, bool SkipOpers=false, bool SkipDebugLoc=false, bool AddNewLine=true, const TargetInstrInfo *TII=nullptr) const
Print this MI to OS.
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
filtered_mop_range all_uses()
Returns an iterator range over all operands that are (explicit or implicit) register uses.
const MachineOperand & getOperand(unsigned i) const
bool isMetaInstruction(QueryType Type=IgnoreBundle) const
Return true if this instruction doesn't produce any output in the form of executable instructions.
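Together these MachineInstr queries support the kind of coarse classification a waitcnt-style pass performs. A rough, illustrative filter (not the pass's actual logic):

// Sketch: is MI something a memory-counter bookkeeping pass should look at?
static bool touchesMemory(const MachineInstr &MI) {
  if (MI.isMetaInstruction())          // produces no executable output
    return false;
  if (MI.isCall())                     // conservatively treat calls as memory events
    return true;
  return MI.mayLoadOrStore() ||        // may read or modify memory
         !MI.memoperands().empty();    // or carries explicit memory operands
}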
Analysis pass that exposes the MachineLoopInfo for a machine function.
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
iterator end()
Definition MapVector.h:67
iterator find(const KeyT &Key)
Definition MapVector.h:154
iterator begin()
Definition MapVector.h:65
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
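A new-pass-manager machine pass built on these pieces usually reports its preserved analyses as shown below; this is a generic sketch with a hypothetical worker (insertWaits), not this pass's actual run method:

PreservedAnalyses MyPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (!insertWaits(MF, MFAM))          // hypothetical helper returning "changed"
    return PreservedAnalyses::all();   // nothing changed: all analyses survive
  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();       // instructions changed, but the CFG did not
  return PA;
}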
static bool isCBranchVCCZRead(const MachineInstr &MI)
static bool isVMEM(const MachineInstr &MI)
static bool isFLATScratch(const MachineInstr &MI)
static bool isEXP(const MachineInstr &MI)
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isLDSDIR(const MachineInstr &MI)
static bool isGWS(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isImage(const MachineInstr &MI)
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
static bool isVINTERP(const MachineInstr &MI)
static bool isGFX12CacheInvOrWBInst(unsigned Opc)
static bool isSBarrierSCCWrite(unsigned Opcode)
static bool isMIMG(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
static bool isAtomicNoRet(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
void push_back(const T &Elt)
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition StringRef.h:854
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
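The usual insert/contains idiom, as a tiny generic sketch:

#include "llvm/ADT/DenseSet.h"

// Record Id the first time it is seen; return true if it was new.
bool markSeen(llvm::DenseSet<unsigned> &Seen, unsigned Id) {
  if (Seen.contains(Id))         // membership test, set is not modified
    return false;
  return Seen.insert(Id).second; // insert returns {iterator, inserted}
}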
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values into Vmcnt, Expcnt and Lgkmcnt respectively.
unsigned getStorecntBitMask(const IsaVersion &Version)
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getSamplecntBitMask(const IsaVersion &Version)
unsigned getKmcntBitMask(const IsaVersion &Version)
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getXcntBitMask(const IsaVersion &Version)
Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getBvhcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt)
static unsigned encodeStorecntDscnt(const IsaVersion &Version, unsigned Storecnt, unsigned Dscnt)
bool getMUBUFIsBufferInv(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
unsigned getLoadcntBitMask(const IsaVersion &Version)
static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt, unsigned Dscnt)
unsigned getDscntBitMask(const IsaVersion &Version)
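These helpers form the encode/decode layer for pre-gfx12 waitcnt immediates. A hedged round-trip sketch, assuming the AMDGPUBaseInfo.h utilities are available and using an arbitrary GPU name; passing a counter's full bit mask means "do not wait on that counter":

// Sketch: encode "vmcnt(0), no exp/lgkm wait" and decode it back.
using namespace llvm;
unsigned roundTripWaitcnt() {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx900");
  unsigned Enc = AMDGPU::encodeWaitcnt(IV, /*Vmcnt=*/0,
                                       AMDGPU::getExpcntBitMask(IV),
                                       AMDGPU::getLgkmcntBitMask(IV));
  unsigned Vmcnt, Expcnt, Lgkmcnt;
  AMDGPU::decodeWaitcnt(IV, Enc, Vmcnt, Expcnt, Lgkmcnt); // recovers the three fields
  return Enc;
}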
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Undef
Value of the register doesn't matter.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
bool empty() const
Definition BasicBlock.h:101
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto enum_seq(EnumT Begin, EnumT End)
Iterate over an enum type from Begin up to - but not including - End.
Definition Sequence.h:337
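enum_seq is the utility behind the per-counter iteration in this file. A minimal sketch with a made-up enum; note that the enum must opt in via enum_iteration_traits:

#include "llvm/ADT/Sequence.h"

enum Channel { R, G, B, NumChannels };

namespace llvm {
template <> struct enum_iteration_traits<Channel> {
  static constexpr bool is_iterable = true; // opt Channel into enum_seq
};
} // namespace llvm

int sumChannels() {
  int N = 0;
  for (Channel C : llvm::enum_seq(R, NumChannels)) // yields R, G, B; excludes NumChannels
    N += static_cast<int>(C);
  return N;
}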
@ Wait
Definition Threading.h:60
static StringRef getCPU(StringRef CPU)
Processes a CPU name.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
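make_early_inc_range is the standard way to erase instructions while walking a block, since the iterator is advanced before the current element is touched. A sketch with a hypothetical predicate:

#include "llvm/ADT/STLExtras.h"

// Sketch: delete every instruction in MBB that ShouldErase accepts.
static void pruneBlock(MachineBasicBlock &MBB,
                       llvm::function_ref<bool(const MachineInstr &)> ShouldErase) {
  for (MachineInstr &MI : llvm::make_early_inc_range(MBB))
    if (ShouldErase(MI))
      MI.eraseFromParent(); // safe: no live iterator points at MI anymore
}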
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
char & SIInsertWaitcntsID
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
iterator_range(Container &&) -> iterator_range< llvm::detail::IterOfRange< Container > >
@ Other
Any other memory.
Definition ModRef.h:68
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
FunctionPass * createSIInsertWaitcntsPass()
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
Instruction set architecture version.
Represents the counter values to wait for in an s_waitcnt instruction.